make lacuna nodes lacunae from the beginning
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
5use Text::CSV_XS;
6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
e867486f 35=head2 B<parse>( $tradition, $option_hash )
3b853983 36
37Takes an initialized tradition and a set of options; creates the
38appropriate nodes and edges on the graph, as well as the appropriate
39witness objects. The $option_hash must contain either a 'file' or a
40'string' argument with the table to be parsed; it may also contain a
41'sep_char' argument to specify how the fields are separated.
42
43The table should have witnesses arranged in columns, with the witness sigla
44in the first row. Empty cells are interpreted as omissions (and thus
45stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46in cladistic analysis, may be represented by filling the appropriate cells
47with the tag '#LACUNA#'.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an 'ante-correction' version of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59my $csv = 't/data/florilegium.csv';
60my $t = Text::Tradition->new(
61 'name' => 'inline',
62 'input' => 'Tabular',
63 'file' => $csv,
64 'sep_char' => ',',
65 );
d9e873d0 66
3b853983 67is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 68
3b853983 69### TODO Check these figures
70if( $t ) {
1d310495 71 is( scalar $t->collation->readings, 312, "Collation has all readings" );
72 is( scalar $t->collation->paths, 363, "Collation has all paths" );
3b853983 73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
74}
75
76=end testing
d9e873d0 77
78=cut
79
80sub parse {
dfc37e38 81 my( $tradition, $opts ) = @_;
d9e873d0 82 my $c = $tradition->collation; # shorthand
dfc37e38 83 my $csv = Text::CSV_XS->new( {
84 binary => 1, # binary for UTF-8
85 sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" }
86 );
dfc37e38 87
d9e873d0 88 my $alignment_table;
dfc37e38 89 if( exists $opts->{'string' } ) {
90 my @lines = split( "\n", $opts->{'string'} );
91 foreach my $l ( @lines ) {
92 my $status = $csv->parse( $l );
93 if( $status ) {
94 push( @$alignment_table, [ $csv->fields ] );
95 } else {
96 warn "Could not parse line $l: " . $csv->error_input;
97 }
98 }
99 } elsif( exists $opts->{'file'} ) {
bb11025b 100 open( my $fh, $opts->{'file'} )
101 or warn "Could not open input file " . $opts->{'file'};
102 binmode( $fh, ':utf8' );
dfc37e38 103 while( my $row = $csv->getline( $fh ) ) {
104 push( @$alignment_table, $row );
d9e873d0 105 }
dfc37e38 106 close $fh;
107 } else {
108 warn "Could not find string or file option to parse";
109 return;
d9e873d0 110 }
dfc37e38 111
d9e873d0 112 # Set up the witnesses we find in the first line
113 my @witnesses;
3b853983 114 my %ac_wits; # Track these for later removal
d9e873d0 115 foreach my $sigil ( @{$alignment_table->[0]} ) {
116 my $wit = $tradition->add_witness( 'sigil' => $sigil );
117 $wit->path( [ $c->start ] );
118 push( @witnesses, $wit );
3b853983 119 my $aclabel = $c->ac_label;
120 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
121 $ac_wits{$1} = $wit;
122 }
d9e873d0 123 }
124
125 # Now for the next rows, make nodes as necessary, assign their ranks, and
126 # add them to the witness paths.
d9e873d0 127 foreach my $idx ( 1 .. $#{$alignment_table} ) {
128 my $row = $alignment_table->[$idx];
129 my $nodes = make_nodes( $c, $row, $idx );
130 foreach my $w ( 0 .. $#{$row} ) {
131 # push the appropriate node onto the appropriate witness path
132 my $word = $row->[$w];
133 if( $word ) {
134 my $reading = $nodes->{$word};
135 my $wit = $witnesses[$w];
136 push( @{$wit->path}, $reading );
137 } # else skip it for empty readings.
138 }
139 }
140
eca16057 141 # Collapse our lacunae into a single node and
142 # push the end node onto all paths.
d9e873d0 143 $c->end->rank( scalar @$alignment_table );
144 foreach my $wit ( @witnesses ) {
eca16057 145 my $p = $wit->path;
146 my $last_rdg = shift @$p;
147 my $new_p = [ $last_rdg ];
148 foreach my $rdg ( @$p ) {
9f213786 149 if( $rdg->is_lacuna && $last_rdg->is_lacuna ) {
eca16057 150 # If we are in a lacuna already, drop this node.
9f213786 151 $c->del_reading( $rdg );
eca16057 152 } else {
9f213786 153 # Save the reading, lacuna or no.
eca16057 154 push( @$new_p, $rdg );
9f213786 155 $last_rdg = $rdg;
eca16057 156 }
157 }
158 push( @$new_p, $c->end );
159 $wit->path( $new_p );
d9e873d0 160 }
161
3b853983 162 # Fold any a.c. witnesses into their main witness objects, and
163 # delete the independent a.c. versions.
164 foreach my $a ( keys %ac_wits ) {
165 my $main_wit = $tradition->witness( $a );
166 next unless $main_wit;
167 my $ac_wit = $ac_wits{$a};
168 $main_wit->uncorrected_path( $ac_wit->path );
169 $tradition->del_witness( $ac_wit );
170 }
171
d9e873d0 172 # Join up the paths.
173 $c->make_witness_paths;
d9e873d0 174}
175
176sub make_nodes {
177 my( $collation, $row, $index ) = @_;
178 my %unique;
179 foreach my $w ( @$row ) {
180 $unique{$w} = 1 if $w;
181 }
182 my $ctr = 1;
183 foreach my $w ( keys %unique ) {
a753cc84 184 my $rargs = {
a753cc84 185 'id' => "$index,$ctr",
186 'rank' => $index,
187 'text' => $w,
188 };
9f213786 189 $rargs->{'is_lacuna'} = 1 if $w eq '#LACUNA#';
a753cc84 190 my $r = $collation->add_reading( $rargs );
d9e873d0 191 $unique{$w} = $r;
a753cc84 192 $ctr++;
d9e873d0 193 }
194 return \%unique;
195}
196
3b853983 1971;
198
199=head1 LICENSE
200
201This package is free software and is provided "as is" without express
202or implied warranty. You can redistribute it and/or modify it under
203the same terms as Perl itself.
204
205=head1 AUTHOR
206
207Tara L Andrews E<lt>aurum@cpan.orgE<gt>