Commit | Line | Data |
d9e873d0 |
1 | package Text::Tradition::Parser::Tabular; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Text::CSV_XS; |
6 | |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::Tabular |
10 | |
3b853983 |
11 | =head1 SYNOPSIS |
12 | |
13 | use Text::Tradition; |
14 | |
15 | my $t_from_file = Text::Tradition->new( |
16 | 'name' => 'my text', |
17 | 'input' => 'Tabular', |
18 | 'file' => '/path/to/collation.csv', |
19 | 'sep_char' => ',' |
20 | ); |
21 | |
22 | my $t_from_string = Text::Tradition->new( |
23 | 'name' => 'my text', |
24 | 'input' => 'Tabular', |
25 | 'string' => $tab_separated_collation, |
26 | 'sep_char' => "\t", |
27 | ); |
28 | |
d9e873d0 |
29 | =head1 DESCRIPTION |
30 | |
31 | Parser module for Text::Tradition to read an alignment table format, such as CSV. |
32 | |
33 | =head1 METHODS |
34 | |
e867486f |
35 | =head2 B<parse>( $tradition, $option_hash ) |
3b853983 |
36 | |
37 | Takes an initialized tradition and a set of options; creates the |
38 | appropriate nodes and edges on the graph, as well as the appropriate |
39 | witness objects. The $option_hash must contain either a 'file' or a |
40 | 'string' argument with the table to be parsed; it may also contain a |
41 | 'sep_char' argument to specify how the fields are separated (default: tab). |
42 | |
43 | The table should have witnesses arranged in columns, with the witness sigla |
44 | in the first row. Empty cells are interpreted as omissions (and thus |
45 | stemmatologically relevant). Longer lacunae in the text, to be disregarded |
46 | in cladistic analysis, may be represented by filling the appropriate cells |
47 | with the tag '#LACUNA#'. |
48 | |
49 | If a witness name ends in the collation's ac_label, it will be treated as |
50 | an 'ante-correction' version of the 'main' witness whose sigil it shares. |
51 | |
52 | =begin testing |
53 | |
54 | use Text::Tradition; |
55 | binmode STDOUT, ":utf8"; |
56 | binmode STDERR, ":utf8"; |
57 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
58 | |
59 | my $csv = 't/data/florilegium.csv'; |
60 | my $t = Text::Tradition->new( |
61 | 'name' => 'inline', |
62 | 'input' => 'Tabular', |
63 | 'file' => $csv, |
64 | 'sep_char' => ',', |
65 | ); |
d9e873d0 |
66 | |
3b853983 |
67 | is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); |
d9e873d0 |
68 | |
3b853983 |
69 | ### TODO Check these figures |
70 | if( $t ) { |
71 | is( scalar $t->collation->readings, 313, "Collation has all readings" ); |
72 | is( scalar $t->collation->paths, 2877, "Collation has all paths" ); |
73 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
74 | } |
75 | |
76 | =end testing |
d9e873d0 |
77 | |
78 | =cut |
79 | |
# parse( $tradition, $opts )
#
# Reads an alignment table (witnesses in columns, sigla in the first row)
# and populates the tradition's collation with readings and witness paths.
#
#   $tradition - the tradition object; its ->collation, ->add_witness,
#                ->witness and ->del_witness methods are used here.
#   $opts      - hashref holding either 'string' (the table text) or 'file'
#                (a path to read), plus an optional 'sep_char' (default: tab).
#
# Returns the result of $collation->make_witness_paths.  Returns nothing,
# after a warning, when neither 'string' nor 'file' is given or when the
# input file cannot be opened.
sub parse {
    my( $tradition, $opts ) = @_;
    my $c = $tradition->collation; # shorthand
    my $csv = Text::CSV_XS->new( {
        binary   => 1,  # binary for UTF-8
        sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t",
    } );

    # Load the table as an array of rows, each row an array of cells.
    my $alignment_table = [];
    if( exists $opts->{'string'} ) {
        foreach my $line ( split( /\n/, $opts->{'string'} ) ) {
            if( $csv->parse( $line ) ) {
                push( @$alignment_table, [ $csv->fields ] );
            } else {
                warn "Could not parse line $line: " . $csv->error_input;
            }
        }
    } elsif( exists $opts->{'file'} ) {
        my $file = $opts->{'file'};
        my $fh;
        # Three-argument open so the file name can never be read as a mode;
        # bail out on failure instead of carrying on with a dead handle.
        unless( open( $fh, '<', $file ) ) {
            warn "Could not open input file $file: $!";
            return;
        }
        binmode( $fh, ':utf8' );
        while( my $row = $csv->getline( $fh ) ) {
            push( @$alignment_table, $row );
        }
        # getline also returns false on a parse error; make that visible
        # rather than silently truncating the table.
        $csv->eof
            or warn "Could not parse input file $file: " . $csv->error_diag;
        close $fh;
    } else {
        warn "Could not find string or file option to parse";
        return;
    }

    # Set up the witnesses we find in the first line.
    my @witnesses;
    my %ac_wits;  # a.c. witnesses keyed by main sigil, for later removal
    my $aclabel = $c->ac_label;
    foreach my $sigil ( @{ $alignment_table->[0] } ) {
        my $wit = $tradition->add_witness( 'sigil' => $sigil );
        $wit->path( [ $c->start ] );
        push( @witnesses, $wit );
        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
            $ac_wits{$1} = $wit;
        }
    }

    # Now for the next rows, make nodes as necessary, assign their ranks,
    # and add them to the witness paths.  The row index doubles as the rank.
    foreach my $idx ( 1 .. $#{$alignment_table} ) {
        my $row = $alignment_table->[$idx];
        my $nodes = make_nodes( $c, $row, $idx );
        foreach my $col ( 0 .. $#{$row} ) {
            my $word = $row->[$col];
            # Empty cells are omissions and add nothing to the path.
            # NOTE(review): this truthiness test also skips a cell whose
            # text is '0'; make_nodes applies the same test, so the two
            # stay in step.
            next unless $word;
            push( @{ $witnesses[$col]->path }, $nodes->{$word} );
        }
    }

    # Collapse each run of lacuna placeholders into a single lacuna node
    # and push the end node onto all paths.
    # NOTE(review): witnesses that share a '#LACUNA#' cell in a row share
    # one placeholder reading, so add_reading / del_reading may be called
    # more than once with the same id -- confirm the collation tolerates it.
    $c->end->rank( scalar @$alignment_table );
    foreach my $wit ( @witnesses ) {
        my $old_path = $wit->path;
        my $last_rdg = shift @$old_path;    # the start node
        my $new_path = [ $last_rdg ];
        foreach my $rdg ( @$old_path ) {
            if( $rdg->text eq '#LACUNA#' ) {
                # If we are in a lacuna already, drop this node.
                # Otherwise make a lacuna node and drop this node.
                unless( $last_rdg->is_lacuna ) {
                    my $lacuna = $c->add_reading( {
                        'collation' => $c,
                        'id'        => $rdg->name,
                        'is_lacuna' => 1,
                    } );
                    push( @$new_path, $lacuna );
                    $last_rdg = $lacuna;
                }
                $c->del_reading( $rdg );
            } else {
                # No lacuna, save the reading.  Remember it as the latest
                # reading so that a later, separate lacuna in this witness
                # gets a node of its own instead of being dropped.
                push( @$new_path, $rdg );
                $last_rdg = $rdg;
            }
        }
        push( @$new_path, $c->end );
        $wit->path( $new_path );
    }

    # Fold any a.c. witnesses into their main witness objects, and
    # delete the independent a.c. versions.  An a.c. witness with no
    # matching main witness is left in place.
    foreach my $main_sigil ( keys %ac_wits ) {
        my $main_wit = $tradition->witness( $main_sigil );
        next unless $main_wit;
        my $ac_wit = $ac_wits{$main_sigil};
        $main_wit->uncorrected_path( $ac_wit->path );
        $tradition->del_witness( $ac_wit );
    }

    # Join up the paths.
    return $c->make_witness_paths;
}
185 | |
# make_nodes( $collation, $row, $index )
#
# Creates one reading per distinct non-empty text in a table row.
#
#   $collation - object whose add_reading method is called once for each
#                distinct text, with a hashref of reading arguments.
#   $row       - arrayref of cell texts, one cell per witness column.
#   $index     - row number; used as the reading's rank and as the first
#                half of its id.
#
# Returns a hashref mapping each distinct cell text to whatever add_reading
# returned for it.  Ids have the form "$index,$n", with $n counting from 1
# in order of first appearance in the row, so the same input always yields
# the same ids (iterating over hash keys would not guarantee that).
sub make_nodes {
    my( $collation, $row, $index ) = @_;
    my %reading_for;
    my $ctr = 0;
    foreach my $text ( @$row ) {
        # Skip empty cells.  NOTE(review): a cell whose text is '0' is
        # falsy and is skipped too; parse tests cells the same way.
        next unless $text;
        # Witnesses that agree in this row share a single reading.
        next if exists $reading_for{$text};
        $ctr++;
        $reading_for{$text} = $collation->add_reading( {
            'collation' => $collation,
            'id'        => "$index,$ctr",
            'rank'      => $index,
            'text'      => $text,
        } );
    }
    return \%reading_for;
}
206 | |
3b853983 |
207 | 1; |
208 | |
209 | =head1 LICENSE |
210 | |
211 | This package is free software and is provided "as is" without express |
212 | or implied warranty. You can redistribute it and/or modify it under |
213 | the same terms as Perl itself. |
214 | |
215 | =head1 AUTHOR |
216 | |
217 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |