Commit | Line | Data |
a731e73a |
1 | package Text::Tradition::Parser::JSON; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use JSON qw/ from_json /; |
6 | |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::JSON |
10 | |
11 | =head1 SYNOPSIS |
12 | |
13 | use Text::Tradition; |
14 | |
15 | my $tradition = Text::Tradition->new( |
16 | 'name' => 'my text', |
17 | 'input' => 'JSON', |
18 | 'string' => $json_encoded_utf8, |
19 | ); |
20 | |
21 | =head1 DESCRIPTION |
22 | |
23 | Parser module for Text::Tradition to read a JSON alignment table format such |
24 | as that produced by CollateX. |
25 | |
26 | =head1 METHODS |
27 | |
28 | =head2 B<parse>( $tradition, $option_hash ) |
29 | |
30 | Takes an initialized tradition and a set of options; creates the |
31 | appropriate nodes and edges on the graph, as well as the appropriate |
32 | witness objects. The $option_hash must contain either a 'file' or a |
33 | 'string' argument with the JSON structure to be parsed. |
34 | |
35 | The structure of the JSON is thus: |
36 | |
37 | { alignment => [ { witness => "SIGIL", |
38 | tokens => [ { t => "TEXT" }, ... ] }, |
39 | { witness => "SIG2", |
40 | tokens => [ { t => "TEXT" }, ... ] }, |
41 | ... ], |
42 | }; |
43 | |
44 | |
45 | Longer lacunae in the text, to be disregarded in cladistic analysis, may be |
46 | represented with the meta-reading '#LACUNA#'. Multiple lacuna tags in sequence |
47 | are collapsed into a single multi-reading lacuna. |
48 | |
49 | If a witness name ends in the collation's ac_label, it will be treated as |
50 | an extra layer of the 'main' witness whose sigil it shares. |
51 | |
52 | =begin testing |
53 | |
54 | use Text::Tradition; |
55 | binmode STDOUT, ":utf8"; |
56 | binmode STDERR, ":utf8"; |
57 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
58 | |
59 | use_ok( 'Text::Tradition::Parser::JSON' ); |
60 | |
61 | open( JSFILE, 't/data/cx16.json' ); |
62 | binmode JSFILE, ':utf8'; |
63 | my @lines = <JSFILE>; |
64 | close JSFILE; |
65 | |
66 | my $t = Text::Tradition->new( |
67 | 'name' => 'json', |
68 | 'input' => 'JSON', |
69 | 'string' => join( '', @lines ), |
70 | ); |
71 | |
72 | is( ref( $t ), 'Text::Tradition', "Parsed a JSON alignment" ); |
73 | if( $t ) { |
74 | is( scalar $t->collation->readings, 26, "Collation has all readings" ); |
75 | is( scalar $t->collation->paths, 32, "Collation has all paths" ); |
76 | is( scalar $t->witnesses, 3, "Collation has all witnesses" ); |
77 | } |
78 | |
b0b4421a |
79 | my %seen_wits; |
80 | map { $seen_wits{$_} = 0 } qw/ A B C /; |
81 | # Check that we have the right witnesses |
82 | foreach my $wit ( $t->witnesses ) { |
83 | $seen_wits{$wit->sigil} = 1; |
84 | } |
85 | is( scalar keys %seen_wits, 3, "No extra witnesses were made" ); |
86 | foreach my $k ( keys %seen_wits ) { |
87 | ok( $seen_wits{$k}, "Witness $k still exists" ); |
88 | } |
89 | |
90 | # Check that the witnesses have the right texts |
91 | foreach my $wit ( $t->witnesses ) { |
92 | my $origtext = join( ' ', @{$wit->text} ); |
93 | my $graphtext = $t->collation->path_text( $wit->sigil ); |
94 | is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); |
95 | } |
96 | |
a731e73a |
97 | =end testing |
98 | |
99 | =cut |
100 | |
101 | sub parse { |
102 | my( $tradition, $opts ) = @_; |
103 | my $c = $tradition->collation; |
104 | |
105 | my $table = from_json( $opts->{'string'} ); |
106 | |
107 | # Create the witnesses |
30f0df34 |
108 | my @witnesses; # Keep the ordered list of our witnesses |
a731e73a |
109 | my %ac_wits; # Track these for later removal |
110 | foreach my $sigil ( map { $_->{'witness'} } @{$table->{'alignment'}} ) { |
4889be4f |
111 | my $wit; |
7c2ed85e |
112 | if( $tradition->has_witness( $sigil ) ) { |
4889be4f |
113 | $wit = $tradition->witness( $sigil ); |
114 | $wit->is_collated( 1 ); |
115 | } else { |
116 | $wit = $tradition->add_witness( |
117 | 'sigil' => $sigil, 'sourcetype' => 'collation' ); |
118 | } |
a731e73a |
119 | $wit->path( [ $c->start ] ); |
120 | push( @witnesses, $wit ); |
121 | my $aclabel = $c->ac_label; |
122 | if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { |
b0b4421a |
123 | $ac_wits{$sigil} = $1; |
a731e73a |
124 | } |
125 | } |
b0b4421a |
126 | |
127 | # Save the original witness text for consistency checking. We do this |
128 | # in a separate loop to make sure we have all base witnesses defined, |
129 | # and to make sure that our munging and comparing later doesn't affect |
130 | # the original text. |
131 | foreach my $intext ( @{$table->{'alignment'}} ) { |
132 | my $rs = $intext->{'witness'}; |
133 | my $is_layer = exists $ac_wits{$rs}; |
134 | my $wit = $tradition->witness( $is_layer ? $ac_wits{$rs} : $rs ); |
135 | my @tokens = grep { $_ && $_->{'t'} !~ /^\#.*\#$/ } @{$intext->{'tokens'}}; |
136 | my @words = map { _restore_punct( $_ ) } @tokens; |
137 | $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); |
138 | } |
a731e73a |
139 | |
140 | # Create the readings in each row |
141 | my $length = exists $table->{'length'} |
142 | ? $table->{'length'} |
143 | : scalar @{$table->{'alignment'}->[0]->{'tokens'}}; |
144 | |
145 | foreach my $idx ( 0 .. $length - 1 ) { |
146 | my @tokens = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}}; |
147 | my @readings = make_nodes( $c, $idx, @tokens ); |
148 | foreach my $w ( 0 .. $#readings ) { |
149 | # push the appropriate node onto the appropriate witness path |
150 | my $rdg = $readings[$w]; |
151 | if( $rdg ) { |
152 | my $wit = $witnesses[$w]; |
153 | push( @{$wit->path}, $rdg ); |
154 | } # else skip it for empty readings. |
155 | } |
156 | } |
157 | |
158 | # Collapse our lacunae into a single node and |
159 | # push the end node onto all paths. |
160 | $c->end->rank( $length ); |
161 | foreach my $wit ( @witnesses ) { |
162 | my $p = $wit->path; |
163 | my $last_rdg = shift @$p; |
164 | my $new_p = [ $last_rdg ]; |
165 | foreach my $rdg ( @$p ) { |
166 | # Omit the reading if we are in a lacuna already. |
167 | next if $rdg->is_lacuna && $last_rdg->is_lacuna; |
168 | # Save the reading otherwise. |
169 | push( @$new_p, $rdg ); |
170 | $last_rdg = $rdg; |
171 | } |
172 | push( @$new_p, $c->end ); |
173 | $wit->path( $new_p ); |
174 | } |
175 | |
176 | # Fold any a.c. witnesses into their main witness objects, and |
177 | # delete the independent a.c. versions. |
178 | foreach my $a ( keys %ac_wits ) { |
b0b4421a |
179 | my $ac_wit = $tradition->witness( $a ); |
180 | my $main_wit = $tradition->witness( $ac_wits{$a} ); |
a731e73a |
181 | next unless $main_wit; |
a731e73a |
182 | $main_wit->uncorrected_path( $ac_wit->path ); |
183 | $tradition->del_witness( $ac_wit ); |
184 | } |
185 | |
186 | # Join up the paths. |
187 | $c->make_witness_paths; |
188 | # Delete our unused lacuna nodes. |
189 | foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) { |
190 | $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg ); |
191 | } |
202ccb18 |
192 | |
193 | # Note that our ranks and common readings are set. |
194 | $c->_graphcalc_done(1); |
a731e73a |
195 | } |
196 | |
197 | =head2 make_nodes( $collation, $index, @tokenlist ) |
198 | |
199 | Create readings from the unique tokens in @tokenlist, and set their rank to |
200 | $index. Returns an array of readings of the same size as the original @tokenlist. |
201 | |
202 | =cut |
203 | |
204 | sub make_nodes { |
205 | my( $c, $idx, @tokens ) = @_; |
206 | my %unique; |
30f0df34 |
207 | my @readings; |
15db7774 |
208 | my $commonctr = 0; |
30f0df34 |
209 | foreach my $j ( 0 .. $#tokens ) { |
210 | if( $tokens[$j] ) { |
b0b4421a |
211 | my $word = _restore_punct( $tokens[$j] ); |
30f0df34 |
212 | my $rdg; |
b0b4421a |
213 | if( exists( $unique{$word} ) ) { |
214 | $rdg = $unique{$word}; |
30f0df34 |
215 | } else { |
10e4b1ac |
216 | my %args = ( 'id' => 'r' . join( '.', $idx, $j+1 ), |
15db7774 |
217 | 'rank' => $idx, |
b0b4421a |
218 | 'text' => $word, |
30f0df34 |
219 | 'collation' => $c ); |
15db7774 |
220 | if( $word eq '#LACUNA#' ) { |
221 | $args{'is_lacuna'} = 1 |
222 | } else { |
223 | $commonctr++; |
224 | } |
30f0df34 |
225 | $rdg = Text::Tradition::Collation::Reading->new( %args ); |
b0b4421a |
226 | $unique{$word} = $rdg; |
30f0df34 |
227 | } |
228 | push( @readings, $rdg ); |
229 | } else { |
15db7774 |
230 | $commonctr++; |
30f0df34 |
231 | push( @readings, undef ); |
232 | } |
a731e73a |
233 | } |
15db7774 |
234 | if( $commonctr == 1 ) { |
235 | # Whichever reading isn't a lacuna is a common node. |
236 | foreach my $rdg ( values %unique ) { |
237 | next if $rdg->is_lacuna; |
238 | $rdg->is_common( 1 ); |
239 | } |
240 | } |
a731e73a |
241 | map { $c->add_reading( $_ ) } values( %unique ); |
30f0df34 |
242 | return @readings; |
a731e73a |
243 | } |
244 | |
b0b4421a |
245 | # Utility function for parsing JSON from nCritic |
246 | sub _restore_punct { |
247 | my( $token ) = @_; |
248 | my $word = $token->{'t'}; |
249 | return $word unless exists $token->{'punctuation'}; |
250 | foreach my $p ( sort { $a->{pos} <=> $b->{pos} } @{$token->{'punctuation'}} ) { |
251 | substr( $word, $p->{pos}, 0, $p->{char} ); |
252 | } |
253 | return $word; |
254 | } |
255 | |
a731e73a |
256 | 1; |
257 | |
258 | =head1 LICENSE |
259 | |
260 | This package is free software and is provided "as is" without express |
261 | or implied warranty. You can redistribute it and/or modify it under |
262 | the same terms as Perl itself. |
263 | |
264 | =head1 AUTHOR |
265 | |
266 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |