make the rest of the tests work with the new Witness
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / JSON.pm
CommitLineData
a731e73a 1package Text::Tradition::Parser::JSON;
2
3use strict;
4use warnings;
5use JSON qw/ from_json /;
6
7=head1 NAME
8
9Text::Tradition::Parser::JSON
10
11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $tradition = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'JSON',
18 'string' => $json_encoded_utf8,
19 );
20
21=head1 DESCRIPTION
22
23Parser module for Text::Tradition to read a JSON alignment table format such
24as that produced by CollateX.
25
26=head1 METHODS
27
28=head2 B<parse>( $tradition, $option_hash )
29
30Takes an initialized tradition and a set of options; creates the
31appropriate nodes and edges on the graph, as well as the appropriate
32witness objects. The $option_hash must contain either a 'file' or a
33'string' argument with the JSON structure to be parsed.
34
35The structure of the JSON is thus:
36
37 { alignment => [ { witness => "SIGIL",
38 tokens => [ { t => "TEXT" }, ... ] },
39 { witness => "SIG2",
40 tokens => [ { t => "TEXT" }, ... ] },
41 ... ],
42 };
43
44
45Longer lacunae in the text, to be disregarded in cladistic analysis, may be
46represented with the meta-reading '#LACUNA#'. Multiple lacuna tags in sequence
47are collapsed into a single multi-reading lacuna.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an extra layer of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59use_ok( 'Text::Tradition::Parser::JSON' );
60
61open( JSFILE, 't/data/cx16.json' );
62binmode JSFILE, ':utf8';
63my @lines = <JSFILE>;
64close JSFILE;
65
66my $t = Text::Tradition->new(
67 'name' => 'json',
68 'input' => 'JSON',
69 'string' => join( '', @lines ),
70);
71
72is( ref( $t ), 'Text::Tradition', "Parsed a JSON alignment" );
73if( $t ) {
74 is( scalar $t->collation->readings, 26, "Collation has all readings" );
75 is( scalar $t->collation->paths, 32, "Collation has all paths" );
76 is( scalar $t->witnesses, 3, "Collation has all witnesses" );
77}
78
b0b4421a 79my %seen_wits;
80map { $seen_wits{$_} = 0 } qw/ A B C /;
81# Check that we have the right witnesses
82foreach my $wit ( $t->witnesses ) {
83 $seen_wits{$wit->sigil} = 1;
84}
85is( scalar keys %seen_wits, 3, "No extra witnesses were made" );
86foreach my $k ( keys %seen_wits ) {
87 ok( $seen_wits{$k}, "Witness $k still exists" );
88}
89
90# Check that the witnesses have the right texts
91foreach my $wit ( $t->witnesses ) {
92 my $origtext = join( ' ', @{$wit->text} );
93 my $graphtext = $t->collation->path_text( $wit->sigil );
94 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
95}
96
a731e73a 97=end testing
98
99=cut
100
101sub parse {
102 my( $tradition, $opts ) = @_;
103 my $c = $tradition->collation;
104
105 my $table = from_json( $opts->{'string'} );
106
107 # Create the witnesses
30f0df34 108 my @witnesses; # Keep the ordered list of our witnesses
a731e73a 109 my %ac_wits; # Track these for later removal
110 foreach my $sigil ( map { $_->{'witness'} } @{$table->{'alignment'}} ) {
82fa4d57 111 my $wit = $tradition->add_witness(
112 'sigil' => $sigil, 'sourcetype' => 'collation' );
a731e73a 113 $wit->path( [ $c->start ] );
114 push( @witnesses, $wit );
115 my $aclabel = $c->ac_label;
116 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
b0b4421a 117 $ac_wits{$sigil} = $1;
a731e73a 118 }
119 }
b0b4421a 120
121 # Save the original witness text for consistency checking. We do this
122 # in a separate loop to make sure we have all base witnesses defined,
123 # and to make sure that our munging and comparing later doesn't affect
124 # the original text.
125 foreach my $intext ( @{$table->{'alignment'}} ) {
126 my $rs = $intext->{'witness'};
127 my $is_layer = exists $ac_wits{$rs};
128 my $wit = $tradition->witness( $is_layer ? $ac_wits{$rs} : $rs );
129 my @tokens = grep { $_ && $_->{'t'} !~ /^\#.*\#$/ } @{$intext->{'tokens'}};
130 my @words = map { _restore_punct( $_ ) } @tokens;
131 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
132 }
a731e73a 133
134 # Create the readings in each row
135 my $length = exists $table->{'length'}
136 ? $table->{'length'}
137 : scalar @{$table->{'alignment'}->[0]->{'tokens'}};
138
139 foreach my $idx ( 0 .. $length - 1 ) {
140 my @tokens = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
141 my @readings = make_nodes( $c, $idx, @tokens );
142 foreach my $w ( 0 .. $#readings ) {
143 # push the appropriate node onto the appropriate witness path
144 my $rdg = $readings[$w];
145 if( $rdg ) {
146 my $wit = $witnesses[$w];
147 push( @{$wit->path}, $rdg );
148 } # else skip it for empty readings.
149 }
150 }
151
152 # Collapse our lacunae into a single node and
153 # push the end node onto all paths.
154 $c->end->rank( $length );
155 foreach my $wit ( @witnesses ) {
156 my $p = $wit->path;
157 my $last_rdg = shift @$p;
158 my $new_p = [ $last_rdg ];
159 foreach my $rdg ( @$p ) {
160 # Omit the reading if we are in a lacuna already.
161 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
162 # Save the reading otherwise.
163 push( @$new_p, $rdg );
164 $last_rdg = $rdg;
165 }
166 push( @$new_p, $c->end );
167 $wit->path( $new_p );
168 }
169
170 # Fold any a.c. witnesses into their main witness objects, and
171 # delete the independent a.c. versions.
172 foreach my $a ( keys %ac_wits ) {
b0b4421a 173 my $ac_wit = $tradition->witness( $a );
174 my $main_wit = $tradition->witness( $ac_wits{$a} );
a731e73a 175 next unless $main_wit;
a731e73a 176 $main_wit->uncorrected_path( $ac_wit->path );
177 $tradition->del_witness( $ac_wit );
178 }
179
180 # Join up the paths.
181 $c->make_witness_paths;
182 # Delete our unused lacuna nodes.
183 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
184 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
185 }
202ccb18 186
187 # Note that our ranks and common readings are set.
188 $c->_graphcalc_done(1);
a731e73a 189}
190
191=head2 make_nodes( $collation, $index, @tokenlist )
192
193Create readings from the unique tokens in @tokenlist, and set their rank to
194$index. Returns an array of readings of the same size as the original @tokenlist.
195
196=cut
197
198sub make_nodes {
199 my( $c, $idx, @tokens ) = @_;
200 my %unique;
30f0df34 201 my @readings;
15db7774 202 my $commonctr = 0;
30f0df34 203 foreach my $j ( 0 .. $#tokens ) {
204 if( $tokens[$j] ) {
b0b4421a 205 my $word = _restore_punct( $tokens[$j] );
30f0df34 206 my $rdg;
b0b4421a 207 if( exists( $unique{$word} ) ) {
208 $rdg = $unique{$word};
30f0df34 209 } else {
210 my %args = ( 'id' => join( ',', $idx, $j+1 ),
15db7774 211 'rank' => $idx,
b0b4421a 212 'text' => $word,
30f0df34 213 'collation' => $c );
15db7774 214 if( $word eq '#LACUNA#' ) {
215 $args{'is_lacuna'} = 1
216 } else {
217 $commonctr++;
218 }
30f0df34 219 $rdg = Text::Tradition::Collation::Reading->new( %args );
b0b4421a 220 $unique{$word} = $rdg;
30f0df34 221 }
222 push( @readings, $rdg );
223 } else {
15db7774 224 $commonctr++;
30f0df34 225 push( @readings, undef );
226 }
a731e73a 227 }
15db7774 228 if( $commonctr == 1 ) {
229 # Whichever reading isn't a lacuna is a common node.
230 foreach my $rdg ( values %unique ) {
231 next if $rdg->is_lacuna;
232 $rdg->is_common( 1 );
233 }
234 }
a731e73a 235 map { $c->add_reading( $_ ) } values( %unique );
30f0df34 236 return @readings;
a731e73a 237}
238
b0b4421a 239# Utility function for parsing JSON from nCritic
240sub _restore_punct {
241 my( $token ) = @_;
242 my $word = $token->{'t'};
243 return $word unless exists $token->{'punctuation'};
244 foreach my $p ( sort { $a->{pos} <=> $b->{pos} } @{$token->{'punctuation'}} ) {
245 substr( $word, $p->{pos}, 0, $p->{char} );
246 }
247 return $word;
248}
249
a731e73a 2501;
251
252=head1 LICENSE
253
254This package is free software and is provided "as is" without express
255or implied warranty. You can redistribute it and/or modify it under
256the same terms as Perl itself.
257
258=head1 AUTHOR
259
260Tara L Andrews E<lt>aurum@cpan.orgE<gt>