base/lib/Text/Tradition/Parser/JSON.pm

   1 package Text::Tradition::Parser::JSON;
   2
   3 use strict;
   4 use warnings;
   5 use JSON qw/ from_json /;
   6
   7 =head1 NAME
   8
   9 Text::Tradition::Parser::JSON
  10
  11 =head1 SYNOPSIS
  12
  13   use Text::Tradition;
  14
  15   my $tradition = Text::Tradition->new(
  16     'name' => 'my text',
  17     'input' => 'JSON',
  18     'string' => $json_encoded_utf8,
  19     );
  20
  21 =head1 DESCRIPTION
  22
  23 Parser module for Text::Tradition to read a JSON alignment table format such
  24 as that produced by CollateX.
  25
  26 =head1 METHODS
  27
  28 =head2 B<parse>( $tradition, $option_hash )
  29
  30 Takes an initialized tradition and a set of options; creates the
  31 appropriate nodes and edges on the graph, as well as the appropriate
  32 witness objects.  The $option_hash must contain either a 'file' or a
  33 'string' argument with the JSON structure to be parsed.
  34
  35 The structure of the JSON is thus:
  36
  37  { alignment => [ { witness => "SIGIL",
  38                     tokens => [ { t => "TEXT" }, ... ] },
  39                   { witness => "SIG2",
  40                     tokens => [ { t => "TEXT" }, ... ] },
  41                     ... ],
  42  };
  43
  44
  45 Longer lacunae in the text, to be disregarded in cladistic analysis, may be
  46 represented with the meta-reading '#LACUNA#'.  Multiple lacuna tags in sequence
  47 are collapsed into a single multi-reading lacuna.
  48
  49 If a witness name ends in the collation's ac_label, it will be treated as
  50 an extra layer of the 'main' witness whose sigil it shares.
  51
  52 =begin testing
  53
  54 use Text::Tradition;
  55 binmode STDOUT, ":utf8";
  56 binmode STDERR, ":utf8";
  57 eval { no warnings; binmode $DB::OUT, ":utf8"; };
  58
  59 use_ok( 'Text::Tradition::Parser::JSON' );
  60
  61 open( JSFILE, 't/data/cx16.json' );
  62 binmode JSFILE, ':utf8';
  63 my @lines = <JSFILE>;
  64 close JSFILE;
  65
  66 my $t = Text::Tradition->new(
  67     'name' => 'json',
  68     'input' => 'JSON',
  69     'string' => join( '', @lines ),
  70 );
  71
  72 is( ref( $t ), 'Text::Tradition', "Parsed a JSON alignment" );
  73 if( $t ) {
  74     is( scalar $t->collation->readings, 26, "Collation has all readings" );
  75     is( scalar $t->collation->paths, 32, "Collation has all paths" );
  76     is( scalar $t->witnesses, 3, "Collation has all witnesses" );
  77 }
  78
  79 my %seen_wits;
  80 map { $seen_wits{$_} = 0 } qw/ A B C /;
  81 # Check that we have the right witnesses
  82 foreach my $wit ( $t->witnesses ) {
  83         $seen_wits{$wit->sigil} = 1;
  84 }
  85 is( scalar keys %seen_wits, 3, "No extra witnesses were made" );
  86 foreach my $k ( keys %seen_wits ) {
  87         ok( $seen_wits{$k}, "Witness $k still exists" );
  88 }
  89
  90 # Check that the witnesses have the right texts
  91 foreach my $wit ( $t->witnesses ) {
  92         my $origtext = join( ' ', @{$wit->text} );
  93         my $graphtext = $t->collation->path_text( $wit->sigil );
  94         is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
  95 }
  96
  97 =end testing
  98
  99 =cut
 100
 101 sub parse {
 102         my( $tradition, $opts ) = @_;
 103         my $c = $tradition->collation;
 104
 105         my $table = from_json( $opts->{'string'} );
 106
 107         # Create the witnesses
 108         my @witnesses; # Keep the ordered list of our witnesses
 109     my %ac_wits;  # Track these for later removal
 110     foreach my $sigil ( map { $_->{'witness'} } @{$table->{'alignment'}} ) {
 111         my $wit;
 112         if( $tradition->has_witness( $sigil ) ) {
 113                 $wit = $tradition->witness( $sigil );
 114                 $wit->is_collated( 1 );
 115         } else {
 116                         $wit = $tradition->add_witness(
 117                                 'sigil' => $sigil, 'sourcetype' => 'collation' );
 118                 }
 119         $wit->path( [ $c->start ] );
 120         push( @witnesses, $wit );
 121         my $aclabel = $c->ac_label;
 122         if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
 123             $ac_wits{$sigil} = $1;
 124         }
 125     }
 126
 127     # Save the original witness text for consistency checking. We do this
 128     # in a separate loop to make sure we have all base witnesses defined,
 129     # and to make sure that our munging and comparing later doesn't affect
 130     # the original text.
 131     foreach my $intext ( @{$table->{'alignment'}} ) {
 132         my $rs = $intext->{'witness'};
 133         my $is_layer = exists $ac_wits{$rs};
 134         my $wit = $tradition->witness( $is_layer ? $ac_wits{$rs} : $rs );
 135         my @tokens = grep { $_ && $_->{'t'} !~ /^\#.*\#$/ } @{$intext->{'tokens'}};
 136         my @words = map { _restore_punct( $_ ) } @tokens;
 137         $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
 138         }
 139
 140         # Create the readings in each row
 141     my $length = exists $table->{'length'}
 142         ? $table->{'length'}
 143         : scalar @{$table->{'alignment'}->[0]->{'tokens'}};
 144
 145     foreach my $idx ( 0 .. $length - 1 ) {
 146         my @tokens = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
 147         my @readings = make_nodes( $c, $idx, @tokens );
 148         foreach my $w ( 0 .. $#readings ) {
 149             # push the appropriate node onto the appropriate witness path
 150             my $rdg = $readings[$w];
 151             if( $rdg ) {
 152                 my $wit = $witnesses[$w];
 153                 push( @{$wit->path}, $rdg );
 154             } # else skip it for empty readings.
 155         }
 156     }
 157
 158     # Collapse our lacunae into a single node and
 159     # push the end node onto all paths.
 160     $c->end->rank( $length );
 161     foreach my $wit ( @witnesses ) {
 162         my $p = $wit->path;
 163         my $last_rdg = shift @$p;
 164         my $new_p = [ $last_rdg ];
 165         foreach my $rdg ( @$p ) {
 166                 # Omit the reading if we are in a lacuna already.
 167                 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
 168                         # Save the reading otherwise.
 169                         push( @$new_p, $rdg );
 170                         $last_rdg = $rdg;
 171         }
 172         push( @$new_p, $c->end );
 173         $wit->path( $new_p );
 174     }
 175
 176     # Fold any a.c. witnesses into their main witness objects, and
 177     # delete the independent a.c. versions.
 178     foreach my $a ( keys %ac_wits ) {
 179         my $ac_wit = $tradition->witness( $a );
 180         my $main_wit = $tradition->witness( $ac_wits{$a} );
 181         next unless $main_wit;
 182         $main_wit->uncorrected_path( $ac_wit->path );
 183         $tradition->del_witness( $ac_wit );
 184     }
 185
 186     # Join up the paths.
 187     $c->make_witness_paths;
 188     # Delete our unused lacuna nodes.
 189         foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
 190                 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
 191         }
 192
 193         # Note that our ranks and common readings are set.
 194         $c->_graphcalc_done(1);
 195 }
 196
 197 =head2 make_nodes( $collation, $index, @tokenlist )
 198
 199 Create readings from the unique tokens in @tokenlist, and set their rank to
 200 $index.  Returns an array of readings of the same size as the original @tokenlist.
 201
 202 =cut
 203
 204 sub make_nodes {
 205         my( $c, $idx, @tokens ) = @_;
 206         my %unique;
 207         my @readings;
 208         my $commonctr = 0;
 209         foreach my $j ( 0 .. $#tokens ) {
 210                 if( $tokens[$j] ) {
 211                         my $word = _restore_punct( $tokens[$j] );
 212                         my $rdg;
 213                         if( exists( $unique{$word} ) ) {
 214                                 $rdg = $unique{$word};
 215                         } else {
 216                                 my %args = ( 'id' => 'r' . join( '.', $idx, $j+1 ),
 217                                         'rank' => $idx,
 218                                         'text' => $word,
 219                                         'collation' => $c );
 220                                 if( $word eq '#LACUNA#' ) {
 221                                         $args{'is_lacuna'} = 1
 222                                 } else {
 223                                         $commonctr++;
 224                                 }
 225                                 $rdg = Text::Tradition::Collation::Reading->new( %args );
 226                                 $unique{$word} = $rdg;
 227                         }
 228                         push( @readings, $rdg );
 229                 } else {
 230                         $commonctr++;
 231                         push( @readings, undef );
 232                 }
 233         }
 234         if( $commonctr == 1 ) {
 235                 # Whichever reading isn't a lacuna is a common node.
 236                 foreach my $rdg ( values %unique ) {
 237                         next if $rdg->is_lacuna;
 238                         $rdg->is_common( 1 );
 239                 }
 240         }
 241         map { $c->add_reading( $_ ) } values( %unique );
 242         return @readings;
 243 }
 244
 245 # Utility function for parsing JSON from nCritic
 246 sub _restore_punct {
 247         my( $token ) = @_;
 248         my $word = $token->{'t'};
 249         return $word unless exists $token->{'punctuation'};
 250         foreach my $p ( sort { $a->{pos} <=> $b->{pos} } @{$token->{'punctuation'}} ) {
 251                 substr( $word, $p->{pos}, 0, $p->{char} );
 252         }
 253         return $word;
 254 }
 255
 256 1;
 257
 258 =head1 LICENSE
 259
 260 This package is free software and is provided "as is" without express
 261 or implied warranty.  You can redistribute it and/or modify it under
 262 the same terms as Perl itself.
 263
 264 =head1 AUTHOR
 265
 266 Tara L Andrews E<lt>aurum@cpan.orgE<gt>