From: Tara L Andrews Date: Tue, 17 Jan 2012 20:18:20 +0000 (+0100) Subject: add JSON alignment table parsing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a731e73a29eb1faeee2db782961c8ebe1f67239f;p=scpubgit%2Fstemmatology.git add JSON alignment table parsing --- diff --git a/Makefile.PL b/Makefile.PL index a03074d..acd2439 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -12,6 +12,7 @@ requires( 'File::Which' ); requires( 'Graph' ); requires( 'Graph::Reader::Dot' ); requires( 'IPC::Run' ); +requires( 'JSON' ); requires( 'KiokuDB::Backend::DBI' ); requires( 'KiokuDB::GC::Naive' ); requires( 'KiokuDB::TypeMap' ); diff --git a/TODO b/TODO index 950e7ea..6fe8703 100644 --- a/TODO +++ b/TODO @@ -1,2 +1,3 @@ * support and test deletion of a tradition * support and test deletion of a relationship +* make proper tests for Collation::Reading diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 7c6f58e..8abeb6a 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -128,6 +128,8 @@ following: =item * CTE - a TEI XML format produced by Classical Text Editor +=item * JSON - an alignment table in JSON format, as produced by CollateX and other tools + =item * KUL - a specific CSV format for variants, not documented here =item * TEI - a TEI parallel segmentation format file @@ -254,7 +256,7 @@ sub BUILD { $self->_save_collation( $collation ); # Call the appropriate parser on the given data - my @format_standalone = qw/ Self CollateText CollateX CTE TEI Tabular /; + my @format_standalone = qw/ Self CollateText CollateX CTE JSON TEI Tabular /; my @format_basetext = qw/ KUL /; my $use_base; my $format = $init_args->{'input'}; diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 0b57ff8..6664fee 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -139,9 +139,27 @@ around BUILDARGS => sub { $args = { @_ }; } + # Did we get a JSON token to parse into a reading? If so, massage it. + if( exists $args->{'json'} ) { + my $j = delete $args->{'json'}; + + # If we have separated punctuation and don't want it, restore it. + if( exists $j->{'punctuation'} + && exists $args->{'separate_punctuation'} + && !$args->{'separate_punctuation'} ) { + $args->{'text'} = _restore_punct( $j->{'t'}, $j->{'punctuation'} ); + + # In all other cases, keep text and punct as they are. + } else { + $args->{'text'} = $j->{'t'}; + # we don't use comparison or canonical forms here + $args->{'punctuation'} = $j->{'punctuation'} + if exists $j->{'punctuation'}; + } + } + # If one of our special booleans is set, we change the text and the # ID to match. - if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) { $args->{'text'} = '#LACUNA#'; } elsif( exists $args->{'is_start'} ) { @@ -161,7 +179,8 @@ around BUILDARGS => sub { # Post-process the given text, stripping punctuation if we are asked. sub BUILD { my $self = shift; - if( $self->separate_punctuation && !$self->is_meta ) { + if( $self->separate_punctuation && !$self->is_meta + && !$self->punctuation ) { my $pos = 0; my $wspunct = ''; # word sans punctuation foreach my $char ( split( //, $self->text ) ) { @@ -178,12 +197,16 @@ sub BUILD { sub punctuated_form { my $self = shift; - my $word = $self->text; - foreach my $p ( sort { $a->{pos} <=> $b->{pos} } $self->punctuation ) { + return _restore_punct( $self->text, $self->punctuation ); +} + +sub _restore_punct { + my( $word, @punct ) = @_; + foreach my $p ( sort { $a->{pos} <=> $b->{pos} } @punct ) { substr( $word, $p->{pos}, 0, $p->{char} ); } return $word; -} +} =head2 is_meta diff --git a/lib/Text/Tradition/Parser/JSON.pm b/lib/Text/Tradition/Parser/JSON.pm new file mode 100644 index 0000000..8d0aa8c --- /dev/null +++ b/lib/Text/Tradition/Parser/JSON.pm @@ -0,0 +1,191 @@ +package Text::Tradition::Parser::JSON; + +use strict; +use warnings; +use JSON qw/ from_json /; + +=head1 NAME + +Text::Tradition::Parser::JSON + +=head1 SYNOPSIS + + use Text::Tradition; + + my $tradition = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'JSON', + 'string' => $json_encoded_utf8, + ); + +=head1 DESCRIPTION + +Parser module for Text::Tradition to read a JSON alignment table format such +as that produced by CollateX. + +=head1 METHODS + +=head2 B( $tradition, $option_hash ) + +Takes an initialized tradition and a set of options; creates the +appropriate nodes and edges on the graph, as well as the appropriate +witness objects. The $option_hash must contain either a 'file' or a +'string' argument with the JSON structure to be parsed. + +The structure of the JSON is thus: + + { alignment => [ { witness => "SIGIL", + tokens => [ { t => "TEXT" }, ... ] }, + { witness => "SIG2", + tokens => [ { t => "TEXT" }, ... ] }, + ... ], + }; + + +Longer lacunae in the text, to be disregarded in cladistic analysis, may be +represented with the meta-reading '#LACUNA#'. Multiple lacuna tags in sequence +are collapsed into a single multi-reading lacuna. + +If a witness name ends in the collation's ac_label, it will be treated as +an extra layer of the 'main' witness whose sigil it shares. + +=begin testing + +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +use_ok( 'Text::Tradition::Parser::JSON' ); + +open( JSFILE, 't/data/cx16.json' ); +binmode JSFILE, ':utf8'; +my @lines = ; +close JSFILE; + +my $t = Text::Tradition->new( + 'name' => 'json', + 'input' => 'JSON', + 'string' => join( '', @lines ), +); + +is( ref( $t ), 'Text::Tradition', "Parsed a JSON alignment" ); +if( $t ) { + is( scalar $t->collation->readings, 26, "Collation has all readings" ); + is( scalar $t->collation->paths, 32, "Collation has all paths" ); + is( scalar $t->witnesses, 3, "Collation has all witnesses" ); +} + +=end testing + +=cut + +sub parse { + my( $tradition, $opts ) = @_; + my $c = $tradition->collation; + + my $table = from_json( $opts->{'string'} ); + + # Create the witnesses + my @witnesses; + my %ac_wits; # Track these for later removal + foreach my $sigil ( map { $_->{'witness'} } @{$table->{'alignment'}} ) { + my $wit = $tradition->add_witness( 'sigil' => $sigil ); + $wit->path( [ $c->start ] ); + push( @witnesses, $wit ); + my $aclabel = $c->ac_label; + if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { + $ac_wits{$1} = $wit; + } + } + + # Create the readings in each row + my $length = exists $table->{'length'} + ? $table->{'length'} + : scalar @{$table->{'alignment'}->[0]->{'tokens'}}; + + foreach my $idx ( 0 .. $length - 1 ) { + my @tokens = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}}; + my @readings = make_nodes( $c, $idx, @tokens ); + foreach my $w ( 0 .. $#readings ) { + # push the appropriate node onto the appropriate witness path + my $rdg = $readings[$w]; + if( $rdg ) { + my $wit = $witnesses[$w]; + push( @{$wit->path}, $rdg ); + } # else skip it for empty readings. + } + } + + # Collapse our lacunae into a single node and + # push the end node onto all paths. + $c->end->rank( $length ); + foreach my $wit ( @witnesses ) { + my $p = $wit->path; + my $last_rdg = shift @$p; + my $new_p = [ $last_rdg ]; + foreach my $rdg ( @$p ) { + # Omit the reading if we are in a lacuna already. + next if $rdg->is_lacuna && $last_rdg->is_lacuna; + # Save the reading otherwise. + push( @$new_p, $rdg ); + $last_rdg = $rdg; + } + push( @$new_p, $c->end ); + $wit->path( $new_p ); + } + + # Fold any a.c. witnesses into their main witness objects, and + # delete the independent a.c. versions. + foreach my $a ( keys %ac_wits ) { + my $main_wit = $tradition->witness( $a ); + next unless $main_wit; + my $ac_wit = $ac_wits{$a}; + $main_wit->uncorrected_path( $ac_wit->path ); + $tradition->del_witness( $ac_wit ); + } + + # Join up the paths. + $c->make_witness_paths; + # Delete our unused lacuna nodes. + foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) { + $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg ); + } +} + +=head2 make_nodes( $collation, $index, @tokenlist ) + +Create readings from the unique tokens in @tokenlist, and set their rank to +$index. Returns an array of readings of the same size as the original @tokenlist. + +=cut + +sub make_nodes { + my( $c, $idx, @tokens ) = @_; + my %unique; + my $ctr = 1; + foreach my $t ( @tokens ) { + next unless $t; + my $id = join( ',', $idx, $ctr++ ); + my $rdg = Text::Tradition::Collation::Reading->new( + 'id' => $id, 'json' => $t, 'collation' => $c ); + my $comptoken = $c->collapse_punctuation ? $rdg->text + : $rdg->punctuated_form; + $unique{$comptoken} = $rdg; + $t->{'comptoken'} = $comptoken; + } + map { $c->add_reading( $_ ) } values( %unique ); + return map { $_ && $unique{$_->{'comptoken'}} } @tokens; +} + +1; + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/t/data/cx16.json b/t/data/cx16.json new file mode 100644 index 0000000..e0e73a9 --- /dev/null +++ b/t/data/cx16.json @@ -0,0 +1 @@ +{"length":18,"alignment":[{"witness":"A","tokens":[{"t":"when"},{"t":"april"},{"t":"with"},{"t":"his"},{"t":"showers"},{"t":"sweet"},{"t":"with"},null,{"t":"fruit"},{"t":"the"},{"t":"drought"},{"t":"of"},{"t":"march"},{"t":"has"},{"t":"pierced"},{"t":"unto"},{"t":"the"},{"t":"root"}]},{"witness":"B","tokens":[{"t":"when"},null,null,null,{"t":"showers"},{"t":"sweet"},{"t":"with"},{"t":"april"},{"t":"fruit"},{"t":"the"},{"t":"march"},{"t":"of"},{"t":"drought"},{"t":"has"},{"t":"pierced"},{"t":"to"},{"t":"the"},{"t":"root"}]},{"witness":"C","tokens":[{"t":"when"},null,null,null,{"t":"showers"},{"t":"sweet"},{"t":"with"},{"t":"april"},{"t":"fruit"},{"t":"teh"},{"t":"drought"},{"t":"of"},{"t":"march"},{"t":"has"},{"t":"pierced"},{"t":"teh"},{"t":"rood"},null]}]} \ No newline at end of file diff --git a/t/text_tradition_parser_json.t b/t/text_tradition_parser_json.t new file mode 100644 index 0000000..658c585 --- /dev/null +++ b/t/text_tradition_parser_json.t @@ -0,0 +1,40 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +use_ok( 'Text::Tradition::Parser::JSON' ); + +open( JSFILE, 't/data/cx16.json' ); +binmode JSFILE, ':utf8'; +my @lines = ; +close JSFILE; + +my $t = Text::Tradition->new( + 'name' => 'json', + 'input' => 'JSON', + 'string' => join( '', @lines ), +); + +is( ref( $t ), 'Text::Tradition', "Parsed a JSON alignment" ); +if( $t ) { + is( scalar $t->collation->readings, 26, "Collation has all readings" ); + is( scalar $t->collation->paths, 32, "Collation has all paths" ); + is( scalar $t->witnesses, 3, "Collation has all witnesses" ); +} +} + + + + +1;