From: Tara L Andrews Date: Thu, 3 May 2012 10:57:11 +0000 (+0200) Subject: read lexeme info in GraphML parsing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=70745e700723084e7f980b0bf65ce8e134931f44;p=scpubgit%2Fstemmatology.git read lexeme info in GraphML parsing --- diff --git a/Makefile.PL b/Makefile.PL index 75a438a..949a95c 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -34,5 +34,6 @@ requires( 'YAML::XS' ); # For the morphology stuff requires( 'Lingua::TagSet::Multext' ); requires( 'Lingua::TagSet::TreeTagger' ); +requires( 'Lingua::Features::Structure' ); build_requires( 'Test::Warn' ); &WriteAll; diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 29a8aa1..f090b34 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -2,6 +2,7 @@ package Text::Tradition::Collation::Reading; use Moose; use Module::Load; +use Text::Tradition::Error; use YAML::XS; use overload '""' => \&_stringify, 'fallback' => 1; @@ -195,6 +196,14 @@ around BUILDARGS => sub { $class->$orig( $args ); }; +# Look for a lexeme-string argument in the build args. +sub BUILD { + my( $self, $args ) = @_; + if( exists $args->{'lexemes'} ) { + $self->_deserialize_lexemes( $args->{'lexemes'} ); + } +} + =head2 is_meta A meta attribute (ha ha), which should be true if any of our 'special' @@ -321,6 +330,7 @@ sub lemmatize { # For graph serialization. Return a string representation of the associated # reading lexemes. +# TODO Push this in to the Lexeme package. sub _serialize_lexemes { my $self = shift; my @lexstrs; @@ -335,7 +345,42 @@ sub _serialize_lexemes { } return join( '|R|', @lexstrs ); } - + +sub _deserialize_lexemes { + my( $self, $data ) = @_; + return unless $data; + + # Need to have the lexeme modules in order to have lexemes. + eval { + use Text::Tradition::Collation::Reading::Lexeme; + use Text::Tradition::Collation::Reading::WordForm; + }; + throw( $@ ) if $@; + + # Good to go - add the lexemes. + my @lexemes; + foreach my $lexdata ( split( /\|R\|/, $data ) ) { + my( $lang, $lstring, $form, $allforms ) = split( /\|L\|/, $lexdata ); + my @wfdata; + push( @wfdata, $form ) if $form; + push( @wfdata, split( /\|M\|/, $allforms ) ); + my @wforms; + foreach my $wd ( @wfdata ) { + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'serial' => $wd ); + push( @wforms, $wf ); + } + my %largs = ( 'language' => $lang, 'string' => $lstring ); + if( $form ) { + $largs{'form'} = shift @wforms; + $largs{'is_disambiguated'} = 1; + } + $largs{'wordform_matchlist'} = \@wforms; + push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new( %largs ) ); + } + $self->clear_lexemes; + $self->add_lexeme( @lexemes ); +} ## Utility methods @@ -344,7 +389,12 @@ sub TO_JSON { return $self->text; } -## TODO will need a throw() here +sub throw { + Text::Tradition::Error->throw( + 'ident' => 'Reading error', + 'message' => $_[0], + ); +} no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Text/Tradition/Collation/Reading/WordForm.pm b/lib/Text/Tradition/Collation/Reading/WordForm.pm index 4c81929..46a9f80 100644 --- a/lib/Text/Tradition/Collation/Reading/WordForm.pm +++ b/lib/Text/Tradition/Collation/Reading/WordForm.pm @@ -1,6 +1,7 @@ package Text::Tradition::Collation::Reading::WordForm; use Moose; +use Lingua::Features::Structure; =head1 NAME @@ -61,6 +62,19 @@ has 'morphology' => ( required => 1, ); +around BUILDARGS => sub { + my $orig = shift; + my $class = shift; + my $args = @_ == 1 ? $_[0] : { @_ }; + if( exists $args->{'serial'} ) { + my( $lang, $lemma, $morph ) = split( /\+\+/, delete $args->{'serial'} ); + $args->{'language'} = $lang; + $args->{'lemma'} = $lemma; + $args->{'morphology'} = Lingua::Features::Structure->from_string( $morph ); + } + $class->$orig( $args ); +}; + =head2 to_string Returns a string combination of language/lemma/morphology that can be used diff --git a/t/lexeme_serialize.t b/t/lexeme_serialize.t new file mode 100644 index 0000000..6bb714a --- /dev/null +++ b/t/lexeme_serialize.t @@ -0,0 +1,40 @@ +use lib 'lib'; +use strict; +use warnings; +use Test::More; +use Text::Tradition; + +eval "use Flemm"; +plan skip_all => "Flemm 3.1 required" if $@; + +binmode( STDOUT, ':utf8' ); +binmode( STDERR, ':utf8' ); + +my $tf = Text::Tradition->new( + 'input' => 'Self', + 'file' => 't/data/besoin.xml', + 'language' => 'French' ); + +$tf->lemmatize(); +my $graphmlstr = $tf->collation->as_graphml; +like( $graphmlstr, qr/graphml xmlns/, + "Serialized tradition after lemmatization" ); + +my $tf2 = Text::Tradition->new( + input => 'Self', + string => $graphmlstr, + language => 'French' ); + +is( ref $tf2, 'Text::Tradition', "Re-parsed tradition with lemmatization" ); +is( $tf->name, $tf2->name, "Traditions have same name" ); +foreach my $r ( $tf->collation->readings ) { + my $r2 = $tf2->collation->reading( $r->id ); + is( ref $r2, 'Text::Tradition::Collation::Reading', + "Reading $r exists in new tradition" ); + if( $r2 ) { + is( scalar $r->lexemes, scalar $r2->lexemes, + "Same number of lexemes in new tradition for $r" ); + } +} + +done_testing();