From: Tara L Andrews Date: Wed, 2 May 2012 13:28:03 +0000 (+0200) Subject: make French morphology use Lingua objects; add tests X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6ad2ce785dc85432a6227c7b2efdae0364959714;p=scpubgit%2Fstemmatology.git make French morphology use Lingua objects; add tests --- diff --git a/Makefile.PL b/Makefile.PL index ba2ecf5..cf2a5e2 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -30,5 +30,8 @@ requires( 'TryCatch' ); requires( 'XML::Easy::Syntax' ); requires( 'XML::LibXML' ); requires( 'XML::LibXML::XPathContext' ); +# For the morphology stuff +requires( 'Lingua::TagSet::Multext' ); +requires( 'Lingua::TagSet::TreeTagger' ); build_requires( 'Test::Warn' ); &WriteAll; diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index b246c7e..cc26b0e 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -284,6 +284,9 @@ sub add_reading { my( $self, $reading ) = @_; unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) { my %args = %$reading; + if( $self->tradition->has_language && !exists $args{'language'} ) { + $args{'language'} = $self->tradition->language; + } $reading = Text::Tradition::Collation::Reading->new( 'collation' => $self, %args ); @@ -1386,21 +1389,13 @@ sub path_text { $start = $self->start unless $start; $end = $self->end unless $end; my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit ); - return $self->_text_from_path( @path ); -} - -# Utility function so that we can cheat and use it when we need both the path -# and its text. -sub _text_from_path { - my( $self, @path ) = @_; my $pathtext = ''; my $last; foreach my $r ( @path ) { - if( $r->join_prior || !$last || $last->join_next ) { - $pathtext .= $r->text; - } else { - $pathtext .= ' ' . $r->text; - } + unless ( $r->join_prior || !$last || $last->join_next ) { + $pathtext .= ' '; + } + $pathtext .= $r->text; $last = $r; } return $pathtext; diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 11ec36e..4bdaecb 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -1,6 +1,7 @@ package Text::Tradition::Collation::Reading; use Moose; +use Module::Load; use overload '""' => \&_stringify, 'fallback' => 1; =head1 NAME @@ -89,7 +90,7 @@ has 'text' => ( has 'language' => ( is => 'ro', isa => 'Str', - default => 'Default', + predicate => 'has_language', ); has 'is_start' => ( @@ -277,47 +278,29 @@ sub _stringify { A few methods to try to tack on morphological information. -=head2 use_lexemes +=head2 lexemes -TBD +=head2 has_lexemes + +=head2 clear_lexemes + +=head2 add_lexeme + +=head2 lemmatize =cut -# sub use_lexemes { -# my( $self, @lexemes ) = @_; -# # The lexemes need to be the same as $self->text. -# my $cmpstr = $self->has_normal_form ? lc( $self->normal_form ) : lc( $self->text ); -# $cmpstr =~ s/[\s-]+//g; -# my $lexstr = lc( join( '', @lexemes ) ); -# $lexstr =~ s/[\s-]+//g; -# unless( $lexstr eq $cmpstr ) { -# warn "Cannot split " . $self->text . " into " . join( '.', @lexemes ); -# return; -# } -# $self->_clear_morph; -# map { $self->_add_morph( { $_ => [] } ) } @lexemes; -# } -# -# sub add_morphological_tag { -# my( $self, $lexeme, $opts ) = @_; -# my $struct; -# unless( $opts ) { -# # No lexeme was passed; use reading text. -# $opts = $lexeme; -# $lexeme = $self->text; -# $self->use_lexemes( $lexeme ); -# } -# # Get the correct container -# ( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes; -# unless( $struct ) { -# warn "No lexeme $lexeme exists in this reading"; -# return; -# } -# # Now make the morph object and add it to this lexeme. -# my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts ); -# # TODO Check for existence -# push( @{$struct->{$lexeme}}, $morph_obj ); -# } +sub lemmatize { + my $self = shift; + unless( $self->has_language ) { + warn "Please set a language to lemmatize a tradition"; + return; + } + my $mod = "Text::Tradition::Language::" . $self->language; + load( $mod ); + $mod->can( 'reading_lookup' )->( $self ); + +} ## Utility methods @@ -331,87 +314,4 @@ sub TO_JSON { no Moose; __PACKAGE__->meta->make_immutable; -################################################### -### Morphology objects, to be attached to readings -################################################### - -package Text::Tradition::Collation::Reading::Morphology; - -use Moose; - -has 'lemma' => ( - is => 'ro', - isa => 'Str', - required => 1, - ); - -has 'code' => ( - is => 'ro', - isa => 'Str', - required => 1, - ); - -has 'language' => ( - is => 'ro', - isa => 'Str', - required => 1, - ); - -## Transmute codes into comparison arrays for our various languages. - -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - my $args; - if( @_ == 1 && ref( $_[0] ) ) { - $args = shift; - } else { - $args = { @_ }; - } - if( exists( $args->{'serial'} ) ) { - my( $lemma, $code ) = split( /!!/, delete $args->{'serial'} ); - $args->{'lemma'} = $lemma; - $args->{'code'} = $code; - } - $class->$orig( $args ); -}; - -sub serialization { - my $self = shift; - return join( '!!', $self->lemma, $self->code ); -}; - -sub comparison_array { - my $self = shift; - if( $self->language eq 'French' ) { - my @array; - my @bits = split( /\+/, $self->code ); - # First push the non k/v parts. - while( @bits && $bits[0] !~ /=/ ) { - push( @array, shift @bits ); - } - while( @array < 2 ) { - push( @array, undef ); - } - # Now push the k/v parts in a known order. - my @fields = qw/ Pers Nb Temps Genre Spec Fonc /; - my %props; - map { my( $k, $v ) = split( /=/, $_ ); $props{$k} = $v; } @bits; - foreach my $k ( @fields ) { - push( @array, $props{$k} ); - } - # Give the answer. - return @array; - } elsif( $self->language eq 'English' ) { - # Do something as yet undetermined - } else { - # Latin or Greek or Armenian, just split the chars - return split( '', $self->code ); - } -}; - -no Moose; -__PACKAGE__->meta->make_immutable; - 1; - diff --git a/lib/Text/Tradition/Collation/Reading/WordForm.pm b/lib/Text/Tradition/Collation/Reading/WordForm.pm index 8f519f9..4c81929 100644 --- a/lib/Text/Tradition/Collation/Reading/WordForm.pm +++ b/lib/Text/Tradition/Collation/Reading/WordForm.pm @@ -57,27 +57,22 @@ has 'lemma' => ( has 'morphology' => ( is => 'ro', - isa => 'ArrayRef', + isa => 'Lingua::Features::Structure', required => 1, ); -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - my %args = @_ == 1 ? %{$_[0]} : @_; - unless( ref( $args{'morphology'} ) ) { - my @morph = split( '', $args{'morphology'} ); - $args{'morphology'} = \@morph; - } - $class->$orig( %args ); -}; - -sub _stringify { +=head2 to_string + +Returns a string combination of language/lemma/morphology that can be used +in equivalence testing. + +=cut + +sub to_string { my $self = shift; - return sprintf( "%s//%s//%s", $self->language, $self->lemma, - join( '|', @{$self->morphology} ) ); + return join( '++', $self->language, $self->lemma, $self->morphology->to_string ); } - + no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm index 8028c98..780b9de 100644 --- a/lib/Text/Tradition/Language/French.pm +++ b/lib/Text/Tradition/Language/French.pm @@ -1,7 +1,9 @@ package Text::Tradition::Language::French; use Encode qw/ encode_utf8 decode_utf8 /; -use IPC::Run qw/ run binary /; +use IPC::Run qw/ run /; +use Lingua::TagSet::Multext; +use Lingua::TagSet::TreeTagger; use Module::Load; use Text::Tradition::Collation::Reading::Lexeme; use Text::Tradition::Collation::Reading::WordForm; @@ -11,11 +13,16 @@ my $MORPHDIR = '/Users/tla/Projects/morphology'; =head1 NAME -Text::Tradition::Language::French - language-specific modules for French +Text::Tradition::Language::French - language-specific module for French =head1 DESCRIPTION -Implements morphology lookup for French words in context. +Implements morphology lookup for French words in context. This module +depends on the Flemm module for French lemmatization +(L in conjunction with +the TreeTagger software +(L), which is +(for now) expected to be installed in $MORPHDIR/TreeTagger. =head1 SUBROUTINES @@ -23,6 +30,55 @@ Implements morphology lookup for French words in context. Evaluates the string using the Flemm package, and returns the results. +=begin testing + +binmode STDOUT, ':utf8'; +use Text::Tradition; +use_ok( 'Text::Tradition::Language::French' ); + +eval "use Flemm"; +my $err = $@; + +SKIP: { + skip "Package Flemm not found" if $err; + my $tf = Text::Tradition->new( + 'input' => 'Self', + 'file' => 't/data/besoin.xml', + 'language' => 'French' ); + + is( $tf->language, 'French', "Set language okay" ); + $tf->lemmatize(); + # Test the lemmatization. How many readings now have morphological info? + # Do the lexemes match the reading? + my $ambig = 0; + foreach my $r ( $tf->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next if $l->is_disambiguated; + # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 102, "Found 102 ambiguous forms as expected" ); + + # Try setting the normal form of a reading and re-analyzing + my $mr = $tf->collation->reading('99,2'); + is( $mr->text, 'minspire', "Picked correct test reading" ); + is( $mr->language, 'French', "Reading has correct language setting" ); + $mr->normal_form( "m'inspire" ); + $mr->lemmatize; + is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" ); +} + +=end testing + =cut sub lemmatize { @@ -48,102 +104,28 @@ sub lemmatize { foreach my $sig ( keys %witness_paths ) { # Get the text as a sequence of readings and as a string print STDERR "Morphologizing witness $sig\n"; - my @path = @{$witness_paths{$sig}}; - my $tagresult = _treetag_string( $c->_text_from_path( @path ) ); - if( $tagresult ) { - # Map the tagged words onto the original readings, splitting - # them up into lexemes where necessary. - # NOTE we can have multiple lexemes in a reading, but not - # multiple readings to a lexeme. - my @tags = split( /\n/, $tagresult ); - my @lexemes; - my $curr_rdg = shift @path; - my @curr_lexemes; - my $unused_rdg_part; - foreach my $tag ( @tags ) { - # Get the original word - my( $lexeme, @rest ) = split( /\t/, $tag ); - # Lemmatize the whole - my @forms = _parse_wordform( _flemm_lookup( $tag ) ); - my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( - 'string' => $lexeme, 'language' => 'French', - 'wordform_matchlist' => \@forms ); - # Find the next non-meta reading - while( $curr_rdg->is_meta ) { - $curr_rdg = shift @path; - } - unless( $curr_rdg ) { - warn "Ran out of readings in sequence for " . $wit->sigil - . " at $lexeme"; - last; - } - if( $unused_rdg_part && - $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Nth part of curr_rdg - $unused_rdg_part = $2; - push( @curr_lexemes, $lexobj ); - } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Flag an error if there is already an unused reading part. - warn "Skipped over unused text $unused_rdg_part at $curr_rdg" - if $unused_rdg_part; - $unused_rdg_part = $2; # will be empty if the whole reading matched - push( @curr_lexemes, $lexobj ); - } else { - # We do not cope with the idea of a lexeme being - # spread across multiple readings. - warn "Word sequence changed unexpectedly in text"; - # See if we can find a matching reading - my @lookahead; - my $matched; - while( my $nr = shift @path ) { - if( $nr->text =~ /^\Q$lexeme\E/ ) { - $curr_rdg = $lookahead[-1] if @lookahead; - $matched = 1; - last; - } else { - push( @lookahead, $nr ); - } - } - # No match? Restore the state we had - unless( $matched ) { - unshift( @path, @lookahead ); - } - # Trigger a move - $unused_rdg_part = ''; - } - - unless( $unused_rdg_part ) { - # Record the lexemes for the given reading. - #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", - # join( ' ', map { $_->string } @curr_lexemes ), - # $curr_rdg->id, $curr_rdg->text ); - _update_reading_lexemes( $curr_rdg, @curr_lexemes ); - $curr_rdg = shift @path; - @curr_lexemes = (); - } - } - } + _lemmatize_sequence( undef, @{$witness_paths{$sig}} ); } } sub _update_reading_lexemes { - my( $reading, @lexemes ) = @_; - if( $reading->has_lexemes ) { + my( $replace, $reading, @lexemes ) = @_; + if( $reading->has_lexemes && !$replace ) { # We need to merge what is in @lexemes with what we have already. my @oldlex = $reading->lexemes; my $cmp1 = join( '||', map { $_->string } @oldlex ); my $cmp2 = join( '||', map { $_->string } @lexemes ); - if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) { + if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { # The lexeme strings are the same, so merge the possible # word forms from new to old. foreach my $i ( 0 .. $#lexemes ) { my $ol = $oldlex[$i]; my $nl = $lexemes[$i]; my %ofw; - map { $ofw{$_->_stringify} = 1 } $ol->matching_forms; + map { $ofw{$_->to_string} = 1 } $ol->matching_forms; foreach my $form ( $nl->matching_forms ) { - unless( $ofw{$form->_stringify} ) { - print STDERR "Adding form " . $form->_stringify . + unless( $ofw{$form->to_string} ) { + print STDERR "Adding form " . $form->to_string . " to lexeme " . $nl->string . " at $reading\n"; $ol->add_matching_form( $form ); $ol->is_disambiguated(0); @@ -151,35 +133,125 @@ sub _update_reading_lexemes { } } } else { - $DB::single = 1; warn "Lexeme layout for $reading changed; replacing the lot"; $reading->clear_lexemes; $reading->add_lexeme( @lexemes ); } } else { + $reading->clear_lexemes if $replace; $reading->add_lexeme( @lexemes ); } } -=head2 word_lookup( $word ) +=head2 reading_lookup( $rdg[, $rdg, ...] ) -Looks up a word using the Flemm package, and returns the possible results. -It is better to use L for context sensitivity. +Looks up one or more readings using the Flemm package, and returns the +possible results. This uses the same logic as L above for the +entire tradition, but can also be used to (re-)analyze individual readings. =cut -sub word_lookup { - my $word = shift; - my $tagresult = _treetag_string( $word ); - my $lemmatizer; - try { - load 'Flemm'; - $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); - } catch { - warn "Cannot do French word lemmatization without Flemm: @_"; - return; +sub reading_lookup { + return _lemmatize_sequence( 1, @_ ); +} + +sub _lemmatize_sequence { + my( $replace, @path ) = @_; + $DB::single = 1 if $replace; + my $tagresult = _treetag_string( _text_from_path( 1, @path ) ); + if( $tagresult ) { + # Map the tagged words onto the original readings, splitting + # them up into lexemes where necessary. + # NOTE we can have multiple lexemes in a reading, but not + # multiple readings to a lexeme. + my @tags = split( /\n/, $tagresult ); + my @lexemes; + my $curr_rdg = shift @path; + my @curr_lexemes; + my $unused_rdg_part; + foreach my $tag ( @tags ) { + # Get the original word + my( $lexeme, @rest ) = split( /\t/, $tag ); + # Lemmatize the whole + my @forms = _parse_wordform( _flemm_lookup( $tag ) ); + my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( + 'string' => $lexeme, 'language' => 'French', + 'wordform_matchlist' => \@forms ); + # Find the next non-meta reading + while( $curr_rdg && $curr_rdg->is_meta ) { + $curr_rdg = shift @path; + } + unless( $curr_rdg ) { + warn "Ran out of readings in sequence for " . $wit->sigil + . " at $lexeme"; + last; + } + my $curr_rdg_text = $curr_rdg->has_normal_form + ? $curr_rdg->normal_form : $curr_rdg->text; + if( $unused_rdg_part && + $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Nth part of curr_rdg + $unused_rdg_part = $2; + push( @curr_lexemes, $lexobj ); + } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Flag an error if there is already an unused reading part. + warn "Skipped over unused text $unused_rdg_part at $curr_rdg" + if $unused_rdg_part; + $unused_rdg_part = $2; # will be empty if the whole reading matched + push( @curr_lexemes, $lexobj ); + } else { + # We do not cope with the idea of a lexeme being + # spread across multiple readings. + warn "Word sequence changed unexpectedly in text"; + # See if we can find a matching reading + my @lookahead; + my $matched; + while( my $nr = shift @path ) { + my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; + if( $nrtext =~ /^\Q$lexeme\E/ ) { + $curr_rdg = $lookahead[-1] if @lookahead; + $matched = 1; + last; + } else { + push( @lookahead, $nr ); + } + } + # No match? Restore the state we had + unless( $matched ) { + unshift( @path, @lookahead ); + } + # Trigger a move + $unused_rdg_part = ''; + } + + unless( $unused_rdg_part ) { + # Record the lexemes for the given reading. + #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", + # join( ' ', map { $_->string } @curr_lexemes ), + # $curr_rdg->id, $curr_rdg->text ); + _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes ); + $curr_rdg = shift @path; + @curr_lexemes = (); + } + } + } +} + +# Utility function so that we can cheat and use it when we need both the path +# and its text. +sub _text_from_path { + my( $normalize, @path ) = @_; + my $pathtext = ''; + my $last; + foreach my $r ( @path ) { + unless ( $r->join_prior || !$last || $last->join_next ) { + $pathtext .= ' '; + } + $pathtext .= ( $normalize && $r->has_normal_form ) + ? $r->normal_form : $r->text; + $last = $r; } - return _parse_wordform( _flemm_lookup( $tagresult ) ); + return $pathtext; } # Utility function that actually calls the tree tagger. @@ -229,18 +301,37 @@ sub _parse_wordform { foreach ( @results ) { my( $orig, $tag, $lemma ) = split( /\t/, $_ ); my( $pos, $morph ) = split( /:/, $tag ); - my $wf = Text::Tradition::Collation::Reading::WordForm->new( - 'language' => 'French', - 'lemma' => $lemma, - 'morphology' => [ split( //, $morph ) ], - ); - push( @forms, $wf ); + my $morphobj; + if( $morph ) { + $morphobj = Lingua::TagSet::Multext->tag2structure( $morph ); + } else { + # Use the TreeTagger info if there is no Flemm morphology. + $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos ); + } + if( $morphobj ) { + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'language' => 'French', + 'lemma' => $lemma, + 'morphology' => $morphobj, + ); + push( @forms, $wf ); + } else { + warn "No morphology found for word: $_"; + } } return @forms; } 1; +=head2 TODO + +=over + +=item * Handle package dependencies more gracefully + +=back + =head1 LICENSE This package is free software and is provided "as is" without express diff --git a/t/text_tradition_language_french.t b/t/text_tradition_language_french.t new file mode 100644 index 0000000..01b206b --- /dev/null +++ b/t/text_tradition_language_french.t @@ -0,0 +1,60 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +binmode STDOUT, ':utf8'; +use Text::Tradition; +use_ok( 'Text::Tradition::Language::French' ); + +eval "use Flemm"; +my $err = $@; + +SKIP: { + skip "Package Flemm not found" if $err; + my $tf = Text::Tradition->new( + 'input' => 'Self', + 'file' => 't/data/besoin.xml', + 'language' => 'French' ); + + is( $tf->language, 'French', "Set language okay" ); + $tf->lemmatize(); + # Test the lemmatization. How many readings now have morphological info? + # Do the lexemes match the reading? + my $ambig = 0; + foreach my $r ( $tf->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next if $l->is_disambiguated; + # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 102, "Found 102 ambiguous forms as expected" ); + + # Try setting the normal form of a reading and re-analyzing + my $mr = $tf->collation->reading('99,2'); + is( $mr->text, 'minspire', "Picked correct test reading" ); + is( $mr->language, 'French', "Reading has correct language setting" ); + $mr->normal_form( "m'inspire" ); + $mr->lemmatize; + is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" ); +} +} + + + + +1;