From: Tara L Andrews Date: Wed, 25 Apr 2012 12:40:23 +0000 (+0200) Subject: make French morph tagging work; dependent on Flemm and TreeTagger X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=d3e7842a9402304b1b701c2a72db001b324f1f2f make French morph tagging work; dependent on Flemm and TreeTagger --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 20ba383..62b50f4 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -356,6 +356,17 @@ sub add_stemma { return $stemma; } +sub lemmatize { + my $self = shift; + unless( $self->has_language ) { + warn "Please set a language to lemmatize a tradition"; + return; + } + my $mod = "Text::Tradition::Language::" . $self->language; + load( $mod ); + $mod->can( 'lemmatize' )->( $self ); +} + no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 34151eb..7b26135 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -1386,6 +1386,13 @@ sub path_text { $start = $self->start unless $start; $end = $self->end unless $end; my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit ); + return _text_from_path( @path ); +} + +# Utility function so that we can cheat and use it when we need both the path +# and its text. +sub _text_from_path { + my( $self, @path ) = @_; my $pathtext = ''; my $last; foreach my $r ( @path ) { diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 47bd0f7..11ec36e 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -139,15 +139,16 @@ has 'normal_form' => ( # Holds the word form. If is_disambiguated is true, the form at index zero # is the correct one. -has 'lexemes' => ( +has 'reading_lexemes' => ( traits => ['Array'], - isa => 'ArrayRef[Text::Tradition::Collation::Lexeme]', + isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]', handles => { lexemes => 'elements', has_lexemes => 'count', - _clear_lexemes => 'clear', - _add_lexeme => 'push', + clear_lexemes => 'clear', + add_lexeme => 'push', }, + default => sub { [] }, ); ## For prefix/suffix readings diff --git a/lib/Text/Tradition/Collation/Reading/Lexeme.pm b/lib/Text/Tradition/Collation/Reading/Lexeme.pm index 4ea9bf3..ca4802c 100644 --- a/lib/Text/Tradition/Collation/Reading/Lexeme.pm +++ b/lib/Text/Tradition/Collation/Reading/Lexeme.pm @@ -76,13 +76,13 @@ has 'wordform_matchlist' => ( 'matching_forms' => 'elements', 'matching_form' => 'get', 'add_matching_form' => 'push', + }, ); has 'is_disambiguated' => ( - is => 'ro', + is => 'rw', isa => 'Bool', default => undef, - writer => '_set_disambiguated', ); has 'form' => ( @@ -91,6 +91,14 @@ has 'form' => ( writer => '_set_form', ); +# Do auto-disambiguation if we were created with a single wordform +sub BUILD { + my $self = shift; + + if( $self->matches == 1 ) { + $self->disambiguate( 0 ); + } +} =head2 disambiguate( $index ) @@ -105,7 +113,7 @@ sub disambiguate { throw( "There is no candidate wordform at index $idx" ) unless $form; $self->_set_form( $form ); - $self->_set_disambiguated( 1 ); + $self->is_disambiguated( 1 ); } =head2 lookup diff --git a/lib/Text/Tradition/Collation/Reading/WordForm.pm b/lib/Text/Tradition/Collation/Reading/WordForm.pm index a56cd2b..8f519f9 100644 --- a/lib/Text/Tradition/Collation/Reading/WordForm.pm +++ b/lib/Text/Tradition/Collation/Reading/WordForm.pm @@ -75,7 +75,7 @@ around BUILDARGS => sub { sub _stringify { my $self = shift; return sprintf( "%s//%s//%s", $self->language, $self->lemma, - join( '', $self->morphology ) ); + join( '|', @{$self->morphology} ) ); } no Moose; diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm index d103bc0..8028c98 100644 --- a/lib/Text/Tradition/Language/French.pm +++ b/lib/Text/Tradition/Language/French.pm @@ -1,7 +1,13 @@ package Text::Tradition::Language::French; -#use Flemm; +use Encode qw/ encode_utf8 decode_utf8 /; +use IPC::Run qw/ run binary /; +use Module::Load; +use Text::Tradition::Collation::Reading::Lexeme; use Text::Tradition::Collation::Reading::WordForm; +use TryCatch; + +my $MORPHDIR = '/Users/tla/Projects/morphology'; =head1 NAME @@ -20,9 +26,139 @@ Evaluates the string using the Flemm package, and returns the results. =cut sub lemmatize { - my $text = shift; - + my $tradition = shift; + + # Given a tradition, lemmatize it witness by witness and see what we get. + my $workdir = File::Temp->newdir(); + my $c = $tradition->collation; + # First, clear out all existing lexemes from the readings. Save the + # path as long as we went to the trouble of generating it. + my %witness_paths; + foreach my $wit ( $tradition->witnesses ) { + my @sigla = ( $wit->sigil ); + push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; + foreach my $sig ( @sigla ) { + my @path = grep { !$_->is_meta } + $c->reading_sequence( $c->start, $c->end, $sig ); + map { $_->clear_lexemes } @path; + $witness_paths{$sig} = \@path; + } + } + foreach my $sig ( keys %witness_paths ) { + # Get the text as a sequence of readings and as a string + print STDERR "Morphologizing witness $sig\n"; + my @path = @{$witness_paths{$sig}}; + my $tagresult = _treetag_string( $c->_text_from_path( @path ) ); + if( $tagresult ) { + # Map the tagged words onto the original readings, splitting + # them up into lexemes where necessary. + # NOTE we can have multiple lexemes in a reading, but not + # multiple readings to a lexeme. + my @tags = split( /\n/, $tagresult ); + my @lexemes; + my $curr_rdg = shift @path; + my @curr_lexemes; + my $unused_rdg_part; + foreach my $tag ( @tags ) { + # Get the original word + my( $lexeme, @rest ) = split( /\t/, $tag ); + # Lemmatize the whole + my @forms = _parse_wordform( _flemm_lookup( $tag ) ); + my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( + 'string' => $lexeme, 'language' => 'French', + 'wordform_matchlist' => \@forms ); + # Find the next non-meta reading + while( $curr_rdg->is_meta ) { + $curr_rdg = shift @path; + } + unless( $curr_rdg ) { + warn "Ran out of readings in sequence for " . $wit->sigil + . " at $lexeme"; + last; + } + if( $unused_rdg_part && + $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Nth part of curr_rdg + $unused_rdg_part = $2; + push( @curr_lexemes, $lexobj ); + } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Flag an error if there is already an unused reading part. + warn "Skipped over unused text $unused_rdg_part at $curr_rdg" + if $unused_rdg_part; + $unused_rdg_part = $2; # will be empty if the whole reading matched + push( @curr_lexemes, $lexobj ); + } else { + # We do not cope with the idea of a lexeme being + # spread across multiple readings. + warn "Word sequence changed unexpectedly in text"; + # See if we can find a matching reading + my @lookahead; + my $matched; + while( my $nr = shift @path ) { + if( $nr->text =~ /^\Q$lexeme\E/ ) { + $curr_rdg = $lookahead[-1] if @lookahead; + $matched = 1; + last; + } else { + push( @lookahead, $nr ); + } + } + # No match? Restore the state we had + unless( $matched ) { + unshift( @path, @lookahead ); + } + # Trigger a move + $unused_rdg_part = ''; + } + + unless( $unused_rdg_part ) { + # Record the lexemes for the given reading. + #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", + # join( ' ', map { $_->string } @curr_lexemes ), + # $curr_rdg->id, $curr_rdg->text ); + _update_reading_lexemes( $curr_rdg, @curr_lexemes ); + $curr_rdg = shift @path; + @curr_lexemes = (); + } + } + } + } +} + +sub _update_reading_lexemes { + my( $reading, @lexemes ) = @_; + if( $reading->has_lexemes ) { + # We need to merge what is in @lexemes with what we have already. + my @oldlex = $reading->lexemes; + my $cmp1 = join( '||', map { $_->string } @oldlex ); + my $cmp2 = join( '||', map { $_->string } @lexemes ); + if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) { + # The lexeme strings are the same, so merge the possible + # word forms from new to old. + foreach my $i ( 0 .. $#lexemes ) { + my $ol = $oldlex[$i]; + my $nl = $lexemes[$i]; + my %ofw; + map { $ofw{$_->_stringify} = 1 } $ol->matching_forms; + foreach my $form ( $nl->matching_forms ) { + unless( $ofw{$form->_stringify} ) { + print STDERR "Adding form " . $form->_stringify . + " to lexeme " . $nl->string . " at $reading\n"; + $ol->add_matching_form( $form ); + $ol->is_disambiguated(0); + } + } + } + } else { + $DB::single = 1; + warn "Lexeme layout for $reading changed; replacing the lot"; + $reading->clear_lexemes; + $reading->add_lexeme( @lexemes ); + } + } else { + $reading->add_lexeme( @lexemes ); + } } =head2 word_lookup( $word ) @@ -34,9 +170,77 @@ It is better to use L for context sensitivity. sub word_lookup { my $word = shift; + my $tagresult = _treetag_string( $word ); + my $lemmatizer; + try { + load 'Flemm'; + $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); + } catch { + warn "Cannot do French word lemmatization without Flemm: @_"; + return; + } + return _parse_wordform( _flemm_lookup( $tagresult ) ); +} + +# Utility function that actually calls the tree tagger. +sub _treetag_string { + my( $text ) = @_; + my $wittext = encode_utf8( $text ); + # Then see if we have TreeTagger + my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8"; + unless( -f $taggercmd ) { + warn "Cannot do French word lemmatization without TreeTagger"; + return; + } + # OK, we can run it then. + my @cmd = ( $taggercmd ); + my( $tagresult, $err ); # Capture the output and error + run( \@cmd, \$wittext, \$tagresult, \$err ); + # TODO check for error + return decode_utf8( $tagresult ); +} + +# Closure and utility function for the package lemmatizer +{ + my $lemmatizer; + + sub _flemm_lookup { + # First try to load Flemm + unless( $lemmatizer ) { + try { + load 'Flemm'; + $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); + } catch { + warn "Cannot do French word lemmatization without Flemm: @_"; + return; + } + } + return $lemmatizer->lemmatize( @_ ) + } } +# Utility function to turn a Flemm result into a WordForm +sub _parse_wordform { + my $flemmobj = shift; + # For now just parse the string, until we make sense of the documentation. + my @results = split( / \|\| /, $flemmobj->getResult ); + my @forms; + foreach ( @results ) { + my( $orig, $tag, $lemma ) = split( /\t/, $_ ); + my( $pos, $morph ) = split( /:/, $tag ); + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'language' => 'French', + 'lemma' => $lemma, + 'morphology' => [ split( //, $morph ) ], + ); + push( @forms, $wf ); + } + return @forms; +} + +1; + =head1 LICENSE This package is free software and is provided "as is" without express