From: Tara L Andrews Date: Mon, 7 May 2012 14:16:58 +0000 (+0200) Subject: refactor English/French shared TT logic into Base.pm X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=e0f6836abec58dd0dd896a130c7587368880d255 refactor English/French shared TT logic into Base.pm --- diff --git a/Makefile.PL b/Makefile.PL index 949a95c..ceb652d 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -36,4 +36,7 @@ requires( 'Lingua::TagSet::Multext' ); requires( 'Lingua::TagSet::TreeTagger' ); requires( 'Lingua::Features::Structure' ); build_requires( 'Test::Warn' ); +# Modules needed for morphology but not trivially CPANnable +recommends( 'Lingua::TreeTagger' ); +recommends( 'Flemm' ); &WriteAll; diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm new file mode 100644 index 0000000..797c085 --- /dev/null +++ b/lib/Text/Tradition/Language/Base.pm @@ -0,0 +1,278 @@ +package Text::Tradition::Language::Base; + +use strict; +use warnings; +use Encode qw/ encode_utf8 decode_utf8 /; +use Exporter 'import'; +use vars qw/ @EXPORT_OK /; +use IPC::Run qw/ run /; +use Lingua::TagSet::Multext; +use Lingua::TagSet::TreeTagger; +use Module::Load; +use Text::Tradition::Collation::Reading::Lexeme; +use Text::Tradition::Collation::Reading::WordForm; +use TryCatch; + +@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger /; + +=head1 NAME + +Text::Tradition::Language::Base - Base subroutines for lemmatization of words + +=head1 DESCRIPTION + +Common routines for applying morphological tagging to a Text::Tradition. Used +with callbacks from the named language packages. + +=head1 SUBROUTINES + +=head2 lemmatize_treetagger( $tradition ) + +Evaluates the tradition with the given options, and returns the results. + +=cut + +sub lemmatize_treetagger { + my( $tradition, %opts ) = @_; + + # Given a tradition, lemmatize it witness by witness and see what we get. + my $c = $tradition->collation; + # First, clear out all existing lexemes from the readings. + my %witness_paths = _clear_reading_lexemes( $tradition ); + + foreach my $sig ( keys %witness_paths ) { + # Get the text as a sequence of readings and as a string + print STDERR "Morphologizing witness $sig\n"; + my %witopts = ( + 'path' => $witness_paths{$sig}, + %opts + ); + _lemmatize_treetagger_sequence( %witopts ); + } +} + +sub _clear_reading_lexemes { + my $tradition = shift; + my $c = $tradition->collation; + # Clear out all existing lexemes from the readings. Save the path as long + # as we went to the trouble of generating it. + my %witness_paths; + foreach my $wit ( $tradition->witnesses ) { + my @sigla = ( $wit->sigil ); + push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; + foreach my $sig ( @sigla ) { + my @path = grep { !$_->is_meta } + $c->reading_sequence( $c->start, $c->end, $sig ); + map { $_->clear_lexemes } @path; + $witness_paths{$sig} = \@path; + } + } + return %witness_paths; +} + +=head2 reading_lookup( $rdg[, $rdg, ...] ) + +Looks up one or more readings using the Flemm package, and returns the +possible results. This uses the same logic as L above for the +entire tradition, but can also be used to (re-)analyze individual readings. + +=cut + +sub reading_lookup_treetagger { + my %opts = @_; + $opts{'replace'} = 1; + return _lemmatize_treetagger_sequence( %opts ); +} + +sub _lemmatize_treetagger_sequence { + my %opts = @_; + my @path = @{$opts{'path'}}; + my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} ); + if( $tagresult ) { + # Map the tagged words onto the original readings, splitting + # them up into lexemes where necessary. + # NOTE we can have multiple lexemes in a reading, but not + # multiple readings to a lexeme. + my @tags = split( /\n/, $tagresult ); + my @lexemes; + my $curr_rdg = shift @path; + my @curr_lexemes; + my $unused_rdg_part; + foreach my $tag ( @tags ) { + # Get the original word + my( $lexeme, @rest ) = split( /\t/, $tag ); + # Lemmatize the whole + # TODO error trap this + my @forms = $opts{'callback'}( $tag ); + + my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( + 'string' => $lexeme, 'language' => $opts{'language'}, + 'wordform_matchlist' => \@forms ); + # Find the next non-meta reading + while( $curr_rdg && $curr_rdg->is_meta ) { + $curr_rdg = shift @path; + } + unless( $curr_rdg ) { + warn "Ran out of readings in sequence at $lexeme"; + last; + } + my $curr_rdg_text = $curr_rdg->has_normal_form + ? $curr_rdg->normal_form : $curr_rdg->text; + if( $unused_rdg_part && + $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Nth part of curr_rdg + $unused_rdg_part = $2; + push( @curr_lexemes, $lexobj ); + } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Flag an error if there is already an unused reading part. + warn "Skipped over unused text $unused_rdg_part at $curr_rdg" + if $unused_rdg_part; + $unused_rdg_part = $2; # will be empty if the whole reading matched + push( @curr_lexemes, $lexobj ); + } else { + # We do not cope with the idea of a lexeme being + # spread across multiple readings. + warn "Word sequence changed unexpectedly in text"; + # See if we can find a matching reading + my @lookahead; + my $matched; + while( my $nr = shift @path ) { + my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; + if( $nrtext =~ /^\Q$lexeme\E/ ) { + $curr_rdg = $lookahead[-1] if @lookahead; + $matched = 1; + last; + } else { + push( @lookahead, $nr ); + } + } + # No match? Restore the state we had + unless( $matched ) { + unshift( @path, @lookahead ); + } + # Trigger a move + $unused_rdg_part = ''; + } + + unless( $unused_rdg_part ) { + # Record the lexemes for the given reading. + #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", + # join( ' ', map { $_->string } @curr_lexemes ), + # $curr_rdg->id, $curr_rdg->text ); + _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes ); + $curr_rdg = shift @path; + @curr_lexemes = (); + } + } + } +} + +sub _update_reading_lexemes { + my( $replace, $reading, @lexemes ) = @_; + if( $reading->has_lexemes && !$replace ) { + # We need to merge what is in @lexemes with what we have already. + my @oldlex = $reading->lexemes; + my $cmp1 = join( '||', map { $_->string } @oldlex ); + my $cmp2 = join( '||', map { $_->string } @lexemes ); + if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { + # The lexeme strings are the same, so merge the possible + # word forms from new to old. + foreach my $i ( 0 .. $#lexemes ) { + my $ol = $oldlex[$i]; + my $nl = $lexemes[$i]; + my %ofw; + map { $ofw{$_->to_string} = 1 } $ol->matching_forms; + foreach my $form ( $nl->matching_forms ) { + unless( $ofw{$form->to_string} ) { + print STDERR "Adding form " . $form->to_string . + " to lexeme " . $nl->string . " at $reading\n"; + $ol->add_matching_form( $form ); + $ol->is_disambiguated(0); + } + } + } + } else { + warn "Lexeme layout for $reading changed; replacing the lot"; + $reading->clear_lexemes; + $reading->add_lexeme( @lexemes ); + } + } else { + $reading->clear_lexemes if $replace; + $reading->add_lexeme( @lexemes ); + } +} + +# Utility function so that we can cheat and use it when we need both the path +# and its text. +sub _text_from_path { + my( $normalize, @path ) = @_; + my $pathtext = ''; + my $last; + foreach my $r ( @path ) { + unless ( $r->join_prior || !$last || $last->join_next ) { + $pathtext .= ' '; + } + $pathtext .= ( $normalize && $r->has_normal_form ) + ? $r->normal_form : $r->text; + $last = $r; + } + return $pathtext; +} + +# Utility function that actually calls the tree tagger. +sub _treetag_string { + my( $text, $lang ) = @_; + my $wittext = encode_utf8( $text ); + # Then see if we have TreeTagger + try { + load( 'Lingua::TreeTagger' ); + } catch { + warn "Cannot run TreeTagger without Lingua::TreeTagger module"; + return ''; + } + # OK, we can run it then. + # First upgrade to UTF8 for necessary languages. + $lang = lc( $lang ); + my $use_utf8; + my @utf8_supported = qw/ french /; + if( grep { $_ eq $lang } @utf8_supported ) { + $lang .= '-utf8'; + $use_utf8 = 1; + } + # Now instantiate and run the tagger. + my $tagger = Lingua::TreeTagger->new( + 'language' => $lang, + 'options' => [ qw/ -token -lemma / ], + ); + if( $use_utf8 ) { + $text = encode_utf8( $text ); + } + my $tagresult = $tagger->tag_text( \$text ); + + # TODO maybe send the tokens back rather than the interpreted string... + if( $use_utf8 ) { + return decode_utf8( $tagresult->as_text() ); + } + return $tagresult->as_text(); +} + + +1; + +=head2 TODO + +=over + +=item * Handle package dependencies more gracefully + +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/lib/Text/Tradition/Language/English.pm b/lib/Text/Tradition/Language/English.pm index 0ea6858..9d25c55 100644 --- a/lib/Text/Tradition/Language/English.pm +++ b/lib/Text/Tradition/Language/English.pm @@ -2,16 +2,9 @@ package Text::Tradition::Language::English; use strict; use warnings; -use Encode qw/ encode_utf8 decode_utf8 /; -use IPC::Run qw/ run /; -use Lingua::TagSet::TreeTagger; -use Module::Load; -use Text::Tradition::Collation::Reading::Lexeme; -use Text::Tradition::Collation::Reading::WordForm; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /; use TryCatch; -my $MORPHDIR = '/Users/tla/Projects/morphology'; - =head1 NAME Text::Tradition::Language::English - language-specific module for English @@ -41,64 +34,11 @@ use_ok( 'Text::Tradition::Language::English' ); sub lemmatize { my $tradition = shift; - - # Given a tradition, lemmatize it witness by witness and see what we get. - my $workdir = File::Temp->newdir(); - my $c = $tradition->collation; - # First, clear out all existing lexemes from the readings. Save the - # path as long as we went to the trouble of generating it. - my %witness_paths; - foreach my $wit ( $tradition->witnesses ) { - my @sigla = ( $wit->sigil ); - push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; - foreach my $sig ( @sigla ) { - my @path = grep { !$_->is_meta } - $c->reading_sequence( $c->start, $c->end, $sig ); - map { $_->clear_lexemes } @path; - $witness_paths{$sig} = \@path; - } - } - - foreach my $sig ( keys %witness_paths ) { - # Get the text as a sequence of readings and as a string - print STDERR "Morphologizing witness $sig\n"; - _lemmatize_sequence( undef, @{$witness_paths{$sig}} ); - } -} - -sub _update_reading_lexemes { - my( $replace, $reading, @lexemes ) = @_; - if( $reading->has_lexemes && !$replace ) { - # We need to merge what is in @lexemes with what we have already. - my @oldlex = $reading->lexemes; - my $cmp1 = join( '||', map { $_->string } @oldlex ); - my $cmp2 = join( '||', map { $_->string } @lexemes ); - if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { - # The lexeme strings are the same, so merge the possible - # word forms from new to old. - foreach my $i ( 0 .. $#lexemes ) { - my $ol = $oldlex[$i]; - my $nl = $lexemes[$i]; - my %ofw; - map { $ofw{$_->to_string} = 1 } $ol->matching_forms; - foreach my $form ( $nl->matching_forms ) { - unless( $ofw{$form->to_string} ) { - print STDERR "Adding form " . $form->to_string . - " to lexeme " . $nl->string . " at $reading\n"; - $ol->add_matching_form( $form ); - $ol->is_disambiguated(0); - } - } - } - } else { - warn "Lexeme layout for $reading changed; replacing the lot"; - $reading->clear_lexemes; - $reading->add_lexeme( @lexemes ); - } - } else { - $reading->clear_lexemes if $replace; - $reading->add_lexeme( @lexemes ); - } + my %opts = ( + 'language' => 'French', + 'callback' => sub { _parse_wordform( @_ ) } + ); + return lemmatize_treetagger( $tradition, %opts ); } =head2 reading_lookup( $rdg[, $rdg, ...] ) @@ -110,122 +50,13 @@ entire tradition, but can also be used to (re-)analyze individual readings. =cut sub reading_lookup { - return _lemmatize_sequence( 1, @_ ); -} - -sub _lemmatize_sequence { - my( $replace, @path ) = @_; - my $tagresult = _treetag_string( _text_from_path( 1, @path ) ); - if( $tagresult ) { - # Map the tagged words onto the original readings, splitting - # them up into lexemes where necessary. - # NOTE we can have multiple lexemes in a reading, but not - # multiple readings to a lexeme. - my @tags = split( /\n/, $tagresult ); - my @lexemes; - my $curr_rdg = shift @path; - my @curr_lexemes; - my $unused_rdg_part; - foreach my $tag ( @tags ) { - # Get the original word - my( $lexeme, @rest ) = split( /\t/, $tag ); - # Lemmatize the whole - my @forms = _parse_wordform( $tag ); - my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( - 'string' => $lexeme, 'language' => 'English', - 'wordform_matchlist' => \@forms ); - # Find the next non-meta reading - while( $curr_rdg && $curr_rdg->is_meta ) { - $curr_rdg = shift @path; - } - unless( $curr_rdg ) { - warn "Ran out of readings in sequence at $lexeme"; - last; - } - my $curr_rdg_text = $curr_rdg->has_normal_form - ? $curr_rdg->normal_form : $curr_rdg->text; - if( $unused_rdg_part && - $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Nth part of curr_rdg - $unused_rdg_part = $2; - push( @curr_lexemes, $lexobj ); - } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Flag an error if there is already an unused reading part. - warn "Skipped over unused text $unused_rdg_part at $curr_rdg" - if $unused_rdg_part; - $unused_rdg_part = $2; # will be empty if the whole reading matched - push( @curr_lexemes, $lexobj ); - } else { - # We do not cope with the idea of a lexeme being - # spread across multiple readings. - warn "Word sequence changed unexpectedly in text"; - # See if we can find a matching reading - my @lookahead; - my $matched; - while( my $nr = shift @path ) { - my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; - if( $nrtext =~ /^\Q$lexeme\E/ ) { - $curr_rdg = $lookahead[-1] if @lookahead; - $matched = 1; - last; - } else { - push( @lookahead, $nr ); - } - } - # No match? Restore the state we had - unless( $matched ) { - unshift( @path, @lookahead ); - } - # Trigger a move - $unused_rdg_part = ''; - } - - unless( $unused_rdg_part ) { - # Record the lexemes for the given reading. - #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", - # join( ' ', map { $_->string } @curr_lexemes ), - # $curr_rdg->id, $curr_rdg->text ); - _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes ); - $curr_rdg = shift @path; - @curr_lexemes = (); - } - } - } -} - -# Utility function so that we can cheat and use it when we need both the path -# and its text. -sub _text_from_path { - my( $normalize, @path ) = @_; - my $pathtext = ''; - my $last; - foreach my $r ( @path ) { - unless ( $r->join_prior || !$last || $last->join_next ) { - $pathtext .= ' '; - } - $pathtext .= ( $normalize && $r->has_normal_form ) - ? $r->normal_form : $r->text; - $last = $r; - } - return $pathtext; -} - -# Utility function that actually calls the tree tagger. -sub _treetag_string { - my( $text ) = @_; - my $wittext = encode_utf8( $text ); - # Then see if we have TreeTagger - my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english"; - unless( -f $taggercmd ) { - warn "Cannot do English word lemmatization without TreeTagger"; - return; - } - # OK, we can run it then. - my @cmd = ( $taggercmd ); - my( $tagresult, $err ); # Capture the output and error - run( \@cmd, \$wittext, \$tagresult, \$err ); - # TODO check for error - return decode_utf8( $tagresult ); + my( @path ) = @_; + my %opts = ( + 'language' => 'French', + 'callback' => sub { _parse_wordform( @_ ) }, + 'path' => \@path, + ); + return reading_lookup_treetagger( %opts ); } # Utility function to turn a TreeTagger result into a WordForm @@ -250,9 +81,7 @@ sub _parse_wordform { =over -=item * Handle package dependencies more gracefully - -=item * Refactor English/French use of TreeTagger into its own util package +=item * Tests! =back diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm index 6847354..7453044 100644 --- a/lib/Text/Tradition/Language/French.pm +++ b/lib/Text/Tradition/Language/French.pm @@ -2,17 +2,10 @@ package Text::Tradition::Language::French; use strict; use warnings; -use Encode qw/ encode_utf8 decode_utf8 /; -use IPC::Run qw/ run /; -use Lingua::TagSet::Multext; -use Lingua::TagSet::TreeTagger; -use Module::Load; -use Text::Tradition::Collation::Reading::Lexeme; -use Text::Tradition::Collation::Reading::WordForm; +use Module::Load qw/ load /; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /; use TryCatch; -my $MORPHDIR = '/Users/tla/Projects/morphology'; - =head1 NAME Text::Tradition::Language::French - language-specific module for French @@ -71,7 +64,7 @@ SKIP: { is( $ambig, 102, "Found 102 ambiguous forms as expected" ); # Try setting the normal form of a reading and re-analyzing - my $mr = $tf->collation->reading('99,2'); + my $mr = $tf->collation->reading('r99.2'); is( $mr->text, 'minspire', "Picked correct test reading" ); is( $mr->language, 'French', "Reading has correct language setting" ); $mr->normal_form( "m'inspire" ); @@ -85,64 +78,11 @@ SKIP: { sub lemmatize { my $tradition = shift; - - # Given a tradition, lemmatize it witness by witness and see what we get. - my $workdir = File::Temp->newdir(); - my $c = $tradition->collation; - # First, clear out all existing lexemes from the readings. Save the - # path as long as we went to the trouble of generating it. - my %witness_paths; - foreach my $wit ( $tradition->witnesses ) { - my @sigla = ( $wit->sigil ); - push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; - foreach my $sig ( @sigla ) { - my @path = grep { !$_->is_meta } - $c->reading_sequence( $c->start, $c->end, $sig ); - map { $_->clear_lexemes } @path; - $witness_paths{$sig} = \@path; - } - } - - foreach my $sig ( keys %witness_paths ) { - # Get the text as a sequence of readings and as a string - print STDERR "Morphologizing witness $sig\n"; - _lemmatize_sequence( undef, @{$witness_paths{$sig}} ); - } -} - -sub _update_reading_lexemes { - my( $replace, $reading, @lexemes ) = @_; - if( $reading->has_lexemes && !$replace ) { - # We need to merge what is in @lexemes with what we have already. - my @oldlex = $reading->lexemes; - my $cmp1 = join( '||', map { $_->string } @oldlex ); - my $cmp2 = join( '||', map { $_->string } @lexemes ); - if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { - # The lexeme strings are the same, so merge the possible - # word forms from new to old. - foreach my $i ( 0 .. $#lexemes ) { - my $ol = $oldlex[$i]; - my $nl = $lexemes[$i]; - my %ofw; - map { $ofw{$_->to_string} = 1 } $ol->matching_forms; - foreach my $form ( $nl->matching_forms ) { - unless( $ofw{$form->to_string} ) { - print STDERR "Adding form " . $form->to_string . - " to lexeme " . $nl->string . " at $reading\n"; - $ol->add_matching_form( $form ); - $ol->is_disambiguated(0); - } - } - } - } else { - warn "Lexeme layout for $reading changed; replacing the lot"; - $reading->clear_lexemes; - $reading->add_lexeme( @lexemes ); - } - } else { - $reading->clear_lexemes if $replace; - $reading->add_lexeme( @lexemes ); - } + my %opts = ( + 'language' => 'French', + 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) } + ); + return lemmatize_treetagger( $tradition, %opts ); } =head2 reading_lookup( $rdg[, $rdg, ...] ) @@ -154,122 +94,13 @@ entire tradition, but can also be used to (re-)analyze individual readings. =cut sub reading_lookup { - return _lemmatize_sequence( 1, @_ ); -} - -sub _lemmatize_sequence { - my( $replace, @path ) = @_; - my $tagresult = _treetag_string( _text_from_path( 1, @path ) ); - if( $tagresult ) { - # Map the tagged words onto the original readings, splitting - # them up into lexemes where necessary. - # NOTE we can have multiple lexemes in a reading, but not - # multiple readings to a lexeme. - my @tags = split( /\n/, $tagresult ); - my @lexemes; - my $curr_rdg = shift @path; - my @curr_lexemes; - my $unused_rdg_part; - foreach my $tag ( @tags ) { - # Get the original word - my( $lexeme, @rest ) = split( /\t/, $tag ); - # Lemmatize the whole - my @forms = _parse_wordform( _flemm_lookup( $tag ) ); - my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( - 'string' => $lexeme, 'language' => 'French', - 'wordform_matchlist' => \@forms ); - # Find the next non-meta reading - while( $curr_rdg && $curr_rdg->is_meta ) { - $curr_rdg = shift @path; - } - unless( $curr_rdg ) { - warn "Ran out of readings in sequence at $lexeme"; - last; - } - my $curr_rdg_text = $curr_rdg->has_normal_form - ? $curr_rdg->normal_form : $curr_rdg->text; - if( $unused_rdg_part && - $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Nth part of curr_rdg - $unused_rdg_part = $2; - push( @curr_lexemes, $lexobj ); - } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Flag an error if there is already an unused reading part. - warn "Skipped over unused text $unused_rdg_part at $curr_rdg" - if $unused_rdg_part; - $unused_rdg_part = $2; # will be empty if the whole reading matched - push( @curr_lexemes, $lexobj ); - } else { - # We do not cope with the idea of a lexeme being - # spread across multiple readings. - warn "Word sequence changed unexpectedly in text"; - # See if we can find a matching reading - my @lookahead; - my $matched; - while( my $nr = shift @path ) { - my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; - if( $nrtext =~ /^\Q$lexeme\E/ ) { - $curr_rdg = $lookahead[-1] if @lookahead; - $matched = 1; - last; - } else { - push( @lookahead, $nr ); - } - } - # No match? Restore the state we had - unless( $matched ) { - unshift( @path, @lookahead ); - } - # Trigger a move - $unused_rdg_part = ''; - } - - unless( $unused_rdg_part ) { - # Record the lexemes for the given reading. - #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", - # join( ' ', map { $_->string } @curr_lexemes ), - # $curr_rdg->id, $curr_rdg->text ); - _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes ); - $curr_rdg = shift @path; - @curr_lexemes = (); - } - } - } -} - -# Utility function so that we can cheat and use it when we need both the path -# and its text. -sub _text_from_path { - my( $normalize, @path ) = @_; - my $pathtext = ''; - my $last; - foreach my $r ( @path ) { - unless ( $r->join_prior || !$last || $last->join_next ) { - $pathtext .= ' '; - } - $pathtext .= ( $normalize && $r->has_normal_form ) - ? $r->normal_form : $r->text; - $last = $r; - } - return $pathtext; -} - -# Utility function that actually calls the tree tagger. -sub _treetag_string { - my( $text ) = @_; - my $wittext = encode_utf8( $text ); - # Then see if we have TreeTagger - my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8"; - unless( -f $taggercmd ) { - warn "Cannot do French word lemmatization without TreeTagger"; - return; - } - # OK, we can run it then. - my @cmd = ( $taggercmd ); - my( $tagresult, $err ); # Capture the output and error - run( \@cmd, \$wittext, \$tagresult, \$err ); - # TODO check for error - return decode_utf8( $tagresult ); + my( @path ) = @_; + my %opts = ( + 'language' => 'French', + 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }, + 'path' => \@path, + ); + return reading_lookup_treetagger( %opts ); } # Closure and utility function for the package lemmatizer @@ -328,7 +159,7 @@ sub _parse_wordform { =over -=item * Handle package dependencies more gracefully +=item * Try to do more things with Perl objects in Flemm and TT =back diff --git a/t/text_tradition_language_french.t b/t/text_tradition_language_french.t index 01b206b..84e9d02 100644 --- a/t/text_tradition_language_french.t +++ b/t/text_tradition_language_french.t @@ -45,7 +45,7 @@ SKIP: { is( $ambig, 102, "Found 102 ambiguous forms as expected" ); # Try setting the normal form of a reading and re-analyzing - my $mr = $tf->collation->reading('99,2'); + my $mr = $tf->collation->reading('r99.2'); is( $mr->text, 'minspire', "Picked correct test reading" ); is( $mr->language, 'French', "Reading has correct language setting" ); $mr->normal_form( "m'inspire" );