refactor English/French shared TT logic into Base.pm

diff --git a/Makefile.PL b/Makefile.PL

index 949a95c..ceb652d 100644 (file)
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -36,4 +36,7 @@ requires( 'Lingua::TagSet::Multext' );
 requires( 'Lingua::TagSet::TreeTagger' );
 requires( 'Lingua::Features::Structure' );
 build_requires( 'Test::Warn' );
+# Modules needed for morphology but not trivially CPANnable
+recommends( 'Lingua::TreeTagger' );
+recommends( 'Flemm' );
 &WriteAll;
diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm

new file mode 100644 (file)

index 0000000..797c085
--- /dev/null
+++ b/lib/Text/Tradition/Language/Base.pm
@@ -0,0 +1,278 @@
+package Text::Tradition::Language::Base;
+
+use strict;
+use warnings;
+use Encode qw/ encode_utf8 decode_utf8 /;
+use Exporter 'import';
+use vars qw/ @EXPORT_OK /;
+use IPC::Run qw/ run /;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger;
+use Module::Load;
+use Text::Tradition::Collation::Reading::Lexeme;
+use Text::Tradition::Collation::Reading::WordForm;
+use TryCatch;
+
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger /;
+
+=head1 NAME
+
+Text::Tradition::Language::Base - Base subroutines for lemmatization of words
+
+=head1 DESCRIPTION
+
+Common routines for applying morphological tagging to a Text::Tradition. Used
+with callbacks from the named language packages.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize_treetagger( $tradition )
+
+Evaluates the tradition with the given options, and returns the results.
+
+=cut
+
+sub lemmatize_treetagger {
+       my( $tradition, %opts ) = @_;
+
+       # Given a tradition, lemmatize it witness by witness and see what we get.
+       my $c = $tradition->collation;
+       # First, clear out all existing lexemes from the readings. 
+       my %witness_paths = _clear_reading_lexemes( $tradition );
+       
+       foreach my $sig ( keys %witness_paths ) {
+               # Get the text as a sequence of readings and as a string
+               print STDERR "Morphologizing witness $sig\n";
+               my %witopts = (
+                       'path' => $witness_paths{$sig},
+                       %opts
+                       );
+               _lemmatize_treetagger_sequence( %witopts );
+       }
+}
+
+sub _clear_reading_lexemes {
+       my $tradition = shift;
+               my $c = $tradition->collation;
+       # Clear out all existing lexemes from the readings. Save the path as long 
+       # as we went to the trouble of generating it.
+       my %witness_paths;
+       foreach my $wit ( $tradition->witnesses ) {
+               my @sigla = ( $wit->sigil );
+               push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
+               foreach my $sig ( @sigla ) {
+                       my @path = grep { !$_->is_meta } 
+                               $c->reading_sequence( $c->start, $c->end, $sig );
+                       map { $_->clear_lexemes } @path;
+                       $witness_paths{$sig} = \@path;
+               }
+       }
+       return %witness_paths;
+}
+
+=head2 reading_lookup( $rdg[, $rdg, ...] )
+
+Looks up one or more readings using the Flemm package, and returns the
+possible results.  This uses the same logic as L<lemmatize> above for the
+entire tradition, but can also be used to (re-)analyze individual readings.
+
+=cut
+
+sub reading_lookup_treetagger {
+       my %opts = @_;
+       $opts{'replace'} = 1;
+       return _lemmatize_treetagger_sequence( %opts );
+}
+
+sub _lemmatize_treetagger_sequence {
+       my %opts = @_;
+       my @path = @{$opts{'path'}};
+       my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
+       if( $tagresult ) {
+               # Map the tagged words onto the original readings, splitting 
+               # them up into lexemes where necessary.
+               # NOTE we can have multiple lexemes in a reading, but not
+               # multiple readings to a lexeme.
+               my @tags = split( /\n/, $tagresult );
+               my @lexemes;
+               my $curr_rdg = shift @path;
+               my @curr_lexemes;
+               my $unused_rdg_part;
+               foreach my $tag ( @tags ) {
+                       # Get the original word
+                       my( $lexeme, @rest ) = split( /\t/, $tag );
+                       # Lemmatize the whole
+                       # TODO error trap this
+                       my @forms = $opts{'callback'}( $tag );
+
+                       my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
+                               'string' => $lexeme, 'language' => $opts{'language'},
+                               'wordform_matchlist' => \@forms );
+                       # Find the next non-meta reading
+                       while( $curr_rdg && $curr_rdg->is_meta ) {
+                               $curr_rdg = shift @path;
+                       }
+                       unless( $curr_rdg ) {
+                               warn "Ran out of readings in sequence at $lexeme";
+                               last;
+                       }
+                       my $curr_rdg_text = $curr_rdg->has_normal_form 
+                               ? $curr_rdg->normal_form : $curr_rdg->text;
+                       if( $unused_rdg_part &&
+                               $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+                               # Nth part of curr_rdg
+                               $unused_rdg_part = $2;
+                               push( @curr_lexemes, $lexobj );
+                       } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+                               # Flag an error if there is already an unused reading part.
+                               warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
+                                       if $unused_rdg_part;
+                               $unused_rdg_part = $2; # will be empty if the whole reading matched
+                               push( @curr_lexemes, $lexobj );
+                       } else {
+                               # We do not cope with the idea of a lexeme being 
+                               # spread across multiple readings.
+                               warn "Word sequence changed unexpectedly in text";
+                               # See if we can find a matching reading
+                               my @lookahead;
+                               my $matched;
+                               while( my $nr = shift @path ) {
+                                       my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
+                                       if( $nrtext =~ /^\Q$lexeme\E/ ) {
+                                               $curr_rdg = $lookahead[-1] if @lookahead;
+                                               $matched = 1;
+                                               last;
+                                       } else {
+                                               push( @lookahead, $nr );
+                                       }
+                               }
+                               # No match? Restore the state we had
+                               unless( $matched ) {
+                                       unshift( @path, @lookahead );
+                               }
+                               # Trigger a move
+                               $unused_rdg_part = '';
+                       }
+                       
+                       unless( $unused_rdg_part ) {
+                               # Record the lexemes for the given reading.
+                               #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
+                               #       join( ' ', map { $_->string } @curr_lexemes ),
+                               #       $curr_rdg->id, $curr_rdg->text );
+                               _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
+                               $curr_rdg = shift @path;
+                               @curr_lexemes = ();
+                       }
+               }
+       }
+}
+
+sub _update_reading_lexemes {
+       my( $replace, $reading, @lexemes ) = @_;
+       if( $reading->has_lexemes && !$replace ) {
+               # We need to merge what is in @lexemes with what we have already.
+               my @oldlex = $reading->lexemes;
+               my $cmp1 = join( '||', map { $_->string } @oldlex );
+               my $cmp2 = join( '||', map { $_->string } @lexemes );
+               if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
+                       # The lexeme strings are the same, so merge the possible
+                       # word forms from new to old.
+                       foreach my $i ( 0 .. $#lexemes ) {
+                               my $ol = $oldlex[$i];
+                               my $nl = $lexemes[$i];
+                               my %ofw;
+                               map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
+                               foreach my $form ( $nl->matching_forms ) {
+                                       unless( $ofw{$form->to_string} ) {
+                                               print STDERR "Adding form " . $form->to_string . 
+                                                       " to lexeme " . $nl->string . " at $reading\n";
+                                               $ol->add_matching_form( $form );
+                                               $ol->is_disambiguated(0);
+                                       }
+                               }
+                       }
+               } else {
+                       warn "Lexeme layout for $reading changed; replacing the lot";
+                       $reading->clear_lexemes;
+                       $reading->add_lexeme( @lexemes );
+               }
+       } else {
+               $reading->clear_lexemes if $replace;
+               $reading->add_lexeme( @lexemes );
+       }
+}
+
+# Utility function so that we can cheat and use it when we need both the path
+# and its text.
+sub _text_from_path {
+       my( $normalize, @path ) = @_;
+       my $pathtext = '';
+       my $last;
+       foreach my $r ( @path ) {
+               unless ( $r->join_prior || !$last || $last->join_next ) {
+                       $pathtext .= ' ';
+               } 
+               $pathtext .= ( $normalize && $r->has_normal_form ) 
+                       ? $r->normal_form : $r->text;
+               $last = $r;
+       }
+       return $pathtext;
+}
+
+# Utility function that actually calls the tree tagger.
+sub _treetag_string {
+       my( $text, $lang ) = @_;
+       my $wittext = encode_utf8( $text );
+       # Then see if we have TreeTagger
+       try {
+               load( 'Lingua::TreeTagger' );
+       } catch {
+               warn "Cannot run TreeTagger without Lingua::TreeTagger module";
+               return '';
+       }
+       # OK, we can run it then.
+       # First upgrade to UTF8 for necessary languages.
+       $lang = lc( $lang );
+       my $use_utf8;
+       my @utf8_supported = qw/ french /;
+       if( grep { $_ eq $lang } @utf8_supported ) {
+               $lang .= '-utf8';
+               $use_utf8 = 1;
+       }
+       # Now instantiate and run the tagger.
+       my $tagger = Lingua::TreeTagger->new(
+               'language' => $lang,
+               'options' => [ qw/ -token -lemma / ],
+               );
+       if( $use_utf8 ) {
+               $text = encode_utf8( $text );
+       } 
+       my $tagresult = $tagger->tag_text( \$text );
+       
+       # TODO maybe send the tokens back rather than the interpreted string...
+       if( $use_utf8 ) {
+               return decode_utf8( $tagresult->as_text() );
+       } 
+       return $tagresult->as_text();
+}
+
+
+1;
+
+=head2 TODO
+
+=over
+
+=item * Handle package dependencies more gracefully
+
+=back
+
+=head1 LICENSE
+
+This package is free software and is provided "as is" without express
+or implied warranty.  You can redistribute it and/or modify it under
+the same terms as Perl itself.
+
+=head1 AUTHOR
+
+Tara L Andrews E<lt>aurum@cpan.orgE<gt>
diff --git a/lib/Text/Tradition/Language/English.pm b/lib/Text/Tradition/Language/English.pm

index 0ea6858..9d25c55 100644 (file)
--- a/lib/Text/Tradition/Language/English.pm
+++ b/lib/Text/Tradition/Language/English.pm
@@ -2,16 +2,9 @@ package Text::Tradition::Language::English;
 
 use strict;
 use warnings;
-use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run /;
-use Lingua::TagSet::TreeTagger;
-use Module::Load;
-use Text::Tradition::Collation::Reading::Lexeme;
-use Text::Tradition::Collation::Reading::WordForm;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /;
 use TryCatch;
 
-my $MORPHDIR = '/Users/tla/Projects/morphology';
-
 =head1 NAME
 
 Text::Tradition::Language::English - language-specific module for English
@@ -41,64 +34,11 @@ use_ok( 'Text::Tradition::Language::English' );
 
 sub lemmatize {
        my $tradition = shift;
-
-       # Given a tradition, lemmatize it witness by witness and see what we get.
-       my $workdir = File::Temp->newdir();
-       my $c = $tradition->collation;
-       # First, clear out all existing lexemes from the readings. Save the
-       # path as long as we went to the trouble of generating it.
-       my %witness_paths;
-       foreach my $wit ( $tradition->witnesses ) {
-               my @sigla = ( $wit->sigil );
-               push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
-               foreach my $sig ( @sigla ) {
-                       my @path = grep { !$_->is_meta } 
-                               $c->reading_sequence( $c->start, $c->end, $sig );
-                       map { $_->clear_lexemes } @path;
-                       $witness_paths{$sig} = \@path;
-               }
-       }
-       
-       foreach my $sig ( keys %witness_paths ) {
-               # Get the text as a sequence of readings and as a string
-               print STDERR "Morphologizing witness $sig\n";
-               _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
-       }
-}
-
-sub _update_reading_lexemes {
-       my( $replace, $reading, @lexemes ) = @_;
-       if( $reading->has_lexemes && !$replace ) {
-               # We need to merge what is in @lexemes with what we have already.
-               my @oldlex = $reading->lexemes;
-               my $cmp1 = join( '||', map { $_->string } @oldlex );
-               my $cmp2 = join( '||', map { $_->string } @lexemes );
-               if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
-                       # The lexeme strings are the same, so merge the possible
-                       # word forms from new to old.
-                       foreach my $i ( 0 .. $#lexemes ) {
-                               my $ol = $oldlex[$i];
-                               my $nl = $lexemes[$i];
-                               my %ofw;
-                               map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
-                               foreach my $form ( $nl->matching_forms ) {
-                                       unless( $ofw{$form->to_string} ) {
-                                               print STDERR "Adding form " . $form->to_string . 
-                                                       " to lexeme " . $nl->string . " at $reading\n";
-                                               $ol->add_matching_form( $form );
-                                               $ol->is_disambiguated(0);
-                                       }
-                               }
-                       }
-               } else {
-                       warn "Lexeme layout for $reading changed; replacing the lot";
-                       $reading->clear_lexemes;
-                       $reading->add_lexeme( @lexemes );
-               }
-       } else {
-               $reading->clear_lexemes if $replace;
-               $reading->add_lexeme( @lexemes );
-       }
+       my %opts = ( 
+               'language' => 'French', 
+               'callback' => sub { _parse_wordform( @_ ) } 
+               );
+       return lemmatize_treetagger( $tradition, %opts );
 }
 
 =head2 reading_lookup( $rdg[, $rdg, ...] )
@@ -110,122 +50,13 @@ entire tradition, but can also be used to (re-)analyze individual readings.
 =cut
 
 sub reading_lookup {
-       return _lemmatize_sequence( 1, @_ );
-}
-
-sub _lemmatize_sequence {
-       my( $replace, @path ) = @_;
-       my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
-       if( $tagresult ) {
-               # Map the tagged words onto the original readings, splitting 
-               # them up into lexemes where necessary.
-               # NOTE we can have multiple lexemes in a reading, but not
-               # multiple readings to a lexeme.
-               my @tags = split( /\n/, $tagresult );
-               my @lexemes;
-               my $curr_rdg = shift @path;
-               my @curr_lexemes;
-               my $unused_rdg_part;
-               foreach my $tag ( @tags ) {
-                       # Get the original word
-                       my( $lexeme, @rest ) = split( /\t/, $tag );
-                       # Lemmatize the whole
-                       my @forms = _parse_wordform( $tag );
-                       my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
-                               'string' => $lexeme, 'language' => 'English',
-                               'wordform_matchlist' => \@forms );
-                       # Find the next non-meta reading
-                       while( $curr_rdg && $curr_rdg->is_meta ) {
-                               $curr_rdg = shift @path;
-                       }
-                       unless( $curr_rdg ) {
-                               warn "Ran out of readings in sequence at $lexeme";
-                               last;
-                       }
-                       my $curr_rdg_text = $curr_rdg->has_normal_form 
-                               ? $curr_rdg->normal_form : $curr_rdg->text;
-                       if( $unused_rdg_part &&
-                               $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-                               # Nth part of curr_rdg
-                               $unused_rdg_part = $2;
-                               push( @curr_lexemes, $lexobj );
-                       } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-                               # Flag an error if there is already an unused reading part.
-                               warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
-                                       if $unused_rdg_part;
-                               $unused_rdg_part = $2; # will be empty if the whole reading matched
-                               push( @curr_lexemes, $lexobj );
-                       } else {
-                               # We do not cope with the idea of a lexeme being 
-                               # spread across multiple readings.
-                               warn "Word sequence changed unexpectedly in text";
-                               # See if we can find a matching reading
-                               my @lookahead;
-                               my $matched;
-                               while( my $nr = shift @path ) {
-                                       my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
-                                       if( $nrtext =~ /^\Q$lexeme\E/ ) {
-                                               $curr_rdg = $lookahead[-1] if @lookahead;
-                                               $matched = 1;
-                                               last;
-                                       } else {
-                                               push( @lookahead, $nr );
-                                       }
-                               }
-                               # No match? Restore the state we had
-                               unless( $matched ) {
-                                       unshift( @path, @lookahead );
-                               }
-                               # Trigger a move
-                               $unused_rdg_part = '';
-                       }
-                       
-                       unless( $unused_rdg_part ) {
-                               # Record the lexemes for the given reading.
-                               #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
-                               #       join( ' ', map { $_->string } @curr_lexemes ),
-                               #       $curr_rdg->id, $curr_rdg->text );
-                               _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
-                               $curr_rdg = shift @path;
-                               @curr_lexemes = ();
-                       }
-               }
-       }
-}
-
-# Utility function so that we can cheat and use it when we need both the path
-# and its text.
-sub _text_from_path {
-       my( $normalize, @path ) = @_;
-       my $pathtext = '';
-       my $last;
-       foreach my $r ( @path ) {
-               unless ( $r->join_prior || !$last || $last->join_next ) {
-                       $pathtext .= ' ';
-               } 
-               $pathtext .= ( $normalize && $r->has_normal_form ) 
-                       ? $r->normal_form : $r->text;
-               $last = $r;
-       }
-       return $pathtext;
-}
-
-# Utility function that actually calls the tree tagger.
-sub _treetag_string {
-       my( $text ) = @_;
-       my $wittext = encode_utf8( $text );
-       # Then see if we have TreeTagger
-       my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english";
-       unless( -f $taggercmd ) {
-               warn "Cannot do English word lemmatization without TreeTagger";
-               return;
-       }
-       # OK, we can run it then.
-       my @cmd = ( $taggercmd );
-       my( $tagresult, $err ); # Capture the output and error
-       run( \@cmd, \$wittext, \$tagresult, \$err );
-       # TODO check for error
-       return decode_utf8( $tagresult );
+       my( @path ) = @_;
+       my %opts = ( 
+               'language' => 'French',
+               'callback' => sub { _parse_wordform( @_ ) },
+               'path' => \@path,
+               );
+       return reading_lookup_treetagger( %opts );
 }
 
 # Utility function to turn a TreeTagger result into a WordForm
@@ -250,9 +81,7 @@ sub _parse_wordform {
 
 =over
 
-=item * Handle package dependencies more gracefully
-
-=item * Refactor English/French use of TreeTagger into its own util package
+=item * Tests!
 
 =back
 
diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm

index 6847354..7453044 100644 (file)
--- a/lib/Text/Tradition/Language/French.pm
+++ b/lib/Text/Tradition/Language/French.pm
@@ -2,17 +2,10 @@ package Text::Tradition::Language::French;
 
 use strict;
 use warnings;
-use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run /;
-use Lingua::TagSet::Multext;
-use Lingua::TagSet::TreeTagger;
-use Module::Load;
-use Text::Tradition::Collation::Reading::Lexeme;
-use Text::Tradition::Collation::Reading::WordForm;
+use Module::Load qw/ load /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /;
 use TryCatch;
 
-my $MORPHDIR = '/Users/tla/Projects/morphology';
-
 =head1 NAME
 
 Text::Tradition::Language::French - language-specific module for French
@@ -71,7 +64,7 @@ SKIP: {
        is( $ambig, 102, "Found 102 ambiguous forms as expected" );
        
        # Try setting the normal form of a reading and re-analyzing
-       my $mr = $tf->collation->reading('99,2');
+       my $mr = $tf->collation->reading('r99.2');
        is( $mr->text, 'minspire', "Picked correct test reading" );
        is( $mr->language, 'French', "Reading has correct language setting" );
        $mr->normal_form( "m'inspire" );
@@ -85,64 +78,11 @@ SKIP: {
 
 sub lemmatize {
        my $tradition = shift;
-
-       # Given a tradition, lemmatize it witness by witness and see what we get.
-       my $workdir = File::Temp->newdir();
-       my $c = $tradition->collation;
-       # First, clear out all existing lexemes from the readings. Save the
-       # path as long as we went to the trouble of generating it.
-       my %witness_paths;
-       foreach my $wit ( $tradition->witnesses ) {
-               my @sigla = ( $wit->sigil );
-               push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
-               foreach my $sig ( @sigla ) {
-                       my @path = grep { !$_->is_meta } 
-                               $c->reading_sequence( $c->start, $c->end, $sig );
-                       map { $_->clear_lexemes } @path;
-                       $witness_paths{$sig} = \@path;
-               }
-       }
-       
-       foreach my $sig ( keys %witness_paths ) {
-               # Get the text as a sequence of readings and as a string
-               print STDERR "Morphologizing witness $sig\n";
-               _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
-       }
-}
-
-sub _update_reading_lexemes {
-       my( $replace, $reading, @lexemes ) = @_;
-       if( $reading->has_lexemes && !$replace ) {
-               # We need to merge what is in @lexemes with what we have already.
-               my @oldlex = $reading->lexemes;
-               my $cmp1 = join( '||', map { $_->string } @oldlex );
-               my $cmp2 = join( '||', map { $_->string } @lexemes );
-               if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
-                       # The lexeme strings are the same, so merge the possible
-                       # word forms from new to old.
-                       foreach my $i ( 0 .. $#lexemes ) {
-                               my $ol = $oldlex[$i];
-                               my $nl = $lexemes[$i];
-                               my %ofw;
-                               map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
-                               foreach my $form ( $nl->matching_forms ) {
-                                       unless( $ofw{$form->to_string} ) {
-                                               print STDERR "Adding form " . $form->to_string . 
-                                                       " to lexeme " . $nl->string . " at $reading\n";
-                                               $ol->add_matching_form( $form );
-                                               $ol->is_disambiguated(0);
-                                       }
-                               }
-                       }
-               } else {
-                       warn "Lexeme layout for $reading changed; replacing the lot";
-                       $reading->clear_lexemes;
-                       $reading->add_lexeme( @lexemes );
-               }
-       } else {
-               $reading->clear_lexemes if $replace;
-               $reading->add_lexeme( @lexemes );
-       }
+       my %opts = ( 
+               'language' => 'French', 
+               'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) } 
+               );
+       return lemmatize_treetagger( $tradition, %opts );
 }
 
 =head2 reading_lookup( $rdg[, $rdg, ...] )
@@ -154,122 +94,13 @@ entire tradition, but can also be used to (re-)analyze individual readings.
 =cut
 
 sub reading_lookup {
-       return _lemmatize_sequence( 1, @_ );
-}
-
-sub _lemmatize_sequence {
-       my( $replace, @path ) = @_;
-       my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
-       if( $tagresult ) {
-               # Map the tagged words onto the original readings, splitting 
-               # them up into lexemes where necessary.
-               # NOTE we can have multiple lexemes in a reading, but not
-               # multiple readings to a lexeme.
-               my @tags = split( /\n/, $tagresult );
-               my @lexemes;
-               my $curr_rdg = shift @path;
-               my @curr_lexemes;
-               my $unused_rdg_part;
-               foreach my $tag ( @tags ) {
-                       # Get the original word
-                       my( $lexeme, @rest ) = split( /\t/, $tag );
-                       # Lemmatize the whole
-                       my @forms = _parse_wordform( _flemm_lookup( $tag ) );
-                       my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
-                               'string' => $lexeme, 'language' => 'French',
-                               'wordform_matchlist' => \@forms );
-                       # Find the next non-meta reading
-                       while( $curr_rdg && $curr_rdg->is_meta ) {
-                               $curr_rdg = shift @path;
-                       }
-                       unless( $curr_rdg ) {
-                               warn "Ran out of readings in sequence at $lexeme";
-                               last;
-                       }
-                       my $curr_rdg_text = $curr_rdg->has_normal_form 
-                               ? $curr_rdg->normal_form : $curr_rdg->text;
-                       if( $unused_rdg_part &&
-                               $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-                               # Nth part of curr_rdg
-                               $unused_rdg_part = $2;
-                               push( @curr_lexemes, $lexobj );
-                       } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-                               # Flag an error if there is already an unused reading part.
-                               warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
-                                       if $unused_rdg_part;
-                               $unused_rdg_part = $2; # will be empty if the whole reading matched
-                               push( @curr_lexemes, $lexobj );
-                       } else {
-                               # We do not cope with the idea of a lexeme being 
-                               # spread across multiple readings.
-                               warn "Word sequence changed unexpectedly in text";
-                               # See if we can find a matching reading
-                               my @lookahead;
-                               my $matched;
-                               while( my $nr = shift @path ) {
-                                       my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
-                                       if( $nrtext =~ /^\Q$lexeme\E/ ) {
-                                               $curr_rdg = $lookahead[-1] if @lookahead;
-                                               $matched = 1;
-                                               last;
-                                       } else {
-                                               push( @lookahead, $nr );
-                                       }
-                               }
-                               # No match? Restore the state we had
-                               unless( $matched ) {
-                                       unshift( @path, @lookahead );
-                               }
-                               # Trigger a move
-                               $unused_rdg_part = '';
-                       }
-                       
-                       unless( $unused_rdg_part ) {
-                               # Record the lexemes for the given reading.
-                               #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
-                               #       join( ' ', map { $_->string } @curr_lexemes ),
-                               #       $curr_rdg->id, $curr_rdg->text );
-                               _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
-                               $curr_rdg = shift @path;
-                               @curr_lexemes = ();
-                       }
-               }
-       }
-}
-
-# Utility function so that we can cheat and use it when we need both the path
-# and its text.
-sub _text_from_path {
-       my( $normalize, @path ) = @_;
-       my $pathtext = '';
-       my $last;
-       foreach my $r ( @path ) {
-               unless ( $r->join_prior || !$last || $last->join_next ) {
-                       $pathtext .= ' ';
-               } 
-               $pathtext .= ( $normalize && $r->has_normal_form ) 
-                       ? $r->normal_form : $r->text;
-               $last = $r;
-       }
-       return $pathtext;
-}
-
-# Utility function that actually calls the tree tagger.
-sub _treetag_string {
-       my( $text ) = @_;
-       my $wittext = encode_utf8( $text );
-       # Then see if we have TreeTagger
-       my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
-       unless( -f $taggercmd ) {
-               warn "Cannot do French word lemmatization without TreeTagger";
-               return;
-       }
-       # OK, we can run it then.
-       my @cmd = ( $taggercmd );
-       my( $tagresult, $err ); # Capture the output and error
-       run( \@cmd, \$wittext, \$tagresult, \$err );
-       # TODO check for error
-       return decode_utf8( $tagresult );
+       my( @path ) = @_;
+       my %opts = ( 
+               'language' => 'French',
+               'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
+               'path' => \@path,
+               );
+       return reading_lookup_treetagger( %opts );
 }
 
 # Closure and utility function for the package lemmatizer
@@ -328,7 +159,7 @@ sub _parse_wordform {
 
 =over
 
-=item * Handle package dependencies more gracefully
+=item * Try to do more things with Perl objects in Flemm and TT
 
 =back
 
diff --git a/t/text_tradition_language_french.t b/t/text_tradition_language_french.t

index 01b206b..84e9d02 100644 (file)
--- a/t/text_tradition_language_french.t
+++ b/t/text_tradition_language_french.t
@@ -45,7 +45,7 @@ SKIP: {
        is( $ambig, 102, "Found 102 ambiguous forms as expected" );
        
        # Try setting the normal form of a reading and re-analyzing
-       my $mr = $tf->collation->reading('99,2');
+       my $mr = $tf->collation->reading('r99.2');
        is( $mr->text, 'minspire', "Picked correct test reading" );
        is( $mr->language, 'French', "Reading has correct language setting" );
        $mr->normal_form( "m'inspire" );
Makefile.PL		patch \| blob \| blame \| history
lib/Text/Tradition/Language/Base.pm	[new file with mode: 0644]	patch \| blob
lib/Text/Tradition/Language/English.pm		patch \| blob \| blame \| history
lib/Text/Tradition/Language/French.pm		patch \| blob \| blame \| history
t/text_tradition_language_french.t		patch \| blob \| blame \| history