From: Tara L Andrews <tla\@mit.edu>
Date: Mon, 7 May 2012 14:16:58 +0000 (+0200)
Subject: refactor English/French shared TT logic into Base.pm
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e0f6836abec58dd0dd896a130c7587368880d255;p=scpubgit%2Fstemmatology.git

refactor English/French shared TT logic into Base.pm
---

diff --git a/Makefile.PL b/Makefile.PL
index 949a95c..ceb652d 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -36,4 +36,7 @@ requires( 'Lingua::TagSet::Multext' );
 requires( 'Lingua::TagSet::TreeTagger' );
 requires( 'Lingua::Features::Structure' );
 build_requires( 'Test::Warn' );
+# Modules needed for morphology but not trivially CPANnable
+recommends( 'Lingua::TreeTagger' );
+recommends( 'Flemm' );
 &WriteAll;
diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm
new file mode 100644
index 0000000..797c085
--- /dev/null
+++ b/lib/Text/Tradition/Language/Base.pm
@@ -0,0 +1,278 @@
+package Text::Tradition::Language::Base;
+
+use strict;
+use warnings;
+use Encode qw/ encode_utf8 decode_utf8 /;
+use Exporter 'import';
+use vars qw/ @EXPORT_OK /;
+use IPC::Run qw/ run /;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger;
+use Module::Load;
+use Text::Tradition::Collation::Reading::Lexeme;
+use Text::Tradition::Collation::Reading::WordForm;
+use TryCatch;
+
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger /;
+
+=head1 NAME
+
+Text::Tradition::Language::Base - Base subroutines for lemmatization of words
+
+=head1 DESCRIPTION
+
+Common routines for applying morphological tagging to a Text::Tradition. Used
+with callbacks from the named language packages.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize_treetagger( $tradition )
+
+Evaluates the tradition with the given options, and returns the results.
+
+=cut
+
+sub lemmatize_treetagger {
+	my( $tradition, %opts ) = @_;
+
+	# Given a tradition, lemmatize it witness by witness and see what we get.
+	my $c = $tradition->collation;
+	# First, clear out all existing lexemes from the readings. 
+	my %witness_paths = _clear_reading_lexemes( $tradition );
+	
+	foreach my $sig ( keys %witness_paths ) {
+		# Get the text as a sequence of readings and as a string
+		print STDERR "Morphologizing witness $sig\n";
+		my %witopts = (
+			'path' => $witness_paths{$sig},
+			%opts
+			);
+		_lemmatize_treetagger_sequence( %witopts );
+	}
+}
+
+sub _clear_reading_lexemes {
+	my $tradition = shift;
+		my $c = $tradition->collation;
+	# Clear out all existing lexemes from the readings. Save the path as long 
+	# as we went to the trouble of generating it.
+	my %witness_paths;
+	foreach my $wit ( $tradition->witnesses ) {
+		my @sigla = ( $wit->sigil );
+		push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
+		foreach my $sig ( @sigla ) {
+			my @path = grep { !$_->is_meta } 
+				$c->reading_sequence( $c->start, $c->end, $sig );
+			map { $_->clear_lexemes } @path;
+			$witness_paths{$sig} = \@path;
+		}
+	}
+	return %witness_paths;
+}
+
+=head2 reading_lookup( $rdg[, $rdg, ...] )
+
+Looks up one or more readings using the Flemm package, and returns the
+possible results.  This uses the same logic as L<lemmatize> above for the
+entire tradition, but can also be used to (re-)analyze individual readings.
+
+=cut
+
+sub reading_lookup_treetagger {
+	my %opts = @_;
+	$opts{'replace'} = 1;
+	return _lemmatize_treetagger_sequence( %opts );
+}
+
+sub _lemmatize_treetagger_sequence {
+	my %opts = @_;
+	my @path = @{$opts{'path'}};
+	my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
+	if( $tagresult ) {
+		# Map the tagged words onto the original readings, splitting 
+		# them up into lexemes where necessary.
+		# NOTE we can have multiple lexemes in a reading, but not
+		# multiple readings to a lexeme.
+		my @tags = split( /\n/, $tagresult );
+		my @lexemes;
+		my $curr_rdg = shift @path;
+		my @curr_lexemes;
+		my $unused_rdg_part;
+		foreach my $tag ( @tags ) {
+			# Get the original word
+			my( $lexeme, @rest ) = split( /\t/, $tag );
+			# Lemmatize the whole
+			# TODO error trap this
+			my @forms = $opts{'callback'}( $tag );
+
+			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
+				'string' => $lexeme, 'language' => $opts{'language'},
+				'wordform_matchlist' => \@forms );
+			# Find the next non-meta reading
+			while( $curr_rdg && $curr_rdg->is_meta ) {
+				$curr_rdg = shift @path;
+			}
+			unless( $curr_rdg ) {
+				warn "Ran out of readings in sequence at $lexeme";
+				last;
+			}
+			my $curr_rdg_text = $curr_rdg->has_normal_form 
+				? $curr_rdg->normal_form : $curr_rdg->text;
+			if( $unused_rdg_part &&
+				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+				# Nth part of curr_rdg
+				$unused_rdg_part = $2;
+				push( @curr_lexemes, $lexobj );
+			} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+				# Flag an error if there is already an unused reading part.
+				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
+					if $unused_rdg_part;
+				$unused_rdg_part = $2; # will be empty if the whole reading matched
+				push( @curr_lexemes, $lexobj );
+			} else {
+				# We do not cope with the idea of a lexeme being 
+				# spread across multiple readings.
+				warn "Word sequence changed unexpectedly in text";
+				# See if we can find a matching reading
+				my @lookahead;
+				my $matched;
+				while( my $nr = shift @path ) {
+					my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
+					if( $nrtext =~ /^\Q$lexeme\E/ ) {
+						$curr_rdg = $lookahead[-1] if @lookahead;
+						$matched = 1;
+						last;
+					} else {
+						push( @lookahead, $nr );
+					}
+				}
+				# No match? Restore the state we had
+				unless( $matched ) {
+					unshift( @path, @lookahead );
+				}
+				# Trigger a move
+				$unused_rdg_part = '';
+			}
+			
+			unless( $unused_rdg_part ) {
+				# Record the lexemes for the given reading.
+				#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
+				#	join( ' ', map { $_->string } @curr_lexemes ),
+				#	$curr_rdg->id, $curr_rdg->text );
+				_update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
+				$curr_rdg = shift @path;
+				@curr_lexemes = ();
+			}
+		}
+	}
+}
+
+sub _update_reading_lexemes {
+	my( $replace, $reading, @lexemes ) = @_;
+	if( $reading->has_lexemes && !$replace ) {
+		# We need to merge what is in @lexemes with what we have already.
+		my @oldlex = $reading->lexemes;
+		my $cmp1 = join( '||', map { $_->string } @oldlex );
+		my $cmp2 = join( '||', map { $_->string } @lexemes );
+		if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
+			# The lexeme strings are the same, so merge the possible
+			# word forms from new to old.
+			foreach my $i ( 0 .. $#lexemes ) {
+				my $ol = $oldlex[$i];
+				my $nl = $lexemes[$i];
+				my %ofw;
+				map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
+				foreach my $form ( $nl->matching_forms ) {
+					unless( $ofw{$form->to_string} ) {
+						print STDERR "Adding form " . $form->to_string . 
+							" to lexeme " . $nl->string . " at $reading\n";
+						$ol->add_matching_form( $form );
+						$ol->is_disambiguated(0);
+					}
+				}
+			}
+		} else {
+			warn "Lexeme layout for $reading changed; replacing the lot";
+			$reading->clear_lexemes;
+			$reading->add_lexeme( @lexemes );
+		}
+	} else {
+		$reading->clear_lexemes if $replace;
+		$reading->add_lexeme( @lexemes );
+	}
+}
+
+# Utility function so that we can cheat and use it when we need both the path
+# and its text.
+sub _text_from_path {
+	my( $normalize, @path ) = @_;
+	my $pathtext = '';
+	my $last;
+	foreach my $r ( @path ) {
+		unless ( $r->join_prior || !$last || $last->join_next ) {
+			$pathtext .= ' ';
+		} 
+		$pathtext .= ( $normalize && $r->has_normal_form ) 
+			? $r->normal_form : $r->text;
+		$last = $r;
+	}
+	return $pathtext;
+}
+
+# Utility function that actually calls the tree tagger.
+sub _treetag_string {
+	my( $text, $lang ) = @_;
+	my $wittext = encode_utf8( $text );
+	# Then see if we have TreeTagger
+	try {
+		load( 'Lingua::TreeTagger' );
+	} catch {
+		warn "Cannot run TreeTagger without Lingua::TreeTagger module";
+		return '';
+	}
+	# OK, we can run it then.
+	# First upgrade to UTF8 for necessary languages.
+	$lang = lc( $lang );
+	my $use_utf8;
+	my @utf8_supported = qw/ french /;
+	if( grep { $_ eq $lang } @utf8_supported ) {
+		$lang .= '-utf8';
+		$use_utf8 = 1;
+	}
+	# Now instantiate and run the tagger.
+	my $tagger = Lingua::TreeTagger->new(
+		'language' => $lang,
+		'options' => [ qw/ -token -lemma / ],
+		);
+	if( $use_utf8 ) {
+		$text = encode_utf8( $text );
+	} 
+	my $tagresult = $tagger->tag_text( \$text );
+	
+	# TODO maybe send the tokens back rather than the interpreted string...
+	if( $use_utf8 ) {
+		return decode_utf8( $tagresult->as_text() );
+	} 
+	return $tagresult->as_text();
+}
+
+
+1;
+
+=head2 TODO
+
+=over
+
+=item * Handle package dependencies more gracefully
+
+=back
+
+=head1 LICENSE
+
+This package is free software and is provided "as is" without express
+or implied warranty.  You can redistribute it and/or modify it under
+the same terms as Perl itself.
+
+=head1 AUTHOR
+
+Tara L Andrews E<lt>aurum@cpan.orgE<gt>
diff --git a/lib/Text/Tradition/Language/English.pm b/lib/Text/Tradition/Language/English.pm
index 0ea6858..9d25c55 100644
--- a/lib/Text/Tradition/Language/English.pm
+++ b/lib/Text/Tradition/Language/English.pm
@@ -2,16 +2,9 @@ package Text::Tradition::Language::English;
 
 use strict;
 use warnings;
-use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run /;
-use Lingua::TagSet::TreeTagger;
-use Module::Load;
-use Text::Tradition::Collation::Reading::Lexeme;
-use Text::Tradition::Collation::Reading::WordForm;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /;
 use TryCatch;
 
-my $MORPHDIR = '/Users/tla/Projects/morphology';
-
 =head1 NAME
 
 Text::Tradition::Language::English - language-specific module for English
@@ -41,64 +34,11 @@ use_ok( 'Text::Tradition::Language::English' );
 
 sub lemmatize {
 	my $tradition = shift;
-
-	# Given a tradition, lemmatize it witness by witness and see what we get.
-	my $workdir = File::Temp->newdir();
-	my $c = $tradition->collation;
-	# First, clear out all existing lexemes from the readings. Save the
-	# path as long as we went to the trouble of generating it.
-	my %witness_paths;
-	foreach my $wit ( $tradition->witnesses ) {
-		my @sigla = ( $wit->sigil );
-		push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
-		foreach my $sig ( @sigla ) {
-			my @path = grep { !$_->is_meta } 
-				$c->reading_sequence( $c->start, $c->end, $sig );
-			map { $_->clear_lexemes } @path;
-			$witness_paths{$sig} = \@path;
-		}
-	}
-	
-	foreach my $sig ( keys %witness_paths ) {
-		# Get the text as a sequence of readings and as a string
-		print STDERR "Morphologizing witness $sig\n";
-		_lemmatize_sequence( undef, @{$witness_paths{$sig}} );
-	}
-}
-
-sub _update_reading_lexemes {
-	my( $replace, $reading, @lexemes ) = @_;
-	if( $reading->has_lexemes && !$replace ) {
-		# We need to merge what is in @lexemes with what we have already.
-		my @oldlex = $reading->lexemes;
-		my $cmp1 = join( '||', map { $_->string } @oldlex );
-		my $cmp2 = join( '||', map { $_->string } @lexemes );
-		if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
-			# The lexeme strings are the same, so merge the possible
-			# word forms from new to old.
-			foreach my $i ( 0 .. $#lexemes ) {
-				my $ol = $oldlex[$i];
-				my $nl = $lexemes[$i];
-				my %ofw;
-				map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
-				foreach my $form ( $nl->matching_forms ) {
-					unless( $ofw{$form->to_string} ) {
-						print STDERR "Adding form " . $form->to_string . 
-							" to lexeme " . $nl->string . " at $reading\n";
-						$ol->add_matching_form( $form );
-						$ol->is_disambiguated(0);
-					}
-				}
-			}
-		} else {
-			warn "Lexeme layout for $reading changed; replacing the lot";
-			$reading->clear_lexemes;
-			$reading->add_lexeme( @lexemes );
-		}
-	} else {
-		$reading->clear_lexemes if $replace;
-		$reading->add_lexeme( @lexemes );
-	}
+	my %opts = ( 
+		'language' => 'French', 
+		'callback' => sub { _parse_wordform( @_ ) } 
+		);
+	return lemmatize_treetagger( $tradition, %opts );
 }
 
 =head2 reading_lookup( $rdg[, $rdg, ...] )
@@ -110,122 +50,13 @@ entire tradition, but can also be used to (re-)analyze individual readings.
 =cut
 
 sub reading_lookup {
-	return _lemmatize_sequence( 1, @_ );
-}
-
-sub _lemmatize_sequence {
-	my( $replace, @path ) = @_;
-	my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
-	if( $tagresult ) {
-		# Map the tagged words onto the original readings, splitting 
-		# them up into lexemes where necessary.
-		# NOTE we can have multiple lexemes in a reading, but not
-		# multiple readings to a lexeme.
-		my @tags = split( /\n/, $tagresult );
-		my @lexemes;
-		my $curr_rdg = shift @path;
-		my @curr_lexemes;
-		my $unused_rdg_part;
-		foreach my $tag ( @tags ) {
-			# Get the original word
-			my( $lexeme, @rest ) = split( /\t/, $tag );
-			# Lemmatize the whole
-			my @forms = _parse_wordform( $tag );
-			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
-				'string' => $lexeme, 'language' => 'English',
-				'wordform_matchlist' => \@forms );
-			# Find the next non-meta reading
-			while( $curr_rdg && $curr_rdg->is_meta ) {
-				$curr_rdg = shift @path;
-			}
-			unless( $curr_rdg ) {
-				warn "Ran out of readings in sequence at $lexeme";
-				last;
-			}
-			my $curr_rdg_text = $curr_rdg->has_normal_form 
-				? $curr_rdg->normal_form : $curr_rdg->text;
-			if( $unused_rdg_part &&
-				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-				# Nth part of curr_rdg
-				$unused_rdg_part = $2;
-				push( @curr_lexemes, $lexobj );
-			} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-				# Flag an error if there is already an unused reading part.
-				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
-					if $unused_rdg_part;
-				$unused_rdg_part = $2; # will be empty if the whole reading matched
-				push( @curr_lexemes, $lexobj );
-			} else {
-				# We do not cope with the idea of a lexeme being 
-				# spread across multiple readings.
-				warn "Word sequence changed unexpectedly in text";
-				# See if we can find a matching reading
-				my @lookahead;
-				my $matched;
-				while( my $nr = shift @path ) {
-					my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
-					if( $nrtext =~ /^\Q$lexeme\E/ ) {
-						$curr_rdg = $lookahead[-1] if @lookahead;
-						$matched = 1;
-						last;
-					} else {
-						push( @lookahead, $nr );
-					}
-				}
-				# No match? Restore the state we had
-				unless( $matched ) {
-					unshift( @path, @lookahead );
-				}
-				# Trigger a move
-				$unused_rdg_part = '';
-			}
-			
-			unless( $unused_rdg_part ) {
-				# Record the lexemes for the given reading.
-				#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
-				#	join( ' ', map { $_->string } @curr_lexemes ),
-				#	$curr_rdg->id, $curr_rdg->text );
-				_update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
-				$curr_rdg = shift @path;
-				@curr_lexemes = ();
-			}
-		}
-	}
-}
-
-# Utility function so that we can cheat and use it when we need both the path
-# and its text.
-sub _text_from_path {
-	my( $normalize, @path ) = @_;
-	my $pathtext = '';
-	my $last;
-	foreach my $r ( @path ) {
-		unless ( $r->join_prior || !$last || $last->join_next ) {
-			$pathtext .= ' ';
-		} 
-		$pathtext .= ( $normalize && $r->has_normal_form ) 
-			? $r->normal_form : $r->text;
-		$last = $r;
-	}
-	return $pathtext;
-}
-
-# Utility function that actually calls the tree tagger.
-sub _treetag_string {
-	my( $text ) = @_;
-	my $wittext = encode_utf8( $text );
-	# Then see if we have TreeTagger
-	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english";
-	unless( -f $taggercmd ) {
-		warn "Cannot do English word lemmatization without TreeTagger";
-		return;
-	}
-	# OK, we can run it then.
-	my @cmd = ( $taggercmd );
-	my( $tagresult, $err ); # Capture the output and error
-	run( \@cmd, \$wittext, \$tagresult, \$err );
-	# TODO check for error
-	return decode_utf8( $tagresult );
+	my( @path ) = @_;
+	my %opts = ( 
+		'language' => 'French',
+		'callback' => sub { _parse_wordform( @_ ) },
+		'path' => \@path,
+		);
+	return reading_lookup_treetagger( %opts );
 }
 
 # Utility function to turn a TreeTagger result into a WordForm
@@ -250,9 +81,7 @@ sub _parse_wordform {
 
 =over
 
-=item * Handle package dependencies more gracefully
-
-=item * Refactor English/French use of TreeTagger into its own util package
+=item * Tests!
 
 =back
 
diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm
index 6847354..7453044 100644
--- a/lib/Text/Tradition/Language/French.pm
+++ b/lib/Text/Tradition/Language/French.pm
@@ -2,17 +2,10 @@ package Text::Tradition::Language::French;
 
 use strict;
 use warnings;
-use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run /;
-use Lingua::TagSet::Multext;
-use Lingua::TagSet::TreeTagger;
-use Module::Load;
-use Text::Tradition::Collation::Reading::Lexeme;
-use Text::Tradition::Collation::Reading::WordForm;
+use Module::Load qw/ load /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /;
 use TryCatch;
 
-my $MORPHDIR = '/Users/tla/Projects/morphology';
-
 =head1 NAME
 
 Text::Tradition::Language::French - language-specific module for French
@@ -71,7 +64,7 @@ SKIP: {
 	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
 	
 	# Try setting the normal form of a reading and re-analyzing
-	my $mr = $tf->collation->reading('99,2');
+	my $mr = $tf->collation->reading('r99.2');
 	is( $mr->text, 'minspire', "Picked correct test reading" );
 	is( $mr->language, 'French', "Reading has correct language setting" );
 	$mr->normal_form( "m'inspire" );
@@ -85,64 +78,11 @@ SKIP: {
 
 sub lemmatize {
 	my $tradition = shift;
-
-	# Given a tradition, lemmatize it witness by witness and see what we get.
-	my $workdir = File::Temp->newdir();
-	my $c = $tradition->collation;
-	# First, clear out all existing lexemes from the readings. Save the
-	# path as long as we went to the trouble of generating it.
-	my %witness_paths;
-	foreach my $wit ( $tradition->witnesses ) {
-		my @sigla = ( $wit->sigil );
-		push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
-		foreach my $sig ( @sigla ) {
-			my @path = grep { !$_->is_meta } 
-				$c->reading_sequence( $c->start, $c->end, $sig );
-			map { $_->clear_lexemes } @path;
-			$witness_paths{$sig} = \@path;
-		}
-	}
-	
-	foreach my $sig ( keys %witness_paths ) {
-		# Get the text as a sequence of readings and as a string
-		print STDERR "Morphologizing witness $sig\n";
-		_lemmatize_sequence( undef, @{$witness_paths{$sig}} );
-	}
-}
-
-sub _update_reading_lexemes {
-	my( $replace, $reading, @lexemes ) = @_;
-	if( $reading->has_lexemes && !$replace ) {
-		# We need to merge what is in @lexemes with what we have already.
-		my @oldlex = $reading->lexemes;
-		my $cmp1 = join( '||', map { $_->string } @oldlex );
-		my $cmp2 = join( '||', map { $_->string } @lexemes );
-		if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
-			# The lexeme strings are the same, so merge the possible
-			# word forms from new to old.
-			foreach my $i ( 0 .. $#lexemes ) {
-				my $ol = $oldlex[$i];
-				my $nl = $lexemes[$i];
-				my %ofw;
-				map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
-				foreach my $form ( $nl->matching_forms ) {
-					unless( $ofw{$form->to_string} ) {
-						print STDERR "Adding form " . $form->to_string . 
-							" to lexeme " . $nl->string . " at $reading\n";
-						$ol->add_matching_form( $form );
-						$ol->is_disambiguated(0);
-					}
-				}
-			}
-		} else {
-			warn "Lexeme layout for $reading changed; replacing the lot";
-			$reading->clear_lexemes;
-			$reading->add_lexeme( @lexemes );
-		}
-	} else {
-		$reading->clear_lexemes if $replace;
-		$reading->add_lexeme( @lexemes );
-	}
+	my %opts = ( 
+		'language' => 'French', 
+		'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) } 
+		);
+	return lemmatize_treetagger( $tradition, %opts );
 }
 
 =head2 reading_lookup( $rdg[, $rdg, ...] )
@@ -154,122 +94,13 @@ entire tradition, but can also be used to (re-)analyze individual readings.
 =cut
 
 sub reading_lookup {
-	return _lemmatize_sequence( 1, @_ );
-}
-
-sub _lemmatize_sequence {
-	my( $replace, @path ) = @_;
-	my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
-	if( $tagresult ) {
-		# Map the tagged words onto the original readings, splitting 
-		# them up into lexemes where necessary.
-		# NOTE we can have multiple lexemes in a reading, but not
-		# multiple readings to a lexeme.
-		my @tags = split( /\n/, $tagresult );
-		my @lexemes;
-		my $curr_rdg = shift @path;
-		my @curr_lexemes;
-		my $unused_rdg_part;
-		foreach my $tag ( @tags ) {
-			# Get the original word
-			my( $lexeme, @rest ) = split( /\t/, $tag );
-			# Lemmatize the whole
-			my @forms = _parse_wordform( _flemm_lookup( $tag ) );
-			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
-				'string' => $lexeme, 'language' => 'French',
-				'wordform_matchlist' => \@forms );
-			# Find the next non-meta reading
-			while( $curr_rdg && $curr_rdg->is_meta ) {
-				$curr_rdg = shift @path;
-			}
-			unless( $curr_rdg ) {
-				warn "Ran out of readings in sequence at $lexeme";
-				last;
-			}
-			my $curr_rdg_text = $curr_rdg->has_normal_form 
-				? $curr_rdg->normal_form : $curr_rdg->text;
-			if( $unused_rdg_part &&
-				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-				# Nth part of curr_rdg
-				$unused_rdg_part = $2;
-				push( @curr_lexemes, $lexobj );
-			} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-				# Flag an error if there is already an unused reading part.
-				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
-					if $unused_rdg_part;
-				$unused_rdg_part = $2; # will be empty if the whole reading matched
-				push( @curr_lexemes, $lexobj );
-			} else {
-				# We do not cope with the idea of a lexeme being 
-				# spread across multiple readings.
-				warn "Word sequence changed unexpectedly in text";
-				# See if we can find a matching reading
-				my @lookahead;
-				my $matched;
-				while( my $nr = shift @path ) {
-					my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
-					if( $nrtext =~ /^\Q$lexeme\E/ ) {
-						$curr_rdg = $lookahead[-1] if @lookahead;
-						$matched = 1;
-						last;
-					} else {
-						push( @lookahead, $nr );
-					}
-				}
-				# No match? Restore the state we had
-				unless( $matched ) {
-					unshift( @path, @lookahead );
-				}
-				# Trigger a move
-				$unused_rdg_part = '';
-			}
-			
-			unless( $unused_rdg_part ) {
-				# Record the lexemes for the given reading.
-				#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
-				#	join( ' ', map { $_->string } @curr_lexemes ),
-				#	$curr_rdg->id, $curr_rdg->text );
-				_update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
-				$curr_rdg = shift @path;
-				@curr_lexemes = ();
-			}
-		}
-	}
-}
-
-# Utility function so that we can cheat and use it when we need both the path
-# and its text.
-sub _text_from_path {
-	my( $normalize, @path ) = @_;
-	my $pathtext = '';
-	my $last;
-	foreach my $r ( @path ) {
-		unless ( $r->join_prior || !$last || $last->join_next ) {
-			$pathtext .= ' ';
-		} 
-		$pathtext .= ( $normalize && $r->has_normal_form ) 
-			? $r->normal_form : $r->text;
-		$last = $r;
-	}
-	return $pathtext;
-}
-
-# Utility function that actually calls the tree tagger.
-sub _treetag_string {
-	my( $text ) = @_;
-	my $wittext = encode_utf8( $text );
-	# Then see if we have TreeTagger
-	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
-	unless( -f $taggercmd ) {
-		warn "Cannot do French word lemmatization without TreeTagger";
-		return;
-	}
-	# OK, we can run it then.
-	my @cmd = ( $taggercmd );
-	my( $tagresult, $err ); # Capture the output and error
-	run( \@cmd, \$wittext, \$tagresult, \$err );
-	# TODO check for error
-	return decode_utf8( $tagresult );
+	my( @path ) = @_;
+	my %opts = ( 
+		'language' => 'French',
+		'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
+		'path' => \@path,
+		);
+	return reading_lookup_treetagger( %opts );
 }
 
 # Closure and utility function for the package lemmatizer
@@ -328,7 +159,7 @@ sub _parse_wordform {
 
 =over
 
-=item * Handle package dependencies more gracefully
+=item * Try to do more things with Perl objects in Flemm and TT
 
 =back
 
diff --git a/t/text_tradition_language_french.t b/t/text_tradition_language_french.t
index 01b206b..84e9d02 100644
--- a/t/text_tradition_language_french.t
+++ b/t/text_tradition_language_french.t
@@ -45,7 +45,7 @@ SKIP: {
 	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
 	
 	# Try setting the normal form of a reading and re-analyzing
-	my $mr = $tf->collation->reading('99,2');
+	my $mr = $tf->collation->reading('r99.2');
 	is( $mr->text, 'minspire', "Picked correct test reading" );
 	is( $mr->language, 'French', "Reading has correct language setting" );
 	$mr->normal_form( "m'inspire" );