From: Tara L Andrews <tla\@mit.edu>
Date: Wed, 25 Apr 2012 12:40:23 +0000 (+0200)
Subject: make French morph tagging work; dependent on Flemm and TreeTagger
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=d3e7842a9402304b1b701c2a72db001b324f1f2f

make French morph tagging work; dependent on Flemm and TreeTagger
---

diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm
index 20ba383..62b50f4 100644
--- a/lib/Text/Tradition.pm
+++ b/lib/Text/Tradition.pm
@@ -356,6 +356,17 @@ sub add_stemma {
 	return $stemma;
 }
 
+sub lemmatize {
+	my $self = shift;
+	unless( $self->has_language ) {
+		warn "Please set a language to lemmatize a tradition";
+		return;
+	}
+	my $mod = "Text::Tradition::Language::" . $self->language;
+	load( $mod );
+	$mod->can( 'lemmatize' )->( $self );
+}
+
 no Moose;
 __PACKAGE__->meta->make_immutable;
 
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index 34151eb..7b26135 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -1386,6 +1386,13 @@ sub path_text {
 	$start = $self->start unless $start;
 	$end = $self->end unless $end;
 	my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
+	return _text_from_path( @path );
+}
+
+# Utility function so that we can cheat and use it when we need both the path
+# and its text.
+sub _text_from_path {
+	my( $self, @path ) = @_;
 	my $pathtext = '';
 	my $last;
 	foreach my $r ( @path ) {
diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm
index 47bd0f7..11ec36e 100644
--- a/lib/Text/Tradition/Collation/Reading.pm
+++ b/lib/Text/Tradition/Collation/Reading.pm
@@ -139,15 +139,16 @@ has 'normal_form' => (
 
 # Holds the word form. If is_disambiguated is true, the form at index zero
 # is the correct one.
-has 'lexemes' => (
+has 'reading_lexemes' => (
 	traits => ['Array'],
-	isa => 'ArrayRef[Text::Tradition::Collation::Lexeme]',
+	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
 	handles => {
 		lexemes => 'elements',
 		has_lexemes => 'count',
-		_clear_lexemes => 'clear',
-		_add_lexeme => 'push',
+		clear_lexemes => 'clear',
+		add_lexeme => 'push',
 		},
+	default => sub { [] },
 	);
 	
 ## For prefix/suffix readings
diff --git a/lib/Text/Tradition/Collation/Reading/Lexeme.pm b/lib/Text/Tradition/Collation/Reading/Lexeme.pm
index 4ea9bf3..ca4802c 100644
--- a/lib/Text/Tradition/Collation/Reading/Lexeme.pm
+++ b/lib/Text/Tradition/Collation/Reading/Lexeme.pm
@@ -76,13 +76,13 @@ has 'wordform_matchlist' => (
 		'matching_forms' => 'elements',
 		'matching_form' => 'get',
 		'add_matching_form' => 'push',
+		},
 	);
 
 has 'is_disambiguated' => (
-	is => 'ro',
+	is => 'rw',
 	isa => 'Bool',
 	default => undef,
-	writer => '_set_disambiguated',
 	);
 	
 has 'form' => (
@@ -91,6 +91,14 @@ has 'form' => (
 	writer => '_set_form',
 	);
 	
+# Do auto-disambiguation if we were created with a single wordform
+sub BUILD {
+	my $self = shift;
+
+	if( $self->matches == 1 ) {
+		$self->disambiguate( 0 );
+	}	
+}
 
 =head2 disambiguate( $index )
 
@@ -105,7 +113,7 @@ sub disambiguate {
 	throw( "There is no candidate wordform at index $idx" )
 		unless $form;
 	$self->_set_form( $form );
-	$self->_set_disambiguated( 1 );	
+	$self->is_disambiguated( 1 );	
 }
 
 =head2 lookup
diff --git a/lib/Text/Tradition/Collation/Reading/WordForm.pm b/lib/Text/Tradition/Collation/Reading/WordForm.pm
index a56cd2b..8f519f9 100644
--- a/lib/Text/Tradition/Collation/Reading/WordForm.pm
+++ b/lib/Text/Tradition/Collation/Reading/WordForm.pm
@@ -75,7 +75,7 @@ around BUILDARGS => sub {
 sub _stringify {
 	my $self = shift;
 	return sprintf( "%s//%s//%s", $self->language, $self->lemma,
-		join( '', $self->morphology ) );
+		join( '|', @{$self->morphology} ) );
 }
 
 no Moose;
diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm
index d103bc0..8028c98 100644
--- a/lib/Text/Tradition/Language/French.pm
+++ b/lib/Text/Tradition/Language/French.pm
@@ -1,7 +1,13 @@
 package Text::Tradition::Language::French;
 
-#use Flemm;
+use Encode qw/ encode_utf8 decode_utf8 /;
+use IPC::Run qw/ run binary /;
+use Module::Load;
+use Text::Tradition::Collation::Reading::Lexeme;
 use Text::Tradition::Collation::Reading::WordForm;
+use TryCatch;
+
+my $MORPHDIR = '/Users/tla/Projects/morphology';
 
 =head1 NAME
 
@@ -20,9 +26,139 @@ Evaluates the string using the Flemm package, and returns the results.
 =cut
 
 sub lemmatize {
-	my $text = shift;
-	
+	my $tradition = shift;
+
+	# Given a tradition, lemmatize it witness by witness and see what we get.
+	my $workdir = File::Temp->newdir();
+	my $c = $tradition->collation;
+	# First, clear out all existing lexemes from the readings. Save the
+	# path as long as we went to the trouble of generating it.
+	my %witness_paths;
+	foreach my $wit ( $tradition->witnesses ) {
+		my @sigla = ( $wit->sigil );
+		push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
+		foreach my $sig ( @sigla ) {
+			my @path = grep { !$_->is_meta } 
+				$c->reading_sequence( $c->start, $c->end, $sig );
+			map { $_->clear_lexemes } @path;
+			$witness_paths{$sig} = \@path;
+		}
+	}
 	
+	foreach my $sig ( keys %witness_paths ) {
+		# Get the text as a sequence of readings and as a string
+		print STDERR "Morphologizing witness $sig\n";
+		my @path = @{$witness_paths{$sig}};
+		my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
+		if( $tagresult ) {
+			# Map the tagged words onto the original readings, splitting 
+			# them up into lexemes where necessary.
+			# NOTE we can have multiple lexemes in a reading, but not
+			# multiple readings to a lexeme.
+			my @tags = split( /\n/, $tagresult );
+			my @lexemes;
+			my $curr_rdg = shift @path;
+			my @curr_lexemes;
+			my $unused_rdg_part;
+			foreach my $tag ( @tags ) {
+				# Get the original word
+				my( $lexeme, @rest ) = split( /\t/, $tag );
+				# Lemmatize the whole
+				my @forms = _parse_wordform( _flemm_lookup( $tag ) );
+				my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
+					'string' => $lexeme, 'language' => 'French',
+					'wordform_matchlist' => \@forms );
+				# Find the next non-meta reading
+				while( $curr_rdg->is_meta ) {
+					$curr_rdg = shift @path;
+				}
+				unless( $curr_rdg ) {
+					warn "Ran out of readings in sequence for " . $wit->sigil
+						. " at $lexeme";
+					last;
+				}
+				if( $unused_rdg_part &&
+					$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+					# Nth part of curr_rdg
+					$unused_rdg_part = $2;
+					push( @curr_lexemes, $lexobj );
+				} elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+					# Flag an error if there is already an unused reading part.
+					warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
+						if $unused_rdg_part;
+					$unused_rdg_part = $2; # will be empty if the whole reading matched
+					push( @curr_lexemes, $lexobj );
+				} else {
+					# We do not cope with the idea of a lexeme being 
+					# spread across multiple readings.
+					warn "Word sequence changed unexpectedly in text";
+					# See if we can find a matching reading
+					my @lookahead;
+					my $matched;
+					while( my $nr = shift @path ) {
+						if( $nr->text =~ /^\Q$lexeme\E/ ) {
+							$curr_rdg = $lookahead[-1] if @lookahead;
+							$matched = 1;
+							last;
+						} else {
+							push( @lookahead, $nr );
+						}
+					}
+					# No match? Restore the state we had
+					unless( $matched ) {
+						unshift( @path, @lookahead );
+					}
+					# Trigger a move
+					$unused_rdg_part = '';
+				}
+				
+				unless( $unused_rdg_part ) {
+					# Record the lexemes for the given reading.
+					#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
+					#	join( ' ', map { $_->string } @curr_lexemes ),
+					#	$curr_rdg->id, $curr_rdg->text );
+					_update_reading_lexemes( $curr_rdg, @curr_lexemes );
+					$curr_rdg = shift @path;
+					@curr_lexemes = ();
+				}
+			}
+		}
+	}
+}
+
+sub _update_reading_lexemes {
+	my( $reading, @lexemes ) = @_;
+	if( $reading->has_lexemes ) {
+		# We need to merge what is in @lexemes with what we have already.
+		my @oldlex = $reading->lexemes;
+		my $cmp1 = join( '||', map { $_->string } @oldlex );
+		my $cmp2 = join( '||', map { $_->string } @lexemes );
+		if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
+			# The lexeme strings are the same, so merge the possible
+			# word forms from new to old.
+			foreach my $i ( 0 .. $#lexemes ) {
+				my $ol = $oldlex[$i];
+				my $nl = $lexemes[$i];
+				my %ofw;
+				map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
+				foreach my $form ( $nl->matching_forms ) {
+					unless( $ofw{$form->_stringify} ) {
+						print STDERR "Adding form " . $form->_stringify . 
+							" to lexeme " . $nl->string . " at $reading\n";
+						$ol->add_matching_form( $form );
+						$ol->is_disambiguated(0);
+					}
+				}
+			}
+		} else {
+			$DB::single = 1;
+			warn "Lexeme layout for $reading changed; replacing the lot";
+			$reading->clear_lexemes;
+			$reading->add_lexeme( @lexemes );
+		}
+	} else {
+		$reading->add_lexeme( @lexemes );
+	}
 }
 
 =head2 word_lookup( $word )
@@ -34,9 +170,77 @@ It is better to use L<lemmatize> for context sensitivity.
 
 sub word_lookup {
 	my $word = shift;
+	my $tagresult = _treetag_string( $word );
+	my $lemmatizer;
+	try {
+		load 'Flemm';
+		$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
+	} catch {
+		warn "Cannot do French word lemmatization without Flemm: @_";
+		return;
+	}
+	return _parse_wordform( _flemm_lookup( $tagresult ) );
+}
+
+# Utility function that actually calls the tree tagger.
+sub _treetag_string {
+	my( $text ) = @_;
+	my $wittext = encode_utf8( $text );
+	# Then see if we have TreeTagger
+	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
+	unless( -f $taggercmd ) {
+		warn "Cannot do French word lemmatization without TreeTagger";
+		return;
+	}
+	# OK, we can run it then.
+	my @cmd = ( $taggercmd );
+	my( $tagresult, $err ); # Capture the output and error
+	run( \@cmd, \$wittext, \$tagresult, \$err );
+	# TODO check for error
+	return decode_utf8( $tagresult );
+}
+
+# Closure and utility function for the package lemmatizer
+{
+	my $lemmatizer;
+	
+	sub _flemm_lookup {
+		# First try to load Flemm
+		unless( $lemmatizer ) {
+			try {
+				load 'Flemm';
+				$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
+			} catch {
+				warn "Cannot do French word lemmatization without Flemm: @_";
+				return;
+			}
+		}
+		return $lemmatizer->lemmatize( @_ )
+	}
 	
 }
 
+# Utility function to turn a Flemm result into a WordForm
+sub _parse_wordform {
+	my $flemmobj = shift;
+	# For now just parse the string, until we make sense of the documentation.
+	my @results = split( / \|\| /, $flemmobj->getResult );
+	my @forms;
+	foreach ( @results ) {
+		my( $orig, $tag, $lemma ) = split( /\t/, $_ );
+		my( $pos, $morph ) = split( /:/, $tag );
+		my $wf = Text::Tradition::Collation::Reading::WordForm->new(
+			'language' => 'French',
+			'lemma' => $lemma,
+			'morphology' => [ split( //, $morph ) ],
+			);
+		push( @forms, $wf );
+	}
+	return @forms;
+}
+
+1;
+
 =head1 LICENSE
 
 This package is free software and is provided "as is" without express