[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / English.pm

package Text::Tradition::Language::English;

use strict;
use warnings;
use Encode qw/ encode_utf8 decode_utf8 /;
use IPC::Run qw/ run /;
use Lingua::TagSet::TreeTagger;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
use TryCatch;

my $MORPHDIR = '/Users/tla/Projects/morphology';

=head1 NAME

Text::Tradition::Language::English - language-specific module for English

=head1 DESCRIPTION

Implements morphology lookup for English words in context.  This module
depends on the TreeTagger software
(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
(for now) expected to be installed in $MORPHDIR/TreeTagger.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using the TreeTagger, and returns the results.

=begin testing

binmode STDOUT, ':utf8';
use Text::Tradition;
use_ok( 'Text::Tradition::Language::English' );

=end testing

=cut

sub lemmatize {
	my $tradition = shift;

	# Given a tradition, lemmatize it witness by witness and see what we get.
	my $workdir = File::Temp->newdir();
	my $c = $tradition->collation;
	# First, clear out all existing lexemes from the readings. Save the
	# path as long as we went to the trouble of generating it.
	my %witness_paths;
	foreach my $wit ( $tradition->witnesses ) {
		my @sigla = ( $wit->sigil );
		push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
		foreach my $sig ( @sigla ) {
			my @path = grep { !$_->is_meta } 
				$c->reading_sequence( $c->start, $c->end, $sig );
			map { $_->clear_lexemes } @path;
			$witness_paths{$sig} = \@path;
		}
	}
	
	foreach my $sig ( keys %witness_paths ) {
		# Get the text as a sequence of readings and as a string
		print STDERR "Morphologizing witness $sig\n";
		_lemmatize_sequence( undef, @{$witness_paths{$sig}} );
	}
}

sub _update_reading_lexemes {
	my( $replace, $reading, @lexemes ) = @_;
	if( $reading->has_lexemes && !$replace ) {
		# We need to merge what is in @lexemes with what we have already.
		my @oldlex = $reading->lexemes;
		my $cmp1 = join( '||', map { $_->string } @oldlex );
		my $cmp2 = join( '||', map { $_->string } @lexemes );
		if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
			# The lexeme strings are the same, so merge the possible
			# word forms from new to old.
			foreach my $i ( 0 .. $#lexemes ) {
				my $ol = $oldlex[$i];
				my $nl = $lexemes[$i];
				my %ofw;
				map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
				foreach my $form ( $nl->matching_forms ) {
					unless( $ofw{$form->to_string} ) {
						print STDERR "Adding form " . $form->to_string . 
							" to lexeme " . $nl->string . " at $reading\n";
						$ol->add_matching_form( $form );
						$ol->is_disambiguated(0);
					}
				}
			}
		} else {
			warn "Lexeme layout for $reading changed; replacing the lot";
			$reading->clear_lexemes;
			$reading->add_lexeme( @lexemes );
		}
	} else {
		$reading->clear_lexemes if $replace;
		$reading->add_lexeme( @lexemes );
	}
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Flemm package, and returns the
possible results.  This uses the same logic as L<lemmatize> above for the
entire tradition, but can also be used to (re-)analyze individual readings.

=cut

sub reading_lookup {
	return _lemmatize_sequence( 1, @_ );
}

sub _lemmatize_sequence {
	my( $replace, @path ) = @_;
	my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
	if( $tagresult ) {
		# Map the tagged words onto the original readings, splitting 
		# them up into lexemes where necessary.
		# NOTE we can have multiple lexemes in a reading, but not
		# multiple readings to a lexeme.
		my @tags = split( /\n/, $tagresult );
		my @lexemes;
		my $curr_rdg = shift @path;
		my @curr_lexemes;
		my $unused_rdg_part;
		foreach my $tag ( @tags ) {
			# Get the original word
			my( $lexeme, @rest ) = split( /\t/, $tag );
			# Lemmatize the whole
			my @forms = _parse_wordform( $tag );
			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
				'string' => $lexeme, 'language' => 'English',
				'wordform_matchlist' => \@forms );
			# Find the next non-meta reading
			while( $curr_rdg && $curr_rdg->is_meta ) {
				$curr_rdg = shift @path;
			}
			unless( $curr_rdg ) {
				warn "Ran out of readings in sequence at $lexeme";
				last;
			}
			my $curr_rdg_text = $curr_rdg->has_normal_form 
				? $curr_rdg->normal_form : $curr_rdg->text;
			if( $unused_rdg_part &&
				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# Nth part of curr_rdg
				$unused_rdg_part = $2;
				push( @curr_lexemes, $lexobj );
			} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# Flag an error if there is already an unused reading part.
				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
					if $unused_rdg_part;
				$unused_rdg_part = $2; # will be empty if the whole reading matched
				push( @curr_lexemes, $lexobj );
			} else {
				# We do not cope with the idea of a lexeme being 
				# spread across multiple readings.
				warn "Word sequence changed unexpectedly in text";
				# See if we can find a matching reading
				my @lookahead;
				my $matched;
				while( my $nr = shift @path ) {
					my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
					if( $nrtext =~ /^\Q$lexeme\E/ ) {
						$curr_rdg = $lookahead[-1] if @lookahead;
						$matched = 1;
						last;
					} else {
						push( @lookahead, $nr );
					}
				}
				# No match? Restore the state we had
				unless( $matched ) {
					unshift( @path, @lookahead );
				}
				# Trigger a move
				$unused_rdg_part = '';
			}
			
			unless( $unused_rdg_part ) {
				# Record the lexemes for the given reading.
				#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
				#	join( ' ', map { $_->string } @curr_lexemes ),
				#	$curr_rdg->id, $curr_rdg->text );
				_update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
				$curr_rdg = shift @path;
				@curr_lexemes = ();
			}
		}
	}
}

# Utility function so that we can cheat and use it when we need both the path
# and its text.
sub _text_from_path {
	my( $normalize, @path ) = @_;
	my $pathtext = '';
	my $last;
	foreach my $r ( @path ) {
		unless ( $r->join_prior || !$last || $last->join_next ) {
			$pathtext .= ' ';
		} 
		$pathtext .= ( $normalize && $r->has_normal_form ) 
			? $r->normal_form : $r->text;
		$last = $r;
	}
	return $pathtext;
}

# Utility function that actually calls the tree tagger.
sub _treetag_string {
	my( $text ) = @_;
	my $wittext = encode_utf8( $text );
	# Then see if we have TreeTagger
	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english";
	unless( -f $taggercmd ) {
		warn "Cannot do English word lemmatization without TreeTagger";
		return;
	}
	# OK, we can run it then.
	my @cmd = ( $taggercmd );
	my( $tagresult, $err ); # Capture the output and error
	run( \@cmd, \$wittext, \$tagresult, \$err );
	# TODO check for error
	return decode_utf8( $tagresult );
}

# Utility function to turn a TreeTagger result into a WordForm
sub _parse_wordform {
	my $tagresult = shift;
	my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
	my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );
	if( $morphobj ) {
		return Text::Tradition::Collation::Reading::WordForm->new(
			'language' => 'English',
			'lemma' => $lemma,
			'morphology' => $morphobj,
			);
	} else {
		warn "No morphology found for word: $_";
	}
}

1;

=head2 TODO

=over

=item * Handle package dependencies more gracefully

=item * Refactor English/French use of TreeTagger into its own util package

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
f4b6b4d0	1	package Text::Tradition::Language::English;
	2
	3	use strict;
	4	use warnings;
	5	use Encode qw/ encode_utf8 decode_utf8 /;
	6	use IPC::Run qw/ run /;
	7	use Lingua::TagSet::TreeTagger;
	8	use Module::Load;
	9	use Text::Tradition::Collation::Reading::Lexeme;
	10	use Text::Tradition::Collation::Reading::WordForm;
	11	use TryCatch;
	12
	13	my $MORPHDIR = '/Users/tla/Projects/morphology';
	14
	15	=head1 NAME
	16
	17	Text::Tradition::Language::English - language-specific module for English
	18
	19	=head1 DESCRIPTION
	20
	21	Implements morphology lookup for English words in context. This module
	22	depends on the TreeTagger software
	23	(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
	24	(for now) expected to be installed in $MORPHDIR/TreeTagger.
	25
	26	=head1 SUBROUTINES
	27
	28	=head2 lemmatize( $text )
	29
	30	Evaluates the string using the TreeTagger, and returns the results.
	31
	32	=begin testing
	33
	34	binmode STDOUT, ':utf8';
	35	use Text::Tradition;
	36	use_ok( 'Text::Tradition::Language::English' );
	37
	38	=end testing
	39
	40	=cut
	41
	42	sub lemmatize {
	43	my $tradition = shift;
	44
	45	# Given a tradition, lemmatize it witness by witness and see what we get.
	46	my $workdir = File::Temp->newdir();
	47	my $c = $tradition->collation;
	48	# First, clear out all existing lexemes from the readings. Save the
	49	# path as long as we went to the trouble of generating it.
	50	my %witness_paths;
	51	foreach my $wit ( $tradition->witnesses ) {
	52	my @sigla = ( $wit->sigil );
	53	push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
	54	foreach my $sig ( @sigla ) {
	55	my @path = grep { !$_->is_meta }
	56	$c->reading_sequence( $c->start, $c->end, $sig );
	57	map { $_->clear_lexemes } @path;
	58	$witness_paths{$sig} = \@path;
	59	}
	60	}
	61
	62	foreach my $sig ( keys %witness_paths ) {
	63	# Get the text as a sequence of readings and as a string
	64	print STDERR "Morphologizing witness $sig\n";
65	_lemmatize_sequence( undef, @{$witness_paths{$sig}} );
66	}
67	}
68
69	sub _update_reading_lexemes {
70	my( $replace, $reading, @lexemes ) = @_;
71	if( $reading->has_lexemes && !$replace ) {
72	# We need to merge what is in @lexemes with what we have already.
73	my @oldlex = $reading->lexemes;
74	my $cmp1 = join( '\|\|', map { $_->string } @oldlex );
75	my $cmp2 = join( '\|\|', map { $_->string } @lexemes );
76	if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
77	# The lexeme strings are the same, so merge the possible
78	# word forms from new to old.
79	foreach my $i ( 0 .. $#lexemes ) {
80	my $ol = $oldlex[$i];
81	my $nl = $lexemes[$i];
82	my %ofw;
83	map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
84	foreach my $form ( $nl->matching_forms ) {
85	unless( $ofw{$form->to_string} ) {
86	print STDERR "Adding form " . $form->to_string .
87	" to lexeme " . $nl->string . " at $reading\n";
88	$ol->add_matching_form( $form );
89	$ol->is_disambiguated(0);
90	}
91	}
92	}
93	} else {
94	warn "Lexeme layout for $reading changed; replacing the lot";
95	$reading->clear_lexemes;
96	$reading->add_lexeme( @lexemes );
97	}
98	} else {
99	$reading->clear_lexemes if $replace;
100	$reading->add_lexeme( @lexemes );
101	}
102	}
103
104	=head2 reading_lookup( $rdg[, $rdg, ...] )
105
106	Looks up one or more readings using the Flemm package, and returns the
107	possible results. This uses the same logic as L<lemmatize> above for the
108	entire tradition, but can also be used to (re-)analyze individual readings.
109
110	=cut
111
112	sub reading_lookup {
113	return _lemmatize_sequence( 1, @_ );
114	}
115
116	sub _lemmatize_sequence {
117	my( $replace, @path ) = @_;
118	my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
119	if( $tagresult ) {
120	# Map the tagged words onto the original readings, splitting
121	# them up into lexemes where necessary.
122	# NOTE we can have multiple lexemes in a reading, but not
123	# multiple readings to a lexeme.
124	my @tags = split( /\n/, $tagresult );
125	my @lexemes;
126	my $curr_rdg = shift @path;
127	my @curr_lexemes;
128	my $unused_rdg_part;
129	foreach my $tag ( @tags ) {
130	# Get the original word
131	my( $lexeme, @rest ) = split( /\t/, $tag );
132	# Lemmatize the whole
133	my @forms = _parse_wordform( $tag );
134	my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
135	'string' => $lexeme, 'language' => 'English',
136	'wordform_matchlist' => \@forms );
137	# Find the next non-meta reading
138	while( $curr_rdg && $curr_rdg->is_meta ) {
139	$curr_rdg = shift @path;
140	}
141	unless( $curr_rdg ) {
142	warn "Ran out of readings in sequence at $lexeme";
143	last;
144	}
145	my $curr_rdg_text = $curr_rdg->has_normal_form
146	? $curr_rdg->normal_form : $curr_rdg->text;
147	if( $unused_rdg_part &&
148	$unused_rdg_part =~ /^\Q$lexeme\E(\s)(.)$/ ) {
149	# Nth part of curr_rdg
150	$unused_rdg_part = $2;
151	push( @curr_lexemes, $lexobj );
152	} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s)(.)$/ ) {
153	# Flag an error if there is already an unused reading part.
154	warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
155	if $unused_rdg_part;
156	$unused_rdg_part = $2; # will be empty if the whole reading matched
157	push( @curr_lexemes, $lexobj );
158	} else {
159	# We do not cope with the idea of a lexeme being
160	# spread across multiple readings.
161	warn "Word sequence changed unexpectedly in text";
162	# See if we can find a matching reading
163	my @lookahead;
164	my $matched;
165	while( my $nr = shift @path ) {
166	my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
167	if( $nrtext =~ /^\Q$lexeme\E/ ) {
168	$curr_rdg = $lookahead[-1] if @lookahead;
169	$matched = 1;
170	last;
171	} else {
172	push( @lookahead, $nr );
173	}
174	}
175	# No match? Restore the state we had
176	unless( $matched ) {
177	unshift( @path, @lookahead );
178	}
179	# Trigger a move
180	$unused_rdg_part = '';
181	}
182
183	unless( $unused_rdg_part ) {
184	# Record the lexemes for the given reading.
185	#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
186	# join( ' ', map { $_->string } @curr_lexemes ),
187	# $curr_rdg->id, $curr_rdg->text );
188	_update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
189	$curr_rdg = shift @path;
190	@curr_lexemes = ();
191	}
192	}
193	}
194	}
195
196	# Utility function so that we can cheat and use it when we need both the path
197	# and its text.
198	sub _text_from_path {
199	my( $normalize, @path ) = @_;
200	my $pathtext = '';
201	my $last;
202	foreach my $r ( @path ) {
203	unless ( $r->join_prior \|\| !$last \|\| $last->join_next ) {
204	$pathtext .= ' ';
205	}
206	$pathtext .= ( $normalize && $r->has_normal_form )
207	? $r->normal_form : $r->text;
208	$last = $r;
209	}
210	return $pathtext;
211	}
212
213	# Utility function that actually calls the tree tagger.
214	sub _treetag_string {
215	my( $text ) = @_;
216	my $wittext = encode_utf8( $text );
217	# Then see if we have TreeTagger
218	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english";
219	unless( -f $taggercmd ) {
220	warn "Cannot do English word lemmatization without TreeTagger";
221	return;
222	}
223	# OK, we can run it then.
224	my @cmd = ( $taggercmd );
225	my( $tagresult, $err ); # Capture the output and error
226	run( \@cmd, \$wittext, \$tagresult, \$err );
227	# TODO check for error
228	return decode_utf8( $tagresult );
229	}
230
231	# Utility function to turn a TreeTagger result into a WordForm
232	sub _parse_wordform {
233	my $tagresult = shift;
234	my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
235	my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );
236	if( $morphobj ) {
237	return Text::Tradition::Collation::Reading::WordForm->new(
238	'language' => 'English',
239	'lemma' => $lemma,
240	'morphology' => $morphobj,
241	);
242	} else {
243	warn "No morphology found for word: $_";
244	}
245	}
246
247	1;
248
249	=head2 TODO
250
251	=over
252
253	=item * Handle package dependencies more gracefully
254
255	=item * Refactor English/French use of TreeTagger into its own util package
256
257	=back
258
259	=head1 LICENSE
260
261	This package is free software and is provided "as is" without express
262	or implied warranty. You can redistribute it and/or modify it under
263	the same terms as Perl itself.
264
265	=head1 AUTHOR
266
267	Tara L Andrews E<lt>aurum@cpan.orgE<gt>