[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm

package Text::Tradition::Language::French;

use Encode qw/ encode_utf8 decode_utf8 /;
use IPC::Run qw/ run binary /;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
use TryCatch;

# Base directory of the morphology tools. _treetag_string looks for the
# TreeTagger command at "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8".
# NOTE(review): this is a hard-coded absolute path into one user's home
# directory -- confirm whether it should be made configurable.
my $MORPHDIR = '/Users/tla/Projects/morphology';

=head1 NAME

Text::Tradition::Language::French - language-specific modules for French

=head1 DESCRIPTION

Implements morphology lookup for French words in context.

=head1 SUBROUTINES

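=head2 lemmatize( $tradition )

Runs the text of each witness in the tradition through TreeTagger and
Flemm, and attaches the resulting lexemes, with their possible word forms,
to the readings of the tradition's collation.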

=cut

# Lemmatize every witness text of a tradition and record the results on its
# readings.
#
# Takes a tradition object. For each witness sigil (plus the sigil with the
# collation's ac_label appended when the witness is layered) the non-meta
# readings are collected and their existing lexemes cleared. The witness text
# is then run through TreeTagger, each tagged word is looked up with Flemm,
# and the resulting Lexeme objects are stored on the reading whose text they
# were matched against. The return value is not meaningful.
sub lemmatize {
	my $tradition = shift;

	# Given a tradition, lemmatize it witness by witness and see what we get.
	my $c = $tradition->collation;
	# First, clear out all existing lexemes from the readings. Save the
	# path as long as we went to the trouble of generating it.
	my %witness_paths;
	foreach my $wit ( $tradition->witnesses ) {
		my @sigla = ( $wit->sigil );
		push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
		foreach my $sig ( @sigla ) {
			my @path = grep { !$_->is_meta } 
				$c->reading_sequence( $c->start, $c->end, $sig );
			$_->clear_lexemes foreach @path;
			$witness_paths{$sig} = \@path;
		}
	}
	
	foreach my $sig ( keys %witness_paths ) {
		# Get the text as a sequence of readings and as a string
		print STDERR "Morphologizing witness $sig\n";
		my @path = @{$witness_paths{$sig}};
		my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
		# Nothing to map if the tagger was unavailable or produced no output.
		next unless $tagresult;

		# Map the tagged words onto the original readings, splitting 
		# them up into lexemes where necessary.
		# NOTE we can have multiple lexemes in a reading, but not
		# multiple readings to a lexeme.
		my @tags = split( /\n/, $tagresult );
		my $curr_rdg = shift @path;
		my @curr_lexemes;
		my $unused_rdg_part;
		foreach my $tag ( @tags ) {
			# Get the original word; it is the first tab-separated field.
			my( $lexeme ) = split( /\t/, $tag );
			# Lemmatize the whole
			my @forms = _parse_wordform( _flemm_lookup( $tag ) );
			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
				'string' => $lexeme, 'language' => 'French',
				'wordform_matchlist' => \@forms );
			# Find the next non-meta reading. The path may already be
			# exhausted, so test for that before calling a method on it.
			while( $curr_rdg && $curr_rdg->is_meta ) {
				$curr_rdg = shift @path;
			}
			unless( $curr_rdg ) {
				# Report the sigil we are working on; the witness object
				# itself is not in scope in this loop.
				warn "Ran out of readings in sequence for " . $sig
					. " at $lexeme";
				last;
			}
			if( $unused_rdg_part &&
				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# Nth part of curr_rdg
				$unused_rdg_part = $2;
				push( @curr_lexemes, $lexobj );
			} elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# Flag an error if there is already an unused reading part.
				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
					if $unused_rdg_part;
				$unused_rdg_part = $2; # will be empty if the whole reading matched
				push( @curr_lexemes, $lexobj );
			} else {
				# We do not cope with the idea of a lexeme being 
				# spread across multiple readings.
				warn "Word sequence changed unexpectedly in text";
				# See if we can find a matching reading
				# NOTE(review): on a match the matching reading has already
				# been shifted off @path and $lexobj is not recorded for it;
				# confirm that skipping it is the intended way to resync.
				my @lookahead;
				my $matched;
				while( my $nr = shift @path ) {
					if( $nr->text =~ /^\Q$lexeme\E/ ) {
						$curr_rdg = $lookahead[-1] if @lookahead;
						$matched = 1;
						last;
					} else {
						push( @lookahead, $nr );
					}
				}
				# No match? Restore the state we had
				unless( $matched ) {
					unshift( @path, @lookahead );
				}
				# Trigger a move
				$unused_rdg_part = '';
			}
			
			unless( $unused_rdg_part ) {
				# The current reading is used up: record its lexemes and
				# move on to the next reading.
				_update_reading_lexemes( $curr_rdg, @curr_lexemes );
				$curr_rdg = shift @path;
				@curr_lexemes = ();
			}
		}
	}
}

# Record @lexemes on $reading, merging with any lexemes it already has.
#
# - If the reading has no lexemes yet, the new ones are simply added.
# - If it has lexemes with exactly the same strings in the same order, any
#   word form of a new lexeme that the corresponding old lexeme lacks
#   (compared via _stringify) is added to the old lexeme, which is then
#   marked as not disambiguated.
# - Otherwise the old lexemes are discarded and replaced by the new ones.
sub _update_reading_lexemes {
	my( $reading, @lexemes ) = @_;
	unless( $reading->has_lexemes ) {
		$reading->add_lexeme( @lexemes );
		return;
	}

	# We need to merge what is in @lexemes with what we have already.
	my @oldlex = $reading->lexemes;
	my $cmp1 = join( '||', map { $_->string } @oldlex );
	my $cmp2 = join( '||', map { $_->string } @lexemes );
	# String comparison is required here: a numeric '==' treats every pair
	# of non-numeric strings as equal.
	unless( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
		warn "Lexeme layout for $reading changed; replacing the lot";
		$reading->clear_lexemes;
		$reading->add_lexeme( @lexemes );
		return;
	}

	# The lexeme strings are the same, so merge the possible
	# word forms from new to old.
	foreach my $i ( 0 .. $#lexemes ) {
		my $ol = $oldlex[$i];
		my $nl = $lexemes[$i];
		my %ofw;
		$ofw{$_->_stringify} = 1 foreach $ol->matching_forms;
		foreach my $form ( $nl->matching_forms ) {
			next if $ofw{$form->_stringify};
			print STDERR "Adding form " . $form->_stringify . 
				" to lexeme " . $nl->string . " at $reading\n";
			$ol->add_matching_form( $form );
			# A new candidate form means the lexeme is ambiguous again.
			$ol->is_disambiguated(0);
		}
	}
	return;
}

=head2 word_lookup( $word )

Tags a single word with TreeTagger, looks the result up using the Flemm
package, and returns the possible word forms.
It is better to use L</lemmatize> for context sensitivity.

=cut

# Look up a single word out of context.
#
# The word is tagged with TreeTagger and the tagger output is handed to
# Flemm via _flemm_lookup, which owns the shared lemmatizer instance.
# Returns the list of word forms produced by _parse_wordform, or an empty
# list if either TreeTagger or Flemm yielded no result (the helpers warn
# about the reason themselves).
sub word_lookup {
	my $word = shift;
	my $tagresult = _treetag_string( $word );
	# No tagger output means there is nothing to lemmatize.
	return unless $tagresult;
	# List assignment keeps the helper in list context, as it would be when
	# called directly in an argument list.
	my( $flemmobj ) = _flemm_lookup( $tagresult );
	return unless $flemmobj;
	return _parse_wordform( $flemmobj );
}

# Utility function that actually calls the tree tagger.
#
# Takes the text to tag as a character string, feeds it UTF-8 encoded to the
# TreeTagger French command on standard input, and returns the command's
# standard output decoded from UTF-8. Returns nothing, after a warning, if
# the command file does not exist or the run did not succeed.
sub _treetag_string {
	my( $text ) = @_;
	my $wittext = encode_utf8( $text );
	# Then see if we have TreeTagger
	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
	unless( -f $taggercmd ) {
		warn "Cannot do French word lemmatization without TreeTagger";
		return;
	}
	# OK, we can run it then.
	my @cmd = ( $taggercmd );
	my( $tagresult, $err ); # Capture the output and error
	# run() returns true only when the command exits with status 0. Output on
	# stderr alone is not treated as a failure.
	my $ok = run( \@cmd, \$wittext, \$tagresult, \$err );
	unless( $ok ) {
		my $detail = defined $err && length $err ? ": $err" : '';
		warn "TreeTagger did not run successfully" . $detail;
		return;
	}
	return unless defined $tagresult;
	return decode_utf8( $tagresult );
}

# Closure and utility function for the package lemmatizer
{
	# The Flemm instance, created on first use and then shared by every
	# later call to _flemm_lookup.
	my $lemmatizer;
	
	# Pass the arguments (TreeTagger output) to the Flemm lemmatizer and
	# return whatever its lemmatize method returns. Returns nothing, after a
	# warning, if Flemm cannot be loaded or instantiated.
	sub _flemm_lookup {
		# First try to load Flemm
		unless( $lemmatizer ) {
			try {
				load 'Flemm';
				$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
			} catch ( $err ) {
				# Report the caught error itself; @_ here would be this
				# sub's own arguments, not the exception.
				warn "Cannot do French word lemmatization without Flemm: $err";
				return;
			}
		}
		return $lemmatizer->lemmatize( @_ );
	}
	
}

# Utility function to turn a Flemm result into a WordForm
#
# Takes a Flemm result object and returns one WordForm per alternative in
# its result string. Alternatives are separated by ' || '; each consists of
# the tab-separated fields original word, tag and lemma, and the part of the
# tag after the first ':' is split into single characters to form the
# morphology list. Returns an empty list when no result object is given.
sub _parse_wordform {
	my $flemmobj = shift;
	# The lookup returns nothing when Flemm is unavailable; there are no
	# forms to build in that case.
	return unless defined $flemmobj;
	# For now just parse the string, until we make sense of the documentation.
	my $result = $flemmobj->getResult;
	return unless defined $result;
	my @forms;
	foreach my $alternative ( split( / \|\| /, $result ) ) {
		my( undef, $tag, $lemma ) = split( /\t/, $alternative );
		# Only the part after the ':' is used; a tag without one carries
		# no morphology.
		my( undef, $morph ) = split( /:/, defined $tag ? $tag : '' );
		$morph = '' unless defined $morph;
		my $wf = Text::Tradition::Collation::Reading::WordForm->new(
			'language' => 'French',
			'lemma' => $lemma,
			'morphology' => [ split( //, $morph ) ],
			);
		push( @forms, $wf );
	}
	return @forms;
}

1;

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
cca4f996	1	package Text::Tradition::Language::French;
cca4f996	2
d3e7842a	3	use Encode qw/ encode_utf8 decode_utf8 /;
	4	use IPC::Run qw/ run binary /;
	5	use Module::Load;
	6	use Text::Tradition::Collation::Reading::Lexeme;
cca4f996	7	use Text::Tradition::Collation::Reading::WordForm;
d3e7842a	8	use TryCatch;
	9
	10	my $MORPHDIR = '/Users/tla/Projects/morphology';
cca4f996	11
	12	=head1 NAME
	13
	14	Text::Tradition::Language::French - language-specific modules for French
	15
	16	=head1 DESCRIPTION
	17
	18	Implements morphology lookup for French words in context.
	19
	20	=head1 SUBROUTINES
	21
	22	=head2 lemmatize( $text )
	23
	24	Evaluates the string using the Flemm package, and returns the results.
	25
	26	=cut
	27
	28	sub lemmatize {
d3e7842a	29	my $tradition = shift;
	30
	31	# Given a tradition, lemmatize it witness by witness and see what we get.
	32	my $workdir = File::Temp->newdir();
	33	my $c = $tradition->collation;
	34	# First, clear out all existing lexemes from the readings. Save the
	35	# path as long as we went to the trouble of generating it.
	36	my %witness_paths;
	37	foreach my $wit ( $tradition->witnesses ) {
	38	my @sigla = ( $wit->sigil );
	39	push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
	40	foreach my $sig ( @sigla ) {
	41	my @path = grep { !$_->is_meta }
	42	$c->reading_sequence( $c->start, $c->end, $sig );
	43	map { $_->clear_lexemes } @path;
	44	$witness_paths{$sig} = \@path;
	45	}
	46	}
cca4f996	47
d3e7842a	48	foreach my $sig ( keys %witness_paths ) {
	49	# Get the text as a sequence of readings and as a string
	50	print STDERR "Morphologizing witness $sig\n";
	51	my @path = @{$witness_paths{$sig}};
	52	my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
	53	if( $tagresult ) {
	54	# Map the tagged words onto the original readings, splitting
	55	# them up into lexemes where necessary.
	56	# NOTE we can have multiple lexemes in a reading, but not
	57	# multiple readings to a lexeme.
	58	my @tags = split( /\n/, $tagresult );
	59	my @lexemes;
	60	my $curr_rdg = shift @path;
	61	my @curr_lexemes;
	62	my $unused_rdg_part;
	63	foreach my $tag ( @tags ) {
	64	# Get the original word
	65	my( $lexeme, @rest ) = split( /\t/, $tag );
	66	# Lemmatize the whole
	67	my @forms = _parse_wordform( _flemm_lookup( $tag ) );
	68	my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
	69	'string' => $lexeme, 'language' => 'French',
	70	'wordform_matchlist' => \@forms );
	71	# Find the next non-meta reading
	72	while( $curr_rdg->is_meta ) {
	73	$curr_rdg = shift @path;
	74	}
	75	unless( $curr_rdg ) {
	76	warn "Ran out of readings in sequence for " . $wit->sigil
	77	. " at $lexeme";
	78	last;
	79	}
	80	if( $unused_rdg_part &&
	81	$unused_rdg_part =~ /^\Q$lexeme\E(\s)(.)$/ ) {
	82	# Nth part of curr_rdg
	83	$unused_rdg_part = $2;
	84	push( @curr_lexemes, $lexobj );
	85	} elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s)(.)$/ ) {
	86	# Flag an error if there is already an unused reading part.
	87	warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
	88	if $unused_rdg_part;
	89	$unused_rdg_part = $2; # will be empty if the whole reading matched
	90	push( @curr_lexemes, $lexobj );
	91	} else {
	92	# We do not cope with the idea of a lexeme being
	93	# spread across multiple readings.
	94	warn "Word sequence changed unexpectedly in text";
	95	# See if we can find a matching reading
	96	my @lookahead;
	97	my $matched;
	98	while( my $nr = shift @path ) {
	99	if( $nr->text =~ /^\Q$lexeme\E/ ) {
	100	$curr_rdg = $lookahead[-1] if @lookahead;
	101	$matched = 1;
	102	last;
	103	} else {
	104	push( @lookahead, $nr );
	105	}
	106	}
	107	# No match? Restore the state we had
	108	unless( $matched ) {
	109	unshift( @path, @lookahead );
	110	}
	111	# Trigger a move
112	$unused_rdg_part = '';
113	}
114
115	unless( $unused_rdg_part ) {
116	# Record the lexemes for the given reading.
117	#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
118	# join( ' ', map { $_->string } @curr_lexemes ),
119	# $curr_rdg->id, $curr_rdg->text );
120	_update_reading_lexemes( $curr_rdg, @curr_lexemes );
121	$curr_rdg = shift @path;
122	@curr_lexemes = ();
123	}
124	}
125	}
126	}
127	}
128
129	sub _update_reading_lexemes {
130	my( $reading, @lexemes ) = @_;
131	if( $reading->has_lexemes ) {
132	# We need to merge what is in @lexemes with what we have already.
133	my @oldlex = $reading->lexemes;
134	my $cmp1 = join( '\|\|', map { $_->string } @oldlex );
135	my $cmp2 = join( '\|\|', map { $_->string } @lexemes );
136	if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
137	# The lexeme strings are the same, so merge the possible
138	# word forms from new to old.
139	foreach my $i ( 0 .. $#lexemes ) {
140	my $ol = $oldlex[$i];
141	my $nl = $lexemes[$i];
142	my %ofw;
143	map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
144	foreach my $form ( $nl->matching_forms ) {
145	unless( $ofw{$form->_stringify} ) {
146	print STDERR "Adding form " . $form->_stringify .
147	" to lexeme " . $nl->string . " at $reading\n";
148	$ol->add_matching_form( $form );
149	$ol->is_disambiguated(0);
150	}
151	}
152	}
153	} else {
154	$DB::single = 1;
155	warn "Lexeme layout for $reading changed; replacing the lot";
156	$reading->clear_lexemes;
157	$reading->add_lexeme( @lexemes );
158	}
159	} else {
160	$reading->add_lexeme( @lexemes );
161	}
cca4f996	162	}
	163
	164	=head2 word_lookup( $word )
	165
	166	Looks up a word using the Flemm package, and returns the possible results.
	167	It is better to use L<lemmatize> for context sensitivity.
	168
	169	=cut
	170
	171	sub word_lookup {
	172	my $word = shift;
d3e7842a	173	my $tagresult = _treetag_string( $word );
	174	my $lemmatizer;
	175	try {
	176	load 'Flemm';
	177	$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
	178	} catch {
	179	warn "Cannot do French word lemmatization without Flemm: @_";
	180	return;
	181	}
	182	return _parse_wordform( _flemm_lookup( $tagresult ) );
	183	}
	184
	185	# Utility function that actually calls the tree tagger.
	186	sub _treetag_string {
	187	my( $text ) = @_;
	188	my $wittext = encode_utf8( $text );
	189	# Then see if we have TreeTagger
	190	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
	191	unless( -f $taggercmd ) {
	192	warn "Cannot do French word lemmatization without TreeTagger";
	193	return;
	194	}
	195	# OK, we can run it then.
	196	my @cmd = ( $taggercmd );
	197	my( $tagresult, $err ); # Capture the output and error
	198	run( \@cmd, \$wittext, \$tagresult, \$err );
	199	# TODO check for error
	200	return decode_utf8( $tagresult );
	201	}
	202
	203	# Closure and utility function for the package lemmatizer
	204	{
	205	my $lemmatizer;
	206
	207	sub _flemm_lookup {
	208	# First try to load Flemm
	209	unless( $lemmatizer ) {
	210	try {
	211	load 'Flemm';
	212	$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
	213	} catch {
	214	warn "Cannot do French word lemmatization without Flemm: @_";
	215	return;
	216	}
	217	}
	218	return $lemmatizer->lemmatize( @_ )
	219	}
cca4f996	220
	221	}
	222
d3e7842a	223	# Utility function to turn a Flemm result into a WordForm
	224	sub _parse_wordform {
	225	my $flemmobj = shift;
	226	# For now just parse the string, until we make sense of the documentation.
	227	my @results = split( / \\|\\| /, $flemmobj->getResult );
	228	my @forms;
	229	foreach ( @results ) {
	230	my( $orig, $tag, $lemma ) = split( /\t/, $_ );
	231	my( $pos, $morph ) = split( /:/, $tag );
	232	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	233	'language' => 'French',
	234	'lemma' => $lemma,
	235	'morphology' => [ split( //, $morph ) ],
	236	);
	237	push( @forms, $wf );
	238	}
	239	return @forms;
	240	}
	241
	242	1;
	243
cca4f996	244	=head1 LICENSE
	245
	246	This package is free software and is provided "as is" without express
	247	or implied warranty. You can redistribute it and/or modify it under
	248	the same terms as Perl itself.
	249
	250	=head1 AUTHOR
	251
	252	Tara L Andrews E<lt>aurum@cpan.orgE<gt>