[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm

package Text::Tradition::Language::French;

use strict;
use warnings;
use Module::Load qw/ load /;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct /;
use TryCatch;

=head1 NAME

Text::Tradition::Language::French - language-specific module for French

=head1 DESCRIPTION

Implements morphology lookup for French words in context.  This module
depends on the Flemm module for French lemmatization
(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
the TreeTagger software
(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
(for now) expected to be installed in $MORPHDIR/TreeTagger.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using the Flemm package, and returns the results.

=begin testing

binmode STDOUT, ':utf8';
use Text::Tradition;
use_ok( 'Text::Tradition::Language::French' );

eval "use Flemm";
my $err = $@;

SKIP: {
	skip "Package Flemm not found" if $err;
	my $tf = Text::Tradition->new(
		'input' => 'Self',
		'file' => 't/data/besoin.xml',
		'language' => 'French' );
		
	is( $tf->language, 'French', "Set language okay" );
	$tf->lemmatize();
	# Test the lemmatization. How many readings now have morphological info?
	# Do the lexemes match the reading?
	my $ambig = 0;
	foreach my $r ( $tf->collation->readings ) {
		next if $r->is_meta;
		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
		my @lex = $r->lexemes;
		my $lexstr = join( '', map { $_->string } @lex );
		my $textstr = $r->text;
		$textstr =~ s/\s+//g;
		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
		foreach my $l ( @lex ) {
			next if $l->is_disambiguated;
	# 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	# 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
			$ambig++;
		}
	}
	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
	
	# Try setting the normal form of a reading and re-analyzing
	my $mr = $tf->collation->reading('r99.2');
	is( $mr->text, 'minspire', "Picked correct test reading" );
	is( $mr->language, 'French', "Reading has correct language setting" );
	$mr->normal_form( "m'inspire" );
	$mr->lemmatize;
	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
}

=end testing

=cut

sub lemmatize {
	my $tradition = shift;
	my %opts = ( 
		'language' => 'French', 
		'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) } 
		);
	return lemmatize_treetagger( $tradition, %opts );
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Flemm package, and returns the
possible results.  This uses the same logic as L<lemmatize> above for the
entire tradition, but can also be used to (re-)analyze individual readings.

=cut

sub reading_lookup {
	my( @path ) = @_;
	my %opts = ( 
		'language' => 'French',
		'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
		'path' => \@path,
		);
	return reading_lookup_treetagger( %opts );
}

# Closure and utility function for the package lemmatizer
{
	my $lemmatizer;
	
	sub _flemm_lookup {
		# First try to load Flemm
		unless( $lemmatizer ) {
			try {
				load 'Flemm';
				$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
			} catch {
				warn "Cannot do French word lemmatization without Flemm: @_";
				return;
			}
		}
		return $lemmatizer->lemmatize( @_ )
	}
	
}

# Utility function to turn a Flemm result into a WordForm
sub _parse_wordform {
	my $flemmobj = shift;
	# For now just parse the string, until we make sense of the documentation.
	my @results = split( / \|\| /, $flemmobj->getResult );
	my @forms;
	foreach ( @results ) {
		my( $orig, $tag, $lemma ) = split( /\t/, $_ );
		my( $pos, $morph ) = split( /:/, $tag );
		my $morphobj;
		if( $morph ) {
			$morphobj = multext_struct( $morph );
		} else {
			# Use the TreeTagger info if there is no Flemm morphology.
			$morphobj = treetagger_struct( $pos );
		}
		if( $morphobj ) {
			my $wf = Text::Tradition::Collation::Reading::WordForm->new(
				'language' => 'French',
				'lemma' => $lemma,
				'morphology' => $morphobj,
				);
			push( @forms, $wf );
		} else {
			warn "No morphology found for word: $_";
		}
	}
	return @forms;
}

1;

=head2 TODO

=over

=item * Try to do more things with Perl objects in Flemm and TT

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
cca4f996	1	package Text::Tradition::Language::French;
cca4f996	2
f4b6b4d0	3	use strict;
f4b6b4d0	4	use warnings;
e0f6836a	5	use Module::Load qw/ load /;
5271a011	6	use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct /;
d3e7842a	7	use TryCatch;
d3e7842a	8
cca4f996	9	=head1 NAME
cca4f996	10
6ad2ce78	11	Text::Tradition::Language::French - language-specific module for French
cca4f996	12
	13	=head1 DESCRIPTION
	14
6ad2ce78	15	Implements morphology lookup for French words in context. This module
	16	depends on the Flemm module for French lemmatization
	17	(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
	18	the TreeTagger software
	19	(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
	20	(for now) expected to be installed in $MORPHDIR/TreeTagger.
cca4f996	21
	22	=head1 SUBROUTINES
	23
	24	=head2 lemmatize( $text )
	25
	26	Evaluates the string using the Flemm package, and returns the results.
	27
6ad2ce78	28	=begin testing
	29
	30	binmode STDOUT, ':utf8';
	31	use Text::Tradition;
	32	use_ok( 'Text::Tradition::Language::French' );
	33
	34	eval "use Flemm";
	35	my $err = $@;
	36
	37	SKIP: {
	38	skip "Package Flemm not found" if $err;
	39	my $tf = Text::Tradition->new(
	40	'input' => 'Self',
	41	'file' => 't/data/besoin.xml',
	42	'language' => 'French' );
	43
	44	is( $tf->language, 'French', "Set language okay" );
	45	$tf->lemmatize();
	46	# Test the lemmatization. How many readings now have morphological info?
	47	# Do the lexemes match the reading?
	48	my $ambig = 0;
	49	foreach my $r ( $tf->collation->readings ) {
	50	next if $r->is_meta;
	51	ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
	52	my @lex = $r->lexemes;
	53	my $lexstr = join( '', map { $_->string } @lex );
	54	my $textstr = $r->text;
	55	$textstr =~ s/\s+//g;
	56	is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
	57	foreach my $l ( @lex ) {
	58	next if $l->is_disambiguated;
	59	# printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	60	# join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
	61	$ambig++;
	62	}
	63	}
	64	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
	65
	66	# Try setting the normal form of a reading and re-analyzing
e0f6836a	67	my $mr = $tf->collation->reading('r99.2');
6ad2ce78	68	is( $mr->text, 'minspire', "Picked correct test reading" );
	69	is( $mr->language, 'French', "Reading has correct language setting" );
	70	$mr->normal_form( "m'inspire" );
	71	$mr->lemmatize;
	72	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
	73	}
	74
	75	=end testing
	76
cca4f996	77	=cut
	78
	79	sub lemmatize {
d3e7842a	80	my $tradition = shift;
e0f6836a	81	my %opts = (
	82	'language' => 'French',
	83	'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
	84	);
	85	return lemmatize_treetagger( $tradition, %opts );
cca4f996	86	}
cca4f996	87
6ad2ce78	88	=head2 reading_lookup( $rdg[, $rdg, ...] )
cca4f996	89
6ad2ce78	90	Looks up one or more readings using the Flemm package, and returns the
	91	possible results. This uses the same logic as L<lemmatize> above for the
	92	entire tradition, but can also be used to (re-)analyze individual readings.
cca4f996	93
	94	=cut
	95
6ad2ce78	96	sub reading_lookup {
e0f6836a	97	my( @path ) = @_;
	98	my %opts = (
	99	'language' => 'French',
	100	'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
	101	'path' => \@path,
	102	);
	103	return reading_lookup_treetagger( %opts );
d3e7842a	104	}
	105
	106	# Closure and utility function for the package lemmatizer
	107	{
	108	my $lemmatizer;
	109
	110	sub _flemm_lookup {
	111	# First try to load Flemm
	112	unless( $lemmatizer ) {
	113	try {
	114	load 'Flemm';
	115	$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
	116	} catch {
	117	warn "Cannot do French word lemmatization without Flemm: @_";
	118	return;
	119	}
	120	}
	121	return $lemmatizer->lemmatize( @_ )
	122	}
cca4f996	123
	124	}
	125
d3e7842a	126	# Utility function to turn a Flemm result into a WordForm
	127	sub _parse_wordform {
	128	my $flemmobj = shift;
	129	# For now just parse the string, until we make sense of the documentation.
	130	my @results = split( / \\|\\| /, $flemmobj->getResult );
	131	my @forms;
	132	foreach ( @results ) {
	133	my( $orig, $tag, $lemma ) = split( /\t/, $_ );
	134	my( $pos, $morph ) = split( /:/, $tag );
6ad2ce78	135	my $morphobj;
6ad2ce78	136	if( $morph ) {
5271a011	137	$morphobj = multext_struct( $morph );
6ad2ce78	138	} else {
6ad2ce78	139	# Use the TreeTagger info if there is no Flemm morphology.
5271a011	140	$morphobj = treetagger_struct( $pos );
6ad2ce78	141	}
	142	if( $morphobj ) {
	143	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	144	'language' => 'French',
	145	'lemma' => $lemma,
	146	'morphology' => $morphobj,
	147	);
	148	push( @forms, $wf );
	149	} else {
	150	warn "No morphology found for word: $_";
	151	}
d3e7842a	152	}
	153	return @forms;
	154	}
	155
	156	1;
	157
6ad2ce78	158	=head2 TODO
	159
	160	=over
	161
e0f6836a	162	=item * Try to do more things with Perl objects in Flemm and TT
6ad2ce78	163
	164	=back
	165
cca4f996	166	=head1 LICENSE
	167
	168	This package is free software and is provided "as is" without express
	169	or implied warranty. You can redistribute it and/or modify it under
	170	the same terms as Perl itself.
	171
	172	=head1 AUTHOR
	173
	174	Tara L Andrews E<lt>aurum@cpan.orgE<gt>