[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Language / English.pm

package Text::Tradition::Language::English;

use strict;
use warnings;
use Lingua::TagSet::TreeTagger::English;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
	lfs_morph_tags /;
use TryCatch;

=head1 NAME

Text::Tradition::Language::English - language-specific module for English

=head1 DESCRIPTION

Implements morphology lookup for English words in context.  This module
depends on the TreeTagger software
(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
(for now) expected to be installed in $MORPHDIR/TreeTagger.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using the TreeTagger, and returns the results.

=begin testing

binmode STDOUT, ':utf8';
use Text::Tradition;
use_ok( 'Text::Tradition::Language::English' );

=end testing

=cut

sub lemmatize {
	my( $tradition ) = @_;
	# Each result handed to the callback is run through the English
	# wordform parser defined in this module.
	my $parser = sub { return _parse_wordform( @_ ) };
	return lemmatize_treetagger(
		$tradition,
		'language' => 'English',
		'callback' => $parser,
	);
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the TreeTagger, and returns the
possible results.  This uses the same logic as L</lemmatize( $text )> above
for the entire tradition, but can also be used to (re-)analyze individual
readings.

=cut

sub reading_lookup {
	# Each result handed to the callback is run through the English
	# wordform parser defined in this module.
	my $parser = sub { return _parse_wordform( @_ ) };
	return reading_lookup_treetagger(
		'language' => 'English',
		'callback' => $parser,
		# Pass a private copy of the readings we were given.
		'path' => [ @_ ],
	);
}

=head2 morphology_tags

Return a data structure describing the available parts of speech and their attributes.

=cut

sub morphology_tags {
	# Thin wrapper: the tag description comes straight from the
	# lfs_morph_tags helper imported from Text::Tradition::Language::Base;
	# nothing is added or filtered here.
	return lfs_morph_tags();
}

# Utility function to turn a TreeTagger result into a WordForm.
#
# Takes one tagger result string, which is split on tabs into three fields;
# the second is used as the part-of-speech tag and the third as the lemma.
# Returns a one-element list holding a
# Text::Tradition::Collation::Reading::WordForm, or the empty list when:
#   - the result is undefined, blank, or has no tab-separated tag field;
#   - the tag contains no word characters (punctuation-only "tags");
#   - the tag cannot be converted to a morphology structure (this case
#     also emits a warning).
sub _parse_wordform {
	my $tagresult = shift;
	# Nothing to parse; bail out before split() can warn about undef.
	return () unless defined $tagresult;
	# The first field is not needed to build the WordForm.
	my( undef, $tag, $lemma ) = split( /\t/, $tagresult );
	# A blank or tab-less line leaves $tag undefined; test definedness first
	# so the pattern match does not raise an "uninitialized value" warning.
	return () unless defined $tag && $tag =~ /\w/; # skip punct-only "tags"
	my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag );
	unless( $morphobj ) {
		warn "No morphology found for word: $tagresult";
		return ();
	}
	return ( Text::Tradition::Collation::Reading::WordForm->new(
		'language' => 'English',
		'lemma' => $lemma,
		'morphology' => $morphobj,
		) );
}

1;

=head2 TODO

=over

=item * Tests!

=back
Commit	Line	Data
f4b6b4d0	1	package Text::Tradition::Language::English;
	2
	3	use strict;
	4	use warnings;
f8862b58	5	use Lingua::TagSet::TreeTagger::English;
75ae2b25	6	use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
75ae2b25	7	lfs_morph_tags /;
f4b6b4d0	8	use TryCatch;
f4b6b4d0	9
f4b6b4d0	10	=head1 NAME
	11
	12	Text::Tradition::Language::English - language-specific module for English
	13
	14	=head1 DESCRIPTION
	15
	16	Implements morphology lookup for English words in context. This module
	17	depends on the TreeTagger software
	18	(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
	19	(for now) expected to be installed in $MORPHDIR/TreeTagger.
	20
	21	=head1 SUBROUTINES
	22
	23	=head2 lemmatize( $text )
	24
	25	Evaluates the string using the TreeTagger, and returns the results.
	26
	27	=begin testing
	28
	29	binmode STDOUT, ':utf8';
	30	use Text::Tradition;
	31	use_ok( 'Text::Tradition::Language::English' );
	32
	33	=end testing
	34
	35	=cut
	36
	37	sub lemmatize {
	38	my $tradition = shift;
e0f6836a	39	my %opts = (
5271a011	40	'language' => 'English',
e0f6836a	41	'callback' => sub { _parse_wordform( @_ ) }
	42	);
	43	return lemmatize_treetagger( $tradition, %opts );
f4b6b4d0	44	}
	45
	46	=head2 reading_lookup( $rdg[, $rdg, ...] )
	47
	48	Looks up one or more readings using the Flemm package, and returns the
	49	possible results. This uses the same logic as L<lemmatize> above for the
	50	entire tradition, but can also be used to (re-)analyze individual readings.
	51
	52	=cut
	53
	54	sub reading_lookup {
e0f6836a	55	my( @path ) = @_;
e0f6836a	56	my %opts = (
d4f6cbac	57	'language' => 'English',
e0f6836a	58	'callback' => sub { _parse_wordform( @_ ) },
	59	'path' => \@path,
	60	);
	61	return reading_lookup_treetagger( %opts );
f4b6b4d0	62	}
f4b6b4d0	63
75ae2b25	64	=head2 morphology_tags
	65
	66	Return a data structure describing the available parts of speech and their attributes.
	67
	68	=cut
	69
	70	sub morphology_tags {
	71	return lfs_morph_tags();
	72	}
	73
f4b6b4d0	74	# Utility function to turn a TreeTagger result into a WordForm
	75	sub _parse_wordform {
	76	my $tagresult = shift;
	77	my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
f8862b58	78	return () unless $tag =~ /\w/; # skip punct-only "tags"
f8862b58	79	my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag );
f4b6b4d0	80	if( $morphobj ) {
f8862b58	81	return ( Text::Tradition::Collation::Reading::WordForm->new(
f4b6b4d0	82	'language' => 'English',
	83	'lemma' => $lemma,
	84	'morphology' => $morphobj,
f8862b58	85	) );
f4b6b4d0	86	} else {
f8862b58	87	warn "No morphology found for word: $tagresult";
f8862b58	88	return ();
f4b6b4d0	89	}
	90	}
	91
	92	1;
	93
	94	=head2 TODO
	95
	96	=over
	97
e0f6836a	98	=item * Tests!
f4b6b4d0	99
	100	=back
	101