use English module for lemmatizing English words, oops
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / English.pm
CommitLineData
f4b6b4d0 1package Text::Tradition::Language::English;
2
3use strict;
4use warnings;
f8862b58 5use Lingua::TagSet::TreeTagger::English;
75ae2b25 6use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
7 lfs_morph_tags /;
f4b6b4d0 8use TryCatch;
9
f4b6b4d0 10=head1 NAME
11
12Text::Tradition::Language::English - language-specific module for English
13
14=head1 DESCRIPTION
15
16Implements morphology lookup for English words in context. This module
17depends on the TreeTagger software
18(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
19(for now) expected to be installed in $MORPHDIR/TreeTagger.
20
21=head1 SUBROUTINES
22
23=head2 lemmatize( $tradition )
24
25Evaluates the given tradition using the TreeTagger, and returns the results.
26
27=begin testing
28
29binmode STDOUT, ':utf8';
30use Text::Tradition;
31use_ok( 'Text::Tradition::Language::English' );
32
33=end testing
34
35=cut
36
# Lemmatize a tradition as English: hands the tradition to
# lemmatize_treetagger() together with the language name and a callback
# that converts each TreeTagger result into a WordForm, and returns
# whatever lemmatize_treetagger() returns.
sub lemmatize {
    my( $tradition ) = @_;

    # Each tagger result is passed through unchanged to _parse_wordform.
    my $to_wordform = sub { return _parse_wordform( @_ ) };

    return lemmatize_treetagger(
        $tradition,
        'language' => 'English',
        'callback' => $to_wordform,
    );
}
45
46=head2 reading_lookup( $rdg[, $rdg, ...] )
47
48Looks up one or more readings using the TreeTagger, and returns the
49possible results. This uses the same logic as L<lemmatize> above for the
50entire tradition, but can also be used to (re-)analyze individual readings.
51
52=cut
53
# Look up morphology for the readings given as arguments: passes a copy of
# the argument list as 'path', along with the language name and the
# WordForm-building callback, to reading_lookup_treetagger() and returns
# its result.
sub reading_lookup {
    my @readings = @_;

    # Each tagger result is passed through unchanged to _parse_wordform.
    my $to_wordform = sub { return _parse_wordform( @_ ) };

    return reading_lookup_treetagger(
        'language' => 'English',
        'callback' => $to_wordform,
        'path'     => \@readings,
    );
}
63
75ae2b25 64=head2 morphology_tags
65
66Return a data structure describing the available parts of speech and their attributes.
67
68=cut
69
# Describe the available morphology tags. Takes no arguments; the answer
# is delegated entirely to lfs_morph_tags() (imported from
# Text::Tradition::Language::Base) and handed back to the caller as is.
sub morphology_tags {
    return lfs_morph_tags();
}
73
# Utility function to turn a TreeTagger result into a WordForm.
#
# Takes one result string whose tab-separated fields are the original
# word, its tag and its lemma. Returns a one-element list holding a new
# Text::Tradition::Collation::Reading::WordForm, or an empty list when
#   * no result string was given,
#   * the string has no tag field, or the tag is punctuation-only, or
#   * the tag cannot be turned into a morphology structure (a warning is
#     emitted in this case).
sub _parse_wordform {
    my $tagresult = shift;
    # Nothing to parse; return before split() is handed an undefined value.
    return () unless defined $tagresult;

    # The first field (the original word) is not needed here.
    my( undef, $tag, $lemma ) = split( /\t/, $tagresult );

    # A string without tab-separated fields leaves $tag undefined; treat it
    # like a punct-only "tag" rather than pattern-matching against undef.
    return () unless defined $tag && $tag =~ /\w/;

    my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag );
    unless( $morphobj ) {
        warn "No morphology found for word: $tagresult";
        return ();
    }

    return ( Text::Tradition::Collation::Reading::WordForm->new(
        'language' => 'English',
        'lemma' => $lemma,
        'morphology' => $morphobj,
    ) );
}
91
921;
93
94=head2 TODO
95
96=over
97
e0f6836a 98=item * Tests!
f4b6b4d0 99
100=back
101
102=head1 LICENSE
103
104This package is free software and is provided "as is" without express
105or implied warranty. You can redistribute it and/or modify it under
106the same terms as Perl itself.
107
108=head1 AUTHOR
109
110Tara L Andrews E<lt>aurum@cpan.orgE<gt>