Commit | Line | Data |
f4b6b4d0 |
1 | package Text::Tradition::Language::English; |
2 | |
3 | use strict; |
4 | use warnings; |
f8862b58 |
5 | use Lingua::TagSet::TreeTagger::English; |
75ae2b25 |
6 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger |
7 | lfs_morph_tags /; |
f4b6b4d0 |
8 | use TryCatch; |
9 | |
f4b6b4d0 |
10 | =head1 NAME |
11 | |
12 | Text::Tradition::Language::English - language-specific module for English |
13 | |
14 | =head1 DESCRIPTION |
15 | |
16 | Implements morphology lookup for English words in context. This module |
17 | depends on the TreeTagger software |
18 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
19 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
20 | |
21 | =head1 SUBROUTINES |
22 | |
23 | =head2 lemmatize( $text ) |
24 | |
25 | Evaluates the string using the TreeTagger, and returns the results. |
26 | |
27 | =begin testing |
28 | |
29 | binmode STDOUT, ':utf8'; |
30 | use Text::Tradition; |
31 | use_ok( 'Text::Tradition::Language::English' ); |
32 | |
33 | =end testing |
34 | |
35 | =cut |
36 | |
# Lemmatize a whole tradition with the English settings.  All of the work
# is delegated to lemmatize_treetagger; its return value is passed back
# to the caller unchanged.
sub lemmatize {
    my( $tradition ) = @_;

    # The callback simply forwards its arguments to the English-specific
    # result parser defined further down in this file.
    my $parser = sub { _parse_wordform( @_ ) };

    return lemmatize_treetagger(
        $tradition,
        'language' => 'English',
        'callback' => $parser,
    );
}
45 | |
46 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
47 | |
48 | Looks up one or more readings using the TreeTagger, and returns the possible |
49 | results. This uses the same logic as L<lemmatize|/"lemmatize( $text )"> above for |
50 | the entire tradition, but can also be used to (re-)analyze individual readings. |
51 | |
52 | =cut |
53 | |
# Look up the given readings with the English settings.  The arguments
# are handed to reading_lookup_treetagger as the 'path' option, and that
# function's return value is passed back unchanged.
sub reading_lookup {
    # Copy the argument list so that a reference to our own array, not
    # to @_, is what gets passed along.
    my @readings = @_;

    return reading_lookup_treetagger(
        'language' => 'English',
        'callback' => sub { _parse_wordform( @_ ) },
        'path'     => \@readings,
    );
}
63 | |
75ae2b25 |
64 | =head2 morphology_tags |
65 | |
66 | Return a data structure describing the available parts of speech and their attributes. |
67 | |
68 | =cut |
69 | |
sub morphology_tags {
    # Any arguments are ignored; the description is whatever the shared
    # lfs_morph_tags helper (imported at the top of this file) returns.
    return lfs_morph_tags();
}
73 | |
f4b6b4d0 |
# Utility function to turn a TreeTagger result into a WordForm.
#
# Takes a single tab-separated result string; the second field is used as
# the tag and the third as the lemma.
#
# Returns a one-element list holding a new WordForm object, or the empty
# list when there is no usable tag or when the tag cannot be converted to
# a morphological structure (the latter case also emits a warning).
sub _parse_wordform {
    my( $tagresult ) = @_;

    # Nothing to parse; bail out before split() can warn about undef.
    return () unless defined $tagresult;

    # The first field (the original word form) is not needed here.
    my( undef, $tag, $lemma ) = split( /\t/, $tagresult );

    # skip punct-only "tags"; the defined() check covers input without
    # any tab, which would otherwise trigger an "uninitialized value"
    # warning in the pattern match.
    return () unless defined $tag && $tag =~ /\w/;

    my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag );
    unless( $morphobj ) {
        warn "No morphology found for word: $tagresult";
        return ();
    }

    return ( Text::Tradition::Collation::Reading::WordForm->new(
        'language' => 'English',
        'lemma' => $lemma,
        'morphology' => $morphobj,
    ) );
}
91 | |
92 | 1; |
93 | |
94 | =head2 TODO |
95 | |
96 | =over |
97 | |
e0f6836a |
98 | =item * Tests! |
f4b6b4d0 |
99 | |
100 | =back |
101 | |