Commit | Line | Data |
f4b6b4d0 |
1 | package Text::Tradition::Language::English; |
2 | |
3 | use strict; |
4 | use warnings; |
75ae2b25 |
5 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger |
6 | lfs_morph_tags /; |
f4b6b4d0 |
7 | use TryCatch; |
8 | |
f4b6b4d0 |
9 | =head1 NAME |
10 | |
11 | Text::Tradition::Language::English - language-specific module for English |
12 | |
13 | =head1 DESCRIPTION |
14 | |
15 | Implements morphology lookup for English words in context. This module |
16 | depends on the TreeTagger software |
17 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
18 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
19 | |
20 | =head1 SUBROUTINES |
21 | |
22 | =head2 lemmatize( $tradition ) |
23 | |
24 | Evaluates the text of the given tradition using the TreeTagger, and returns the results. |
25 | |
26 | =begin testing |
27 | |
28 | binmode STDOUT, ':utf8'; |
29 | use Text::Tradition; |
30 | use_ok( 'Text::Tradition::Language::English' ); |
31 | |
32 | =end testing |
33 | |
34 | =cut |
35 | |
# Lemmatize a whole tradition as English text.
#
# Takes the tradition as its only argument and hands it to the shared
# TreeTagger driver, together with the language name and a callback that
# converts each tagger result into a word form.  Returns whatever
# lemmatize_treetagger returns.
sub lemmatize {
    my( $tradition ) = @_;

    # Each tagger result is passed through to _parse_wordform unchanged.
    my $parser = sub { _parse_wordform( @_ ) };

    return lemmatize_treetagger(
        $tradition,
        'language' => 'English',
        'callback' => $parser,
    );
}
44 | |
45 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
46 | |
47 | Looks up one or more readings using the TreeTagger, and returns the |
48 | possible results. This uses the same logic as L<lemmatize> above for the |
49 | entire tradition, but can also be used to (re-)analyze individual readings. |
50 | |
51 | =cut |
52 | |
# Look up one or more readings with the TreeTagger.
#
# Takes a list of readings (collected into @path) and passes a reference to
# that list, the language name and the word form callback on to the shared
# reading_lookup_treetagger driver.  Returns whatever that driver returns.
sub reading_lookup {
    my( @path ) = @_;
    my %opts = (
        # This is the English module, so the tagger must be asked for
        # English; it previously requested 'French' by mistake.
        'language' => 'English',
        'callback' => sub { _parse_wordform( @_ ) },
        'path'     => \@path,
    );
    return reading_lookup_treetagger( %opts );
}
62 | |
75ae2b25 |
63 | =head2 morphology_tags |
64 | |
65 | Return a data structure describing the available parts of speech and their attributes. |
66 | |
67 | =cut |
68 | |
# Return the description of the available parts of speech and their
# attributes.  This simply delegates to lfs_morph_tags, imported from
# Text::Tradition::Language::Base, and takes no arguments of its own.
sub morphology_tags {
    # Call directly in the return expression so that the caller's context
    # is passed through to lfs_morph_tags.
    return( lfs_morph_tags() );
}
72 | |
f4b6b4d0 |
# Utility function to turn a TreeTagger result into a WordForm.
#
# Takes a single tab-separated result line ("word<TAB>tag<TAB>lemma").
# Returns a Text::Tradition::Collation::Reading::WordForm when the tag can
# be converted into a morphological structure.  Otherwise it warns, naming
# the word that could not be analyzed, and returns nothing (undef in scalar
# context, the empty list in list context).
#
# NOTE(review): neither Lingua::TagSet::TreeTagger nor the WordForm class is
# loaded in this file; presumably they are loaded elsewhere -- confirm.
sub _parse_wordform {
    my $tagresult = shift;
    my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
    my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );

    unless( $morphobj ) {
        # Name the word from the tagger output; $_ is never set in this sub.
        warn "No morphology found for word: $orig";
        # Bare return, so that warn's true value is not mistaken for a result.
        return;
    }

    return Text::Tradition::Collation::Reading::WordForm->new(
        'language'   => 'English',
        'lemma'      => $lemma,
        'morphology' => $morphobj,
    );
}
88 | |
89 | 1; |
90 | |
91 | =head2 TODO |
92 | |
93 | =over |
94 | |
e0f6836a |
95 | =item * Tests! |
f4b6b4d0 |
96 | |
97 | =back |
98 | |
99 | =head1 LICENSE |
100 | |
101 | This package is free software and is provided "as is" without express |
102 | or implied warranty. You can redistribute it and/or modify it under |
103 | the same terms as Perl itself. |
104 | |
105 | =head1 AUTHOR |
106 | |
107 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |