1 package Text::Tradition::Language::Perseus;
6 use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
12 Text::Tradition::Language::Perseus - base module for those languages that rely
13 on a Lingua::Morph::Perseus database.
17 Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
21 =head2 lemmatize( $text )
23 Evaluates the string using Treetagger and Perseus, and returns the results.
27 # tested in child language modules
29 sub perseus_lemmatize {
31 my $tradition = shift;
33 'language' => $tradition->language,
34 'callback' => sub { _perseus_lookup_tt( $self, @_ ) }
36 return lemmatize_treetagger( $tradition, %opts );
39 =head2 reading_lookup( $rdg[, $rdg, ...] )
41 Looks up one or more readings using the Perseus package, and returns the
42 possible results. This skips the tree tagger / tokenizer, returning any
43 match for the word string(s) in the morphology DB.
47 sub perseus_reading_lookup {
48 my( $self, @words ) = @_;
50 'language' => $self->_get_lang(),
51 'callback' => sub { _perseus_lookup_str( $self, @_ ) },
54 return reading_lookup_treetagger( %opts );
57 =head2 morphology_tags
59 Return a data structure describing the available parts of speech and their attributes.
64 return lfs_morph_tags();
69 my @parts = split( /::/, $self );
75 unless( $self::dbhandle ) {
76 my $lang = $self->_get_lang();
78 load 'Lingua::Morph::Perseus';
79 $self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
81 warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
87 sub _perseus_lookup_tt {
89 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
90 $self->_morph_connect();
91 return unless $self::dbhandle;
92 # Discard results that don't match the lemma, unless lemma is unknown
94 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
96 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
97 $lookupopts->{'lemma'} = [ keys %lems ];
99 $lookupopts->{'ttpos'} = $pos if $pos;
101 my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
102 # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
103 # warn "Filter on $pos / $lemma returned no results; using all results";
105 my @ret = @{$result->{'objects'}};
106 my %unique_wordforms;
107 foreach my $obj ( @ret ) {
108 my $wf = $self->_wordform_from_row( $obj );
109 $unique_wordforms{$wf->to_string} = $wf;
111 return values( %unique_wordforms );
114 sub _perseus_lookup_str {
116 my ( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
117 $self->_morph_connect();
118 return unless $self::dbhandle;
119 # Simple morph DB lookup, and return the results. Disregard the treetagger.
120 my $result = $self::dbhandle->lexicon_lookup( $orig );
121 return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
124 sub _wordform_from_row {
125 my( $self, $rowobj ) = @_;
126 my $lemma = $rowobj->lemma;
127 $lemma =~ s/^(\D+)\d*$/$1/;
128 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
129 'language' => $self->_get_lang(),
131 'morphology' => $rowobj->morphology,
138 =head1 ACKNOWLEDGMENTS
140 The Perseus model and its dependents (currently Latin and Greek) draws its
141 data from the ARTFL project at the University of Chicago. The module author
142 wishes to thank Helma Dik in particular for her kind assistance.