lib/Text/Tradition/Language/Latin.pm

   1 package Text::Tradition::Language::Latin;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
   7 use TryCatch;
   8
   9 =head1 NAME
  10
  11 Text::Tradition::Language::Latin - language-specific module for Latin
  12
  13 =head1 DESCRIPTION
  14
  15 Implements morphology lookup for French words in context.  This module
  16 depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
  17
  18 =head1 SUBROUTINES
  19
  20 =head2 lemmatize( $text )
  21
  22 Evaluates the string using the Flemm package, and returns the results.
  23
  24 =begin testing
  25
  26 use Text::Tradition;
  27 use_ok( 'Text::Tradition::Language::Latin' );
  28
  29 eval "use Lingua::Morph::Perseus";
  30 my $err = $@;
  31
  32 SKIP: {
  33         skip "Package Lingua::Morph::Perseus not found" if $err;
  34
  35         my $trad = Text::Tradition->new(
  36                 'language' => 'Latin',
  37                 'file' => 't/data/legendfrag.xml',
  38                 'input' => 'Self' );
  39         $trad->lemmatize();
  40         my $ambig = 0;
  41         foreach my $r ( $trad->collation->readings ) {
  42                 next if $r->is_meta;
  43                 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
  44                 my @lex = $r->lexemes;
  45                 my $lexstr = join( '', map { $_->string } @lex );
  46                 my $textstr = $r->text;
  47                 $textstr =~ s/\s+//g;
  48                 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
  49                 foreach my $l ( @lex ) {
  50                         next unless $l->matches;
  51                         next if $l->is_disambiguated;
  52                         printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
  53                                 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
  54                         $ambig++;
  55                 }
  56         }
  57         is( $ambig, 4, "Found 4 ambiguous forms as expected" );
  58 }
  59
  60 =end testing
  61
  62 =cut
  63
  64 sub lemmatize {
  65         my $tradition = shift;
  66         my %opts = (
  67                 'language' => 'Latin',
  68                 'callback' => sub { _perseus_lookup_tt( @_ ) }
  69                 );
  70         return lemmatize_treetagger( $tradition, %opts );
  71 }
  72
  73 =head2 reading_lookup( $rdg[, $rdg, ...] )
  74
  75 Looks up one or more readings using the Perseus package, and returns the
  76 possible results.  This skips the tree tagger / tokenizer, returning any
  77 match for the word string(s) in the morphology DB.
  78
  79 =cut
  80
  81 sub reading_lookup {
  82         my @words = @_;
  83         return map { _perseus_lookup_str( $_ ) } @words;
  84 }
  85
  86 =head2 morphology_tags
  87
  88 Return a data structure describing the available parts of speech and their attributes.
  89
  90 =cut
  91
  92 sub morphology_tags {
  93         return lfs_morph_tags();
  94 }
  95
  96
  97 {
  98         my $morph;
  99
 100         sub _morph_connect {
 101                 unless( $morph ) {
 102                         try {
 103                                 load 'Lingua::Morph::Perseus';
 104                                 $morph = Lingua::Morph::Perseus->connect( 'Latin' );
 105                         } catch {
 106                                 warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
 107                                 return;
 108                         }
 109                 }
 110         }
 111
 112         # TODO special case:
 113         #  passive verbs (-or)
 114         #  T sapientia -> sapientia
 115         #  T primus -> unus
 116         #  T occulta -> occultus (with occulo in next field, hmm...)
 117         #  T carne -> carnis
 118         #  T melius -> bonus
 119
 120         my %excep = (
 121                 'absens' => 'absum',
 122                 'aperte' => 'apertus',
 123                 'evolvo' => 'exvolvo',
 124                 'inquiam' => 'inquam',
 125                 'intelligo' => 'intellego',
 126                 'itaque' => 'ita',
 127                 'iuste' => 'iustus',
 128                 'longe' => 'longus',
 129                 'male' => 'malus|malum',
 130                 'multum' => 'multus',
 131                 'nec' => 'neque',
 132                 'nos' => 'ego',
 133                 'occultum' => 'occultus',
 134                 'peregrinans' => 'peregrinor',
 135                 'perfectus' => 'perficio',
 136                 'potius' => 'potis',
 137                 'praesente' => 'praesens',
 138                 'prius' => 'prior',
 139                 'quotidianus' => 'cottidianus',
 140                 'se' => 'sui',
 141                 'septem' => 'septimus',
 142                 'Spiritum' => 'spiritus',
 143                 'viriliter' => 'virilis', # TODO special case -iter?
 144                 'vos' => 'tu',
 145
 146                 'datum' => 'do|data|datus',
 147                 'forte' => 'fors|fortis',
 148                 'vere' => 'verum|verus',
 149                 );
 150
 151         sub _perseus_lookup_tt {
 152                 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
 153                 _morph_connect();
 154                 return unless $morph;
 155                 # Discard results that don't match the lemma, unless lemma is unknown
 156                 my $lookupopts = {};
 157                 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
 158                         # TODO Perseus lemma might have a number on the end, yuck.
 159                         #  multiple lemmata separated with |
 160                         $lemma =~ s/[^\w|]//g;
 161                         $lemma = $excep{$lemma} if exists $excep{$lemma};
 162                         $lemma =~ s/j/i/g;
 163                         if( $lemma ) { # if we have anything left...
 164                                 my %lems;
 165                                 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
 166                                 $lookupopts->{'lemma'} = [ keys %lems ];
 167                         }
 168                 }
 169                 $lookupopts->{'ttpos'} = $pos if $pos;
 170
 171                 my $result = $morph->lexicon_lookup( $orig, $lookupopts );
 172                 # unless( !keys( %$lookupopts ) ||  $result->{'filtered'} ) {
 173                 #       warn "Filter on $pos / $lemma returned no results; using all results";
 174                 # }
 175                 my @ret = @{$result->{'objects'}};
 176                 my %unique_wordforms;
 177                 foreach my $obj ( @ret ) {
 178                         my $wf = _wordform_from_row( $obj );
 179                         $unique_wordforms{$wf->to_string} = $wf;
 180                 }
 181                 return values( %unique_wordforms );
 182         }
 183
 184         sub _perseus_lookup_str {
 185                 my( $orig ) = @_;
 186                 _morph_connect();
 187                 return unless $morph;
 188                 # Simple morph DB lookup, and return the results.
 189                 my $result = $morph->lookup( $orig );
 190                 return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
 191         }
 192
 193 }
 194
 195 sub _wordform_from_row {
 196         my( $rowobj ) = @_;
 197         my $lemma = $rowobj->lemma;
 198         $lemma =~ s/^(\D+)\d*$/$1/;
 199         my $wf = Text::Tradition::Collation::Reading::WordForm->new(
 200                 'language' => 'Latin',
 201                 'lemma' => $lemma,
 202                 'morphology' => $rowobj->morphology,
 203                 );
 204         return $wf;
 205 }
 206
 207 1;