lib/Text/Tradition/Language/Latin.pm

   1 package Text::Tradition::Language::Latin;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Morph::Perseus::Structure;
   7 use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct
   8         lfs_morph_tags /;
   9 use TryCatch;
  10
  11 =head1 NAME
  12
  13 Text::Tradition::Language::Latin - language-specific module for Latin
  14
  15 =head1 DESCRIPTION
  16
  17 Implements morphology lookup for French words in context.  This module
  18 depends on the Morph::Perseus module for access to PhiloLogic database data.
  19 It also depends on the TreeTagger software
  20 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
  21 (for now) expected to be installed in $MORPHDIR/TreeTagger.
  22
  23 =head1 SUBROUTINES
  24
  25 =head2 lemmatize( $text )
  26
  27 Evaluates the string using the Flemm package, and returns the results.
  28
  29 =begin testing
  30
  31 use Text::Tradition;
  32 use_ok( 'Text::Tradition::Language::Latin' );
  33
  34 eval "use Morph::Perseus";
  35 my $err = $@;
  36
  37 SKIP: {
  38         skip "Package Morph::Perseus not found" if $err;
  39
  40         my $trad = Text::Tradition->new(
  41                 'language' => 'Latin',
  42                 'file' => 't/data/legendfrag.xml',
  43                 'input' => 'Self' );
  44         $trad->lemmatize();
  45         my $ambig = 0;
  46         foreach my $r ( $trad->collation->readings ) {
  47                 next if $r->is_meta;
  48                 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
  49                 my @lex = $r->lexemes;
  50                 my $lexstr = join( '', map { $_->string } @lex );
  51                 my $textstr = $r->text;
  52                 $textstr =~ s/\s+//g;
  53                 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
  54                 foreach my $l ( @lex ) {
  55                         next unless $l->matches;
  56                         next if $l->is_disambiguated;
  57                         printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
  58                                 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
  59                         $ambig++;
  60                 }
  61         }
  62         is( $ambig, 7, "Found 7 ambiguous forms as expected" );
  63 }
  64
  65 =end testing
  66
  67 =cut
  68
  69 sub lemmatize {
  70         my $tradition = shift;
  71         my %opts = (
  72                 'language' => 'Latin',
  73                 'callback' => sub { _perseus_lookup_tt( @_ ) }
  74                 );
  75         return lemmatize_treetagger( $tradition, %opts );
  76 }
  77
  78 =head2 reading_lookup( $rdg[, $rdg, ...] )
  79
  80 Looks up one or more readings using the Perseus package, and returns the
  81 possible results.  This skips the tree tagger / tokenizer, returning any
  82 match for the word string(s) in the morphology DB.
  83
  84 =cut
  85
  86 sub reading_lookup {
  87         my @words = @_;
  88         return map { _perseus_lookup_str( $_ ) } @words;
  89 }
  90
  91 =head2 morphology_tags
  92
  93 Return a data structure describing the available parts of speech and their attributes.
  94
  95 =cut
  96
  97 sub morphology_tags {
  98         try {
  99                 load 'Morph::Perseus::Structure';
 100         } catch {
 101                 warn "Not using Perseus Latin tags";
 102         }
 103         return lfs_morph_tags();
 104 }
 105
 106
 107 {
 108         my $morph;
 109
 110         sub _morph_connect {
 111                 unless( $morph ) {
 112                         try {
 113                                 load 'Morph::Perseus';
 114                                 load 'Morph::Perseus::Structure';
 115                                 $morph = Morph::Perseus->connect( 'Latin' );
 116                         } catch {
 117                                 warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
 118                                 return;
 119                         }
 120                 }
 121         }
 122
 123         # TODO special case:
 124         #  passive verbs (-or)
 125         #  T sapientia -> sapientia
 126         #  T primus -> unus
 127         #  T occulta -> occultus (with occulo in next field, hmm...)
 128         #  T carne -> carnis
 129         #  T melius -> bonus
 130
 131
 132         my %excep = (
 133                 'absens' => 'absum',
 134                 'aperte' => 'apertus',
 135                 'evolvo' => 'exvolvo',
 136                 'inquiam' => 'inquam',
 137                 'intelligo' => 'intellego',
 138                 'itaque' => 'ita',
 139                 'iuste' => 'iustus',
 140                 'longe' => 'longus',
 141                 'male' => 'malus|malum',
 142                 'multum' => 'multus',
 143                 'nec' => 'neque',
 144                 'nos' => 'ego',
 145                 'occultum' => 'occultus',
 146                 'peregrinans' => 'peregrinor',
 147                 'perfectus' => 'perficio',
 148                 'potius' => 'potis',
 149                 'praesente' => 'praesens',
 150                 'prius' => 'prior',
 151                 'quotidianus' => 'cottidianus',
 152                 'se' => 'sui',
 153                 'septem' => 'septimus',
 154                 'Spiritum' => 'spiritus',
 155                 'viriliter' => 'virilis', # TODO special case -iter?
 156                 'vos' => 'tu',
 157
 158                 'datum' => 'do|data|datus',
 159                 'forte' => 'fors|fortis',
 160                 'vere' => 'verum|verus',
 161                 );
 162
 163         sub _perseus_lookup_tt {
 164                 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
 165                 _morph_connect();
 166                 my $result = $morph->lookup( $orig );
 167                 # Discard results that don't match the lemma, unless lemma is unknown
 168                 my @orig = @{$result->{'objects'}};
 169                 my @ret;
 170                 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
 171                         # TODO Perseus lemma might have a number on the end, yuck.
 172                         #  multiple lemmata separated with |
 173                         $lemma =~ s/[^\w|]//g;
 174                         $lemma = $excep{$lemma} if exists $excep{$lemma};
 175                         $lemma =~ s/j/i/g;
 176                         my %lems;
 177                         my @forms =
 178                         map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
 179                         # Now match the lemmata from Treetagger to the lemmata and alt_ls
 180                         # from Perseus.
 181                         @ret = grep {
 182                                 my $x = $_->lemma;
 183                                 $x =~ s/\d+$//;
 184                                 my $y = $_->alt_lex || '';
 185                                 $y =~ s/\d+$//;
 186                                 $lems{$x} || $lems{$y};
 187                                 } @orig;
 188                         warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
 189                                 if @orig && !@ret;
 190                 }
 191                 @ret = @orig unless @ret;
 192
 193                 my %unique_wordforms;
 194                 foreach my $obj ( @ret ) {
 195                         my $wf = _wordform_from_row( $obj );
 196                         $unique_wordforms{$wf->to_string} = $wf;
 197                 }
 198                 ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
 199                 # Discard results that don't match the given TreeTagger POS, unless
 200                 # that leaves zero results
 201 #               my $ttstruct = treetagger_struct( $pos );
 202 #               my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
 203 #               unless( @ttmatch ) {
 204 #                       warn "TreeTagger POS $pos matched no results from Perseus for $orig";
 205 #                       @ttmatch = @wordforms;
 206 #               }
 207 #               return @ttmatch;
 208                 return values( %unique_wordforms );
 209         }
 210
 211         sub _perseus_lookup_str {
 212                 my( $orig ) = @_;
 213                 _morph_connect();
 214                 # Simple morph DB lookup, and return the results.
 215                 my $result = $morph->lookup( $orig );
 216                 return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
 217         }
 218
 219 }
 220
 221 sub _wordform_from_row {
 222         my( $rowobj ) = @_;
 223         my $mpstruct;
 224         try {
 225                 $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
 226         } catch {
 227                 warn "Could not create morphology structure from "
 228                         . $rowobj->code . ": $!";
 229         }
 230         my $lemma = $rowobj->lemma;
 231         $lemma =~ s/^(\D+)\d*$/$1/;
 232         my $wf = Text::Tradition::Collation::Reading::WordForm->new(
 233                 'language' => 'Latin',
 234                 'lemma' => $lemma,
 235                 'morphology' => $mpstruct,
 236                 );
 237         return $wf;
 238 }
 239
 240 1;