morphology/lib/Text/Tradition/Language/Armenian.pm

   1 package Text::Tradition::Language::Armenian;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use parent qw/ Text::Tradition::Language::Perseus /;
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Language::Armenian - language-specific module for Armenian
  11
  12 =head1 DESCRIPTION
  13
  14 Implements morphology lookup for Armenian (Grabar) words in context.  This module
  15 depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
  16
  17 =head1 SUBROUTINES
  18
  19 =head2 lemmatize( $text )
  20
  21 Evaluates the string using Treetagger and Perseus, and returns the results.
  22
  23 =head2 reading_lookup( $word )
  24
  25 Returns a single-word morphological lookup of the given word using Perseus.
  26
  27 =begin testing
  28
  29 use Text::Tradition;
  30 use_ok( 'Text::Tradition::Language::Armenian' );
  31
  32 eval "use Lingua::Morph::Perseus";
  33 my $err = $@;
  34
  35 SKIP: {
  36         skip "No Armenian test data yet";
  37
  38         my $trad = Text::Tradition->new(
  39                 'language' => 'Armenian',
  40                 'file' => 't/data/legendfrag.xml',
  41                 'input' => 'Self' );
  42         $trad->lemmatize();
  43         my $ambig = 0;
  44         foreach my $r ( $trad->collation->readings ) {
  45                 next if $r->is_meta;
  46                 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
  47                 my @lex = $r->lexemes;
  48                 my $lexstr = join( '', map { $_->string } @lex );
  49                 my $textstr = $r->text;
  50                 $textstr =~ s/\s+//g;
  51                 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
  52                 foreach my $l ( @lex ) {
  53                         next unless $l->matches;
  54                         next if $l->is_disambiguated;
  55                         printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
  56                                 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
  57                         $ambig++;
  58                 }
  59         }
  60         is( $ambig, 4, "Found 4 ambiguous forms as expected" );
  61 }
  62
  63 =end testing
  64
  65 =cut
  66
  67 our $dbhandle;
  68
  69 sub lemmatize {
  70         return __PACKAGE__->perseus_lemmatize( @_ );
  71 }
  72
  73 sub reading_lookup {
  74         return __PACKAGE__->perseus_reading_lookup( @_ );
  75 }
  76
  77 =head2 regularize( $text )
  78
  79 Returns a regularized form of the reading for the purposes of collation.
  80
  81 =cut
  82
  83 sub regularize {
  84         my( $word ) = @_;
  85         # We don't really distinguish between commas and semicolons properly
  86         # in the manuscript.  Make them the same.
  87         $word =~ s/\./\,/g;
  88
  89         # Get rid of accent marks.
  90         $word =~ s/՛//g;
  91         # Get rid of hyphen.
  92         $word =~ s/֊//g;
  93         # Get rid of any backtick that falls mid-word.
  94         $word =~ s/՝(.)/$1/g;
  95         # Standardize ligatures.
  96         $word =~ s/աւ/օ/g;   # for easy vocalic comparison to ո
  97         $word =~ s/և/եւ/g;
  98
  99         # TODO split off suspected prefix/suffix markers?
 100         # Downcase the word.
 101         $word = lc( $word );
 102         return $word;
 103 }
 104
 105 1;
 106