add witness role for collation regularization
[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Language / Armenian.pm
CommitLineData
0ce8c0cf 1package Text::Tradition::Language::Armenian;
2
3use strict;
4use warnings;
5use Module::Load;
6use parent qw/ Text::Tradition::Language::Perseus /;
7
8=head1 NAME
9
10Text::Tradition::Language::Armenian - language-specific module for Armenian
11
12=head1 DESCRIPTION
13
14Implements morphology lookup for Armenian (Grabar) words in context. This module
15depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
16
17=head1 SUBROUTINES
18
19=head2 lemmatize( $text )
20
21Evaluates the string using Treetagger and Perseus, and returns the results.
22
307d8db9 23=head2 reading_lookup( $word )
24
25Returns a single-word morphological lookup of the given word using Perseus.
26
0ce8c0cf 27=begin testing
28
29use Text::Tradition;
30use_ok( 'Text::Tradition::Language::Armenian' );
31
32eval "use Lingua::Morph::Perseus";
33my $err = $@;
34
35SKIP: {
36 skip "No Armenian test data yet";
37
38 my $trad = Text::Tradition->new(
39 'language' => 'Armenian',
40 'file' => 't/data/legendfrag.xml',
41 'input' => 'Self' );
42 $trad->lemmatize();
43 my $ambig = 0;
44 foreach my $r ( $trad->collation->readings ) {
45 next if $r->is_meta;
46 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
47 my @lex = $r->lexemes;
48 my $lexstr = join( '', map { $_->string } @lex );
49 my $textstr = $r->text;
50 $textstr =~ s/\s+//g;
51 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
52 foreach my $l ( @lex ) {
53 next unless $l->matches;
54 next if $l->is_disambiguated;
55 printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
56 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
57 $ambig++;
58 }
59 }
60 is( $ambig, 4, "Found 4 ambiguous forms as expected" );
61}
62
63=end testing
64
65=cut
66
67our $dbhandle;
68
69sub lemmatize {
70 return __PACKAGE__->perseus_lemmatize( @_ );
71}
72
73sub reading_lookup {
74 return __PACKAGE__->perseus_reading_lookup( @_ );
75}
76
307d8db9 77=head2 regularize( $text )
78
79Returns a regularized form of the reading for the purposes of collation.
80
81=cut
82
83sub regularize {
84 my( $word ) = @_;
85 # We don't really distinguish between commas and semicolons properly
86 # in the manuscript. Make them the same.
87 $word =~ s/\./\,/g;
88
89 # Get rid of accent marks.
90 $word =~ s/՛//g;
91 # Get rid of hyphen.
92 $word =~ s/֊//g;
93 # Get rid of any backtick that falls mid-word.
94 $word =~ s/՝(.)/$1/g;
95 # Standardize ligatures.
96 $word =~ s/աւ/օ/g; # for easy vocalic comparison to ո
97 $word =~ s/և/եւ/g;
98
99 # TODO split off suspected prefix/suffix markers?
100 # Downcase the word.
101 $word = lc( $word );
102 return $word;
103}
104
0ce8c0cf 1051;
e92d4229 106