[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Language / Armenian.pm

package Text::Tradition::Language::Armenian;

use strict;
use warnings;
use Module::Load;
use parent qw/ Text::Tradition::Language::Perseus /;

=head1 NAME

Text::Tradition::Language::Armenian - language-specific module for Armenian

=head1 DESCRIPTION

Implements morphology lookup for Armenian (Grabar) words in context.  This module
depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using Treetagger and Perseus, and returns the results.

=head2 reading_lookup( $word )

Returns a single-word morphological lookup of the given word using Perseus.

=begin testing

use Text::Tradition;
use_ok( 'Text::Tradition::Language::Armenian' );

eval "use Lingua::Morph::Perseus";
my $err = $@;

SKIP: {
	skip "No Armenian test data yet";

	my $trad = Text::Tradition->new(
		'language' => 'Armenian',
		'file' => 't/data/legendfrag.xml',
		'input' => 'Self' );
	$trad->lemmatize();
	my $ambig = 0;
	foreach my $r ( $trad->collation->readings ) {
		next if $r->is_meta;
		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
		my @lex = $r->lexemes;
		my $lexstr = join( '', map { $_->string } @lex );
		my $textstr = $r->text;
		$textstr =~ s/\s+//g;
		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
		foreach my $l ( @lex ) {
			next unless $l->matches;
			next if $l->is_disambiguated;
	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
			$ambig++;
		}
	}
	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
}

=end testing

=cut

our $dbhandle;

sub lemmatize {
	return __PACKAGE__->perseus_lemmatize( @_ );
}

sub reading_lookup {
	return __PACKAGE__->perseus_reading_lookup( @_ );
}

=head2 regularize( $text )

Returns a regularized form of the reading for the purposes of collation.

=cut

sub regularize {
	my( $word ) = @_;
	# We don't really distinguish between commas and semicolons properly
	# in the manuscript.  Make them the same.
	$word =~ s/\./\,/g;

	# Get rid of accent marks.
	$word =~ s/՛//g;
	# Get rid of hyphen.
	$word =~ s/֊//g;
	# Get rid of any backtick that falls mid-word.
	$word =~ s/՝(.)/$1/g;
	# Standardize ligatures.
	$word =~ s/աւ/օ/g;	# for easy vocalic comparison to ո
	$word =~ s/և/եւ/g;
	
	# TODO split off suspected prefix/suffix markers?
	# Downcase the word.
	$word = lc( $word );
	return $word;
}

1;
Commit	Line	Data
0ce8c0cf	1	package Text::Tradition::Language::Armenian;
	2
	3	use strict;
	4	use warnings;
	5	use Module::Load;
	6	use parent qw/ Text::Tradition::Language::Perseus /;
	7
	8	=head1 NAME
	9
	10	Text::Tradition::Language::Armenian - language-specific module for Armenian
	11
	12	=head1 DESCRIPTION
	13
	14	Implements morphology lookup for Armenian (Grabar) words in context. This module
	15	depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
	16
	17	=head1 SUBROUTINES
	18
	19	=head2 lemmatize( $text )
	20
	21	Evaluates the string using Treetagger and Perseus, and returns the results.
	22
307d8db9	23	=head2 reading_lookup( $word )
	24
	25	Returns a single-word morphological lookup of the given word using Perseus.
	26
0ce8c0cf	27	=begin testing
	28
	29	use Text::Tradition;
	30	use_ok( 'Text::Tradition::Language::Armenian' );
	31
	32	eval "use Lingua::Morph::Perseus";
	33	my $err = $@;
	34
	35	SKIP: {
	36	skip "No Armenian test data yet";
	37
	38	my $trad = Text::Tradition->new(
	39	'language' => 'Armenian',
	40	'file' => 't/data/legendfrag.xml',
	41	'input' => 'Self' );
	42	$trad->lemmatize();
	43	my $ambig = 0;
	44	foreach my $r ( $trad->collation->readings ) {
	45	next if $r->is_meta;
	46	ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
	47	my @lex = $r->lexemes;
	48	my $lexstr = join( '', map { $_->string } @lex );
	49	my $textstr = $r->text;
	50	$textstr =~ s/\s+//g;
	51	is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
	52	foreach my $l ( @lex ) {
	53	next unless $l->matches;
	54	next if $l->is_disambiguated;
	55	printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	56	join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
	57	$ambig++;
	58	}
	59	}
	60	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
	61	}
	62
	63	=end testing
	64
	65	=cut
	66
	67	our $dbhandle;
	68
	69	sub lemmatize {
	70	return __PACKAGE__->perseus_lemmatize( @_ );
	71	}
	72
	73	sub reading_lookup {
	74	return __PACKAGE__->perseus_reading_lookup( @_ );
	75	}
	76
307d8db9	77	=head2 regularize( $text )
	78
	79	Returns a regularized form of the reading for the purposes of collation.
	80
	81	=cut
	82
	83	sub regularize {
	84	my( $word ) = @_;
	85	# We don't really distinguish between commas and semicolons properly
	86	# in the manuscript. Make them the same.
	87	$word =~ s/\./\,/g;
	88
	89	# Get rid of accent marks.
	90	$word =~ s/՛//g;
	91	# Get rid of hyphen.
	92	$word =~ s/֊//g;
	93	# Get rid of any backtick that falls mid-word.
	94	$word =~ s/՝(.)/$1/g;
	95	# Standardize ligatures.
	96	$word =~ s/աւ/օ/g; # for easy vocalic comparison to ո
	97	$word =~ s/և/եւ/g;
	98
	99	# TODO split off suspected prefix/suffix markers?
	100	# Downcase the word.
	101	$word = lc( $word );
	102	return $word;
	103	}
	104
0ce8c0cf	105	1;
e92d4229	106