[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm

package Text::Tradition::Language::Latin;

use strict;
use warnings;
use Module::Load;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
use TryCatch;

=head1 NAME

Text::Tradition::Language::Latin - language-specific module for Latin

=head1 DESCRIPTION

Implements morphology lookup for Latin words in context.  This module
depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.

=head1 SUBROUTINES

=head2 lemmatize( $tradition )

Lemmatizes the given tradition via lemmatize_treetagger, using a
Lingua::Morph::Perseus lookup as the per-word callback, and returns the results.

=begin testing

use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );

eval "use Lingua::Morph::Perseus";
my $err = $@;

SKIP: {
	skip "Package Lingua::Morph::Perseus not found" if $err;

	my $trad = Text::Tradition->new(
		'language' => 'Latin',
		'file' => 't/data/legendfrag.xml',
		'input' => 'Self' );
	$trad->lemmatize();
	my $ambig = 0;
	foreach my $r ( $trad->collation->readings ) {
		next if $r->is_meta;
		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
		my @lex = $r->lexemes;
		my $lexstr = join( '', map { $_->string } @lex );
		my $textstr = $r->text;
		$textstr =~ s/\s+//g;
		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
		foreach my $l ( @lex ) {
			next unless $l->matches;
			next if $l->is_disambiguated;
	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
			$ambig++;
		}
	}
	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
}

=end testing

=cut

# Hand the tradition to lemmatize_treetagger, asking for Latin and supplying
# the Perseus-backed lookup as the callback for each tagger result.
sub lemmatize {
	my( $tradition ) = @_;
	my $lookup = sub { _perseus_lookup_tt( @_ ) };
	return lemmatize_treetagger(
		$tradition,
		'language' => 'Latin',
		'callback' => $lookup,
	);
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Perseus package, and returns the
possible results.  This skips the tree tagger / tokenizer, returning any
match for the word string(s) in the morphology DB.

=cut

# Look each word string up directly in the morphology DB and return the
# concatenated list of word forms found for all of them.
sub reading_lookup {
	my @found;
	foreach my $word ( @_ ) {
		push( @found, _perseus_lookup_str( $word ) );
	}
	return @found;
}

=head2 morphology_tags

Return a data structure describing the available parts of speech and their attributes.

=cut

sub morphology_tags {
	# Pure delegation: the tag description comes from lfs_morph_tags, which
	# is imported from Text::Tradition::Language::Base.
	return lfs_morph_tags();
}


{
	my $morph;
	
	# Lazily set up the Perseus connection shared by the lookup subs below.
	# On first use this loads Lingua::Morph::Perseus and stores the handle
	# returned by connect( 'Latin' ) in the enclosing lexical $morph.  If
	# loading or connecting throws, a warning carrying the underlying error
	# is emitted and $morph is left unset, so callers must check $morph
	# after calling this.  Takes no arguments; returns nothing.
	sub _morph_connect {
		return if $morph;
		try {
			load 'Lingua::Morph::Perseus';
			$morph = Lingua::Morph::Perseus->connect( 'Latin' );
		} catch ( $err ) {
			# Bind the exception explicitly: inside a TryCatch block @_ is
			# still this sub's own (empty) argument list, so interpolating
			# it would drop the actual failure reason from the warning.
			warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: $err";
			return;
		}
		return;
	}
		
	# TODO special case:
	#  passive verbs (-or)
	#  T sapientia -> sapientia
	#  T primus -> unus
	#  T occulta -> occultus (with occulo in next field, hmm...)
	#  T carne -> carnis
	#  T melius -> bonus
	
	# Lemma overrides consulted by _perseus_lookup_tt.  After the incoming
	# lemma has had its non-word characters stripped, an exact (case-sensitive)
	# match against a key here replaces it with the corresponding value before
	# the lookup is made.  A value may name several candidate lemmata
	# separated by '|'.
	my %excep = (
		'absens' => 'absum',
		'aperte' => 'apertus',
		'evolvo' => 'exvolvo',
		'inquiam' => 'inquam',
		'intelligo' => 'intellego',
		'itaque' => 'ita',
		'iuste' => 'iustus',
		'longe' => 'longus',
		'male' => 'malus|malum',
		'multum' => 'multus',
		'nec' => 'neque',
		'nos' => 'ego',
		'occultum' => 'occultus',
		'peregrinans' => 'peregrinor',
		'perfectus' => 'perficio',
		'potius' => 'potis',
		'praesente' => 'praesens',
		'prius' => 'prior',
		'quotidianus' => 'cottidianus',
		'se' => 'sui',
		'septem' => 'septimus',
		'Spiritum' => 'spiritus',
		'viriliter' => 'virilis', # TODO special case -iter?
		'vos' => 'tu',
		
		# Entries that map to more than one candidate lemma
		'datum' => 'do|data|datus',
		'forte' => 'fors|fortis',
		'vere' => 'verum|verus',
		);
		
	# Callback used by lemmatize(): look up one tagger result in the
	# Perseus lexicon.
	#
	# Argument: a single string of tab-separated fields, taken in order as
	#   the original word, the part-of-speech tag and the lemma.
	# Returns: the list of distinct WordForm objects (deduplicated on their
	#   to_string value) built from the lookup results, or an empty list /
	#   undef when no Perseus connection could be made.
	sub _perseus_lookup_tt {
		my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
		_morph_connect();
		return unless $morph;
		# Discard results that don't match the lemma, unless lemma is unknown.
		# A line with no lemma field at all is treated like an unknown lemma
		# rather than being pushed through the string operations below, which
		# would only produce "uninitialized value" warnings.
		my $lookupopts = {};
		if( defined $lemma && $lemma ne '<unknown>' && $lemma !~ /^\W+$/ ) {
			# TODO Perseus lemma might have a number on the end, yuck.
			#  multiple lemmata separated with |
			$lemma =~ s/[^\w|]//g;
			# The exception table is consulted before j -> i normalization
			$lemma = $excep{$lemma} if exists $excep{$lemma};
			$lemma =~ s/j/i/g;
			if( $lemma ) { # if we have anything left...
				# Offer each candidate both as given and lowercased
				my %lems;
				foreach my $candidate ( split( /\|/, $lemma ) ) {
					$lems{$candidate} = 1;
					$lems{lc( $candidate )} = 1;
				}
				$lookupopts->{'lemma'} = [ keys %lems ];
			}
		}
		$lookupopts->{'ttpos'} = $pos if $pos;
		
		my $result = $morph->lexicon_lookup( $orig, $lookupopts );
		# unless( !keys( %$lookupopts ) ||  $result->{'filtered'} ) {
		# 	warn "Filter on $pos / $lemma returned no results; using all results";
		# }
		my @ret = @{$result->{'objects'}};
		# Collapse rows that yield the same word form string
		my %unique_wordforms;
		foreach my $obj ( @ret ) {
			my $wf = _wordform_from_row( $obj );
			$unique_wordforms{$wf->to_string} = $wf;
		}
		return values( %unique_wordforms );
	}
	
	# Plain morphology DB lookup for a single word string, bypassing the
	# tagger.  Returns one WordForm per row found, or nothing when no
	# Perseus connection is available.
	sub _perseus_lookup_str {
		my $word = shift;
		_morph_connect();
		return unless $morph;
		my $result = $morph->lookup( $word );
		my @rows = @{$result->{'objects'}};
		my @forms;
		foreach my $row ( @rows ) {
			push( @forms, _wordform_from_row( $row ) );
		}
		return @forms;
	}
	
}

# Build a Latin WordForm from a lookup result row, taking the lemma and
# morphology from the row's accessors of the same names.
sub _wordform_from_row {
	my $row = shift;
	# Drop a trailing run of digits from an otherwise digit-free lemma
	( my $bare_lemma = $row->lemma ) =~ s/^(\D+)\d*$/$1/;
	my $form = Text::Tradition::Collation::Reading::WordForm->new(
		'language' => 'Latin',
		'lemma' => $bare_lemma,
		'morphology' => $row->morphology,
		);
	return $form;
}
	
1;
Commit	Line	Data
5271a011	1	package Text::Tradition::Language::Latin;
	2
	3	use strict;
	4	use warnings;
	5	use Module::Load;
f8862b58	6	use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
5271a011	7	use TryCatch;
	8
	9	=head1 NAME
	10
	11	Text::Tradition::Language::Latin - language-specific module for Latin
	12
	13	=head1 DESCRIPTION
	14
	15	Implements morphology lookup for French words in context. This module
f8862b58	16	depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
5271a011	17
	18	=head1 SUBROUTINES
	19
	20	=head2 lemmatize( $text )
	21
	22	Evaluates the string using the Flemm package, and returns the results.
	23
	24	=begin testing
	25
	26	use Text::Tradition;
	27	use_ok( 'Text::Tradition::Language::Latin' );
	28
f8862b58	29	eval "use Lingua::Morph::Perseus";
5271a011	30	my $err = $@;
	31
	32	SKIP: {
f8862b58	33	skip "Package Lingua::Morph::Perseus not found" if $err;
5271a011	34
	35	my $trad = Text::Tradition->new(
	36	'language' => 'Latin',
	37	'file' => 't/data/legendfrag.xml',
	38	'input' => 'Self' );
	39	$trad->lemmatize();
	40	my $ambig = 0;
	41	foreach my $r ( $trad->collation->readings ) {
	42	next if $r->is_meta;
	43	ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
	44	my @lex = $r->lexemes;
	45	my $lexstr = join( '', map { $_->string } @lex );
	46	my $textstr = $r->text;
	47	$textstr =~ s/\s+//g;
	48	is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
	49	foreach my $l ( @lex ) {
fe77efe0	50	next unless $l->matches;
5271a011	51	next if $l->is_disambiguated;
	52	printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	53	join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
	54	$ambig++;
	55	}
	56	}
f8862b58	57	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
5271a011	58	}
	59
	60	=end testing
	61
	62	=cut
	63
	64	sub lemmatize {
	65	my $tradition = shift;
	66	my %opts = (
	67	'language' => 'Latin',
	68	'callback' => sub { _perseus_lookup_tt( @_ ) }
	69	);
	70	return lemmatize_treetagger( $tradition, %opts );
	71	}
	72
	73	=head2 reading_lookup( $rdg[, $rdg, ...] )
	74
	75	Looks up one or more readings using the Perseus package, and returns the
	76	possible results. This skips the tree tagger / tokenizer, returning any
	77	match for the word string(s) in the morphology DB.
	78
	79	=cut
	80
	81	sub reading_lookup {
	82	my @words = @_;
	83	return map { _perseus_lookup_str( $_ ) } @words;
	84	}
	85
75ae2b25	86	=head2 morphology_tags
	87
	88	Return a data structure describing the available parts of speech and their attributes.
	89
	90	=cut
	91
	92	sub morphology_tags {
75ae2b25	93	return lfs_morph_tags();
	94	}
	95
	96
5271a011	97	{
	98	my $morph;
	99
	100	sub _morph_connect {
	101	unless( $morph ) {
	102	try {
f8862b58	103	load 'Lingua::Morph::Perseus';
f8862b58	104	$morph = Lingua::Morph::Perseus->connect( 'Latin' );
5271a011	105	} catch {
f8862b58	106	warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
5271a011	107	return;
	108	}
	109	}
	110	}
	111
fe77efe0	112	# TODO special case:
	113	# passive verbs (-or)
	114	# T sapientia -> sapientia
	115	# T primus -> unus
	116	# T occulta -> occultus (with occulo in next field, hmm...)
	117	# T carne -> carnis
	118	# T melius -> bonus
	119
fe77efe0	120	my %excep = (
	121	'absens' => 'absum',
	122	'aperte' => 'apertus',
	123	'evolvo' => 'exvolvo',
	124	'inquiam' => 'inquam',
	125	'intelligo' => 'intellego',
	126	'itaque' => 'ita',
	127	'iuste' => 'iustus',
	128	'longe' => 'longus',
	129	'male' => 'malus\|malum',
	130	'multum' => 'multus',
	131	'nec' => 'neque',
	132	'nos' => 'ego',
	133	'occultum' => 'occultus',
	134	'peregrinans' => 'peregrinor',
	135	'perfectus' => 'perficio',
	136	'potius' => 'potis',
	137	'praesente' => 'praesens',
	138	'prius' => 'prior',
	139	'quotidianus' => 'cottidianus',
	140	'se' => 'sui',
	141	'septem' => 'septimus',
	142	'Spiritum' => 'spiritus',
	143	'viriliter' => 'virilis', # TODO special case -iter?
	144	'vos' => 'tu',
	145
	146	'datum' => 'do\|data\|datus',
	147	'forte' => 'fors\|fortis',
	148	'vere' => 'verum\|verus',
	149	);
	150
5271a011	151	sub _perseus_lookup_tt {
	152	my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
	153	_morph_connect();
f8862b58	154	return unless $morph;
5271a011	155	# Discard results that don't match the lemma, unless lemma is unknown
f8862b58	156	my $lookupopts = {};
fe77efe0	157	unless( $lemma eq '<unknown>' \|\| $lemma =~ /^\W+$/ ) {
5271a011	158	# TODO Perseus lemma might have a number on the end, yuck.
fe77efe0	159	# multiple lemmata separated with \|
	160	$lemma =~ s/[^\w\|]//g;
	161	$lemma = $excep{$lemma} if exists $excep{$lemma};
	162	$lemma =~ s/j/i/g;
f8862b58	163	if( $lemma ) { # if we have anything left...
	164	my %lems;
	165	map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\\|/, $lemma );
	166	$lookupopts->{'lemma'} = [ keys %lems ];
	167	}
5271a011	168	}
f8862b58	169	$lookupopts->{'ttpos'} = $pos if $pos;
5271a011	170
f8862b58	171	my $result = $morph->lexicon_lookup( $orig, $lookupopts );
	172	# unless( !keys( %$lookupopts ) \|\| $result->{'filtered'} ) {
	173	# warn "Filter on $pos / $lemma returned no results; using all results";
	174	# }
	175	my @ret = @{$result->{'objects'}};
75ae2b25	176	my %unique_wordforms;
5271a011	177	foreach my $obj ( @ret ) {
75ae2b25	178	my $wf = _wordform_from_row( $obj );
75ae2b25	179	$unique_wordforms{$wf->to_string} = $wf;
5271a011	180	}
75ae2b25	181	return values( %unique_wordforms );
5271a011	182	}
	183
	184	sub _perseus_lookup_str {
	185	my( $orig ) = @_;
	186	_morph_connect();
772edba8	187	return unless $morph;
5271a011	188	# Simple morph DB lookup, and return the results.
	189	my $result = $morph->lookup( $orig );
	190	return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
	191	}
	192
5271a011	193	}
5271a011	194
fe77efe0	195	sub _wordform_from_row {
fe77efe0	196	my( $rowobj ) = @_;
fe77efe0	197	my $lemma = $rowobj->lemma;
	198	$lemma =~ s/^(\D+)\d*$/$1/;
	199	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	200	'language' => 'Latin',
	201	'lemma' => $lemma,
f8862b58	202	'morphology' => $rowobj->morphology,
fe77efe0	203	);
	204	return $wf;
	205	}
	206
5271a011	207	1;