[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm

package Text::Tradition::Language::Latin;

use strict;
use warnings;
use Module::Load;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /;
use TryCatch;

=head1 NAME

Text::Tradition::Language::Latin - language-specific module for Latin

=head1 DESCRIPTION

Implements morphology lookup for French words in context.  This module
depends on the Morph::Perseus module for access to PhiloLogic database data.
It also depends on the TreeTagger software
(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
(for now) expected to be installed in $MORPHDIR/TreeTagger.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using the Flemm package, and returns the results.

=begin testing

use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );

eval "use Morph::Perseus";
my $err = $@;

SKIP: {
	skip "Package Morph::Perseus not found" if $err;

	my $trad = Text::Tradition->new(
		'language' => 'Latin',
		'file' => 't/data/legendfrag.xml',
		'input' => 'Self' );
	$trad->lemmatize();
	my $ambig = 0;
	foreach my $r ( $trad->collation->readings ) {
		next if $r->is_meta;
		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
		my @lex = $r->lexemes;
		my $lexstr = join( '', map { $_->string } @lex );
		my $textstr = $r->text;
		$textstr =~ s/\s+//g;
		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
		foreach my $l ( @lex ) {
			next unless $l->matches;
			next if $l->is_disambiguated;
	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
			$ambig++;
		}
	}
	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
}

=end testing

=cut

sub lemmatize {
	my $tradition = shift;
	my %opts = ( 
		'language' => 'Latin', 
		'callback' => sub { _perseus_lookup_tt( @_ ) } 
		);
	return lemmatize_treetagger( $tradition, %opts );
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Perseus package, and returns the
possible results.  This skips the tree tagger / tokenizer, returning any
match for the word string(s) in the morphology DB.

=cut

sub reading_lookup {
	my @words = @_;
	return map { _perseus_lookup_str( $_ ) } @words;
}

{
	my $morph;
	
	sub _morph_connect {
		unless( $morph ) {
			try {
				load 'Morph::Perseus';
				load 'Morph::Perseus::Structure';
				$morph = Morph::Perseus->connect( 'Latin' );
			} catch {
				warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
				return;
			}
		}
	}
		
	# TODO special case:
	#  passive verbs (-or)
	#  T sapientia -> sapientia
	#  T primus -> unus
	#  T occulta -> occultus (with occulo in next field, hmm...)
	#  T carne -> carnis
	#  T melius -> bonus
	

	my %excep = (
		'absens' => 'absum',
		'aperte' => 'apertus',
		'evolvo' => 'exvolvo',
		'inquiam' => 'inquam',
		'intelligo' => 'intellego',
		'itaque' => 'ita',
		'iuste' => 'iustus',
		'longe' => 'longus',
		'male' => 'malus|malum',
		'multum' => 'multus',
		'nec' => 'neque',
		'nos' => 'ego',
		'occultum' => 'occultus',
		'peregrinans' => 'peregrinor',
		'perfectus' => 'perficio',
		'potius' => 'potis',
		'praesente' => 'praesens',
		'prius' => 'prior',
		'quotidianus' => 'cottidianus',
		'se' => 'sui',
		'septem' => 'septimus',
		'Spiritum' => 'spiritus',
		'viriliter' => 'virilis', # TODO special case -iter?
		'vos' => 'tu',
		
		'datum' => 'do|data|datus',
		'forte' => 'fors|fortis',
		'vere' => 'verum|verus',
		);
		
	sub _perseus_lookup_tt {
		my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
		_morph_connect();
		my $result = $morph->lookup( $orig );
		# Discard results that don't match the lemma, unless lemma is unknown
		my @orig = @{$result->{'objects'}};
		my @ret;
		unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
			# TODO Perseus lemma might have a number on the end, yuck.
			#  multiple lemmata separated with |
			$lemma =~ s/[^\w|]//g;
			$lemma = $excep{$lemma} if exists $excep{$lemma};
			$lemma =~ s/j/i/g;
			my %lems;
			my @forms = 
			map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
			# Now match the lemmata from Treetagger to the lemmata and alt_ls 
			# from Perseus.
			@ret = grep { 
				my $x = $_->lemma; 
				$x =~ s/\d+$//;
				my $y = $_->alt_lex || '';
				$y =~ s/\d+$//;
				$lems{$x} || $lems{$y};
				} @orig;
			warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
				if @orig && !@ret;
		}
		@ret = @orig unless @ret;
		
		my @wordforms;
		foreach my $obj ( @ret ) {
			push( @wordforms, _wordform_from_row( $obj ) );
		}
		## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
		# Discard results that don't match the given TreeTagger POS, unless
		# that leaves zero results
# 		my $ttstruct = treetagger_struct( $pos );
# 		my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
# 		unless( @ttmatch ) {
# 			warn "TreeTagger POS $pos matched no results from Perseus for $orig";
# 			@ttmatch = @wordforms;
# 		}
# 		return @ttmatch;
		return @wordforms;
	}
	
	sub _perseus_lookup_str {
		my( $orig ) = @_;
		_morph_connect();
		# Simple morph DB lookup, and return the results.
		my $result = $morph->lookup( $orig );
		return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
	}
	
}

sub _wordform_from_row {
	my( $rowobj ) = @_;
	my $mpstruct;
	try {
		$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
	} catch {
		warn "Could not create morphology structure from "
			. $rowobj->code . ": $!";
	}
	$DB::single = 1 unless $mpstruct;
	my $lemma = $rowobj->lemma;
	$lemma =~ s/^(\D+)\d*$/$1/;
	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
		'language' => 'Latin',
		'lemma' => $lemma,
		'morphology' => $mpstruct,
		);
	return $wf;
}
	
1;
Commit	Line	Data
5271a011	1	package Text::Tradition::Language::Latin;
	2
	3	use strict;
	4	use warnings;
	5	use Module::Load;
	6	use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /;
	7	use TryCatch;
	8
	9	=head1 NAME
	10
	11	Text::Tradition::Language::Latin - language-specific module for Latin
	12
	13	=head1 DESCRIPTION
	14
	15	Implements morphology lookup for French words in context. This module
	16	depends on the Morph::Perseus module for access to PhiloLogic database data.
	17	It also depends on the TreeTagger software
	18	(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
	19	(for now) expected to be installed in $MORPHDIR/TreeTagger.
	20
	21	=head1 SUBROUTINES
	22
	23	=head2 lemmatize( $text )
	24
	25	Evaluates the string using the Flemm package, and returns the results.
	26
	27	=begin testing
	28
	29	use Text::Tradition;
	30	use_ok( 'Text::Tradition::Language::Latin' );
	31
	32	eval "use Morph::Perseus";
	33	my $err = $@;
	34
	35	SKIP: {
	36	skip "Package Morph::Perseus not found" if $err;
	37
	38	my $trad = Text::Tradition->new(
	39	'language' => 'Latin',
	40	'file' => 't/data/legendfrag.xml',
	41	'input' => 'Self' );
	42	$trad->lemmatize();
	43	my $ambig = 0;
	44	foreach my $r ( $trad->collation->readings ) {
	45	next if $r->is_meta;
	46	ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
	47	my @lex = $r->lexemes;
	48	my $lexstr = join( '', map { $_->string } @lex );
	49	my $textstr = $r->text;
	50	$textstr =~ s/\s+//g;
	51	is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
	52	foreach my $l ( @lex ) {
fe77efe0	53	next unless $l->matches;
5271a011	54	next if $l->is_disambiguated;
	55	printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	56	join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
	57	$ambig++;
	58	}
	59	}
fe77efe0	60	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
5271a011	61	}
	62
	63	=end testing
	64
	65	=cut
	66
	67	sub lemmatize {
	68	my $tradition = shift;
	69	my %opts = (
	70	'language' => 'Latin',
	71	'callback' => sub { _perseus_lookup_tt( @_ ) }
	72	);
	73	return lemmatize_treetagger( $tradition, %opts );
	74	}
	75
	76	=head2 reading_lookup( $rdg[, $rdg, ...] )
	77
	78	Looks up one or more readings using the Perseus package, and returns the
	79	possible results. This skips the tree tagger / tokenizer, returning any
	80	match for the word string(s) in the morphology DB.
	81
	82	=cut
	83
	84	sub reading_lookup {
	85	my @words = @_;
	86	return map { _perseus_lookup_str( $_ ) } @words;
	87	}
	88
	89	{
	90	my $morph;
	91
	92	sub _morph_connect {
	93	unless( $morph ) {
	94	try {
	95	load 'Morph::Perseus';
	96	load 'Morph::Perseus::Structure';
	97	$morph = Morph::Perseus->connect( 'Latin' );
	98	} catch {
	99	warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
	100	return;
	101	}
	102	}
	103	}
	104
fe77efe0	105	# TODO special case:
	106	# passive verbs (-or)
	107	# T sapientia -> sapientia
	108	# T primus -> unus
	109	# T occulta -> occultus (with occulo in next field, hmm...)
	110	# T carne -> carnis
	111	# T melius -> bonus
	112
	113
	114	my %excep = (
	115	'absens' => 'absum',
	116	'aperte' => 'apertus',
	117	'evolvo' => 'exvolvo',
	118	'inquiam' => 'inquam',
	119	'intelligo' => 'intellego',
	120	'itaque' => 'ita',
	121	'iuste' => 'iustus',
	122	'longe' => 'longus',
	123	'male' => 'malus\|malum',
	124	'multum' => 'multus',
	125	'nec' => 'neque',
	126	'nos' => 'ego',
	127	'occultum' => 'occultus',
	128	'peregrinans' => 'peregrinor',
	129	'perfectus' => 'perficio',
	130	'potius' => 'potis',
	131	'praesente' => 'praesens',
	132	'prius' => 'prior',
	133	'quotidianus' => 'cottidianus',
	134	'se' => 'sui',
	135	'septem' => 'septimus',
	136	'Spiritum' => 'spiritus',
	137	'viriliter' => 'virilis', # TODO special case -iter?
	138	'vos' => 'tu',
	139
	140	'datum' => 'do\|data\|datus',
	141	'forte' => 'fors\|fortis',
	142	'vere' => 'verum\|verus',
	143	);
	144
5271a011	145	sub _perseus_lookup_tt {
	146	my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
	147	_morph_connect();
	148	my $result = $morph->lookup( $orig );
	149	# Discard results that don't match the lemma, unless lemma is unknown
fe77efe0	150	my @orig = @{$result->{'objects'}};
5271a011	151	my @ret;
fe77efe0	152	unless( $lemma eq '<unknown>' \|\| $lemma =~ /^\W+$/ ) {
5271a011	153	# TODO Perseus lemma might have a number on the end, yuck.
fe77efe0	154	# multiple lemmata separated with \|
	155	$lemma =~ s/[^\w\|]//g;
	156	$lemma = $excep{$lemma} if exists $excep{$lemma};
	157	$lemma =~ s/j/i/g;
	158	my %lems;
	159	my @forms =
	160	map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\\|/, $lemma );
	161	# Now match the lemmata from Treetagger to the lemmata and alt_ls
	162	# from Perseus.
	163	@ret = grep {
	164	my $x = $_->lemma;
	165	$x =~ s/\d+$//;
	166	my $y = $_->alt_lex \|\| '';
	167	$y =~ s/\d+$//;
	168	$lems{$x} \|\| $lems{$y};
	169	} @orig;
5271a011	170	warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
fe77efe0	171	if @orig && !@ret;
5271a011	172	}
fe77efe0	173	@ret = @orig unless @ret;
5271a011	174
5271a011	175	my @wordforms;
	176	foreach my $obj ( @ret ) {
	177	push( @wordforms, _wordform_from_row( $obj ) );
	178	}
	179	## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
fe77efe0	180	# Discard results that don't match the given TreeTagger POS, unless
fe77efe0	181	# that leaves zero results
5271a011	182	# my $ttstruct = treetagger_struct( $pos );
	183	# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
	184	# unless( @ttmatch ) {
	185	# warn "TreeTagger POS $pos matched no results from Perseus for $orig";
	186	# @ttmatch = @wordforms;
	187	# }
	188	# return @ttmatch;
	189	return @wordforms;
	190	}
	191
	192	sub _perseus_lookup_str {
	193	my( $orig ) = @_;
	194	_morph_connect();
	195	# Simple morph DB lookup, and return the results.
	196	my $result = $morph->lookup( $orig );
	197	return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
	198	}
	199
5271a011	200	}
5271a011	201
fe77efe0	202	sub _wordform_from_row {
	203	my( $rowobj ) = @_;
	204	my $mpstruct;
	205	try {
	206	$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
	207	} catch {
	208	warn "Could not create morphology structure from "
	209	. $rowobj->code . ": $!";
	210	}
	211	$DB::single = 1 unless $mpstruct;
	212	my $lemma = $rowobj->lemma;
	213	$lemma =~ s/^(\D+)\d*$/$1/;
	214	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	215	'language' => 'Latin',
	216	'lemma' => $lemma,
	217	'morphology' => $mpstruct,
	218	);
	219	return $wf;
	220	}
	221
5271a011	222	1;