[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm

package Text::Tradition::Language::Latin;

use strict;
use warnings;
use Module::Load;
use Morph::Perseus::Structure;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct 
	lfs_morph_tags /;
use TryCatch;

=head1 NAME

Text::Tradition::Language::Latin - language-specific module for Latin

=head1 DESCRIPTION

Implements morphology lookup for French words in context.  This module
depends on the Morph::Perseus module for access to PhiloLogic database data.
It also depends on the TreeTagger software
(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
(for now) expected to be installed in $MORPHDIR/TreeTagger.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using the Flemm package, and returns the results.

=begin testing

use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );

eval "use Morph::Perseus";
my $err = $@;

SKIP: {
	skip "Package Morph::Perseus not found" if $err;

	my $trad = Text::Tradition->new(
		'language' => 'Latin',
		'file' => 't/data/legendfrag.xml',
		'input' => 'Self' );
	$trad->lemmatize();
	my $ambig = 0;
	foreach my $r ( $trad->collation->readings ) {
		next if $r->is_meta;
		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
		my @lex = $r->lexemes;
		my $lexstr = join( '', map { $_->string } @lex );
		my $textstr = $r->text;
		$textstr =~ s/\s+//g;
		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
		foreach my $l ( @lex ) {
			next unless $l->matches;
			next if $l->is_disambiguated;
	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
			$ambig++;
		}
	}
	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
}

=end testing

=cut

sub lemmatize {
	my $tradition = shift;
	my %opts = ( 
		'language' => 'Latin', 
		'callback' => sub { _perseus_lookup_tt( @_ ) } 
		);
	return lemmatize_treetagger( $tradition, %opts );
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Perseus package, and returns the
possible results.  This skips the tree tagger / tokenizer, returning any
match for the word string(s) in the morphology DB.

=cut

sub reading_lookup {
	my @words = @_;
	return map { _perseus_lookup_str( $_ ) } @words;
}

=head2 morphology_tags

Return a data structure describing the available parts of speech and their attributes.

=cut

sub morphology_tags {
	try {
		load 'Morph::Perseus::Structure';
	} catch {
		warn "Not using Perseus Latin tags";
	}
	return lfs_morph_tags();
}


{
	my $morph;
	
	sub _morph_connect {
		unless( $morph ) {
			try {
				load 'Morph::Perseus';
				load 'Morph::Perseus::Structure';
				$morph = Morph::Perseus->connect( 'Latin' );
			} catch {
				warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
				return;
			}
		}
	}
		
	# TODO special case:
	#  passive verbs (-or)
	#  T sapientia -> sapientia
	#  T primus -> unus
	#  T occulta -> occultus (with occulo in next field, hmm...)
	#  T carne -> carnis
	#  T melius -> bonus
	

	my %excep = (
		'absens' => 'absum',
		'aperte' => 'apertus',
		'evolvo' => 'exvolvo',
		'inquiam' => 'inquam',
		'intelligo' => 'intellego',
		'itaque' => 'ita',
		'iuste' => 'iustus',
		'longe' => 'longus',
		'male' => 'malus|malum',
		'multum' => 'multus',
		'nec' => 'neque',
		'nos' => 'ego',
		'occultum' => 'occultus',
		'peregrinans' => 'peregrinor',
		'perfectus' => 'perficio',
		'potius' => 'potis',
		'praesente' => 'praesens',
		'prius' => 'prior',
		'quotidianus' => 'cottidianus',
		'se' => 'sui',
		'septem' => 'septimus',
		'Spiritum' => 'spiritus',
		'viriliter' => 'virilis', # TODO special case -iter?
		'vos' => 'tu',
		
		'datum' => 'do|data|datus',
		'forte' => 'fors|fortis',
		'vere' => 'verum|verus',
		);
		
	sub _perseus_lookup_tt {
		my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
		_morph_connect();
		my $result = $morph->lookup( $orig );
		# Discard results that don't match the lemma, unless lemma is unknown
		my @orig = @{$result->{'objects'}};
		my @ret;
		unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
			# TODO Perseus lemma might have a number on the end, yuck.
			#  multiple lemmata separated with |
			$lemma =~ s/[^\w|]//g;
			$lemma = $excep{$lemma} if exists $excep{$lemma};
			$lemma =~ s/j/i/g;
			my %lems;
			my @forms = 
			map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
			# Now match the lemmata from Treetagger to the lemmata and alt_ls 
			# from Perseus.
			@ret = grep { 
				my $x = $_->lemma; 
				$x =~ s/\d+$//;
				my $y = $_->alt_lex || '';
				$y =~ s/\d+$//;
				$lems{$x} || $lems{$y};
				} @orig;
			warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
				if @orig && !@ret;
		}
		@ret = @orig unless @ret;
		
		my %unique_wordforms;
		foreach my $obj ( @ret ) {
			my $wf = _wordform_from_row( $obj );
			$unique_wordforms{$wf->to_string} = $wf;
		}
		## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
		# Discard results that don't match the given TreeTagger POS, unless
		# that leaves zero results
# 		my $ttstruct = treetagger_struct( $pos );
# 		my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
# 		unless( @ttmatch ) {
# 			warn "TreeTagger POS $pos matched no results from Perseus for $orig";
# 			@ttmatch = @wordforms;
# 		}
# 		return @ttmatch;
		return values( %unique_wordforms );
	}
	
	sub _perseus_lookup_str {
		my( $orig ) = @_;
		_morph_connect();
		# Simple morph DB lookup, and return the results.
		my $result = $morph->lookup( $orig );
		return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
	}
	
}

sub _wordform_from_row {
	my( $rowobj ) = @_;
	my $mpstruct;
	try {
		$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
	} catch {
		warn "Could not create morphology structure from "
			. $rowobj->code . ": $!";
	}
	my $lemma = $rowobj->lemma;
	$lemma =~ s/^(\D+)\d*$/$1/;
	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
		'language' => 'Latin',
		'lemma' => $lemma,
		'morphology' => $mpstruct,
		);
	return $wf;
}
	
1;
Commit	Line	Data
5271a011	1	package Text::Tradition::Language::Latin;
	2
	3	use strict;
	4	use warnings;
	5	use Module::Load;
75ae2b25	6	use Morph::Perseus::Structure;
	7	use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct
	8	lfs_morph_tags /;
5271a011	9	use TryCatch;
	10
	11	=head1 NAME
	12
	13	Text::Tradition::Language::Latin - language-specific module for Latin
	14
	15	=head1 DESCRIPTION
	16
	17	Implements morphology lookup for French words in context. This module
	18	depends on the Morph::Perseus module for access to PhiloLogic database data.
	19	It also depends on the TreeTagger software
	20	(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
	21	(for now) expected to be installed in $MORPHDIR/TreeTagger.
	22
	23	=head1 SUBROUTINES
	24
	25	=head2 lemmatize( $text )
	26
	27	Evaluates the string using the Flemm package, and returns the results.
	28
	29	=begin testing
	30
	31	use Text::Tradition;
	32	use_ok( 'Text::Tradition::Language::Latin' );
	33
	34	eval "use Morph::Perseus";
	35	my $err = $@;
	36
	37	SKIP: {
	38	skip "Package Morph::Perseus not found" if $err;
	39
	40	my $trad = Text::Tradition->new(
	41	'language' => 'Latin',
	42	'file' => 't/data/legendfrag.xml',
	43	'input' => 'Self' );
	44	$trad->lemmatize();
	45	my $ambig = 0;
	46	foreach my $r ( $trad->collation->readings ) {
	47	next if $r->is_meta;
	48	ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
	49	my @lex = $r->lexemes;
	50	my $lexstr = join( '', map { $_->string } @lex );
	51	my $textstr = $r->text;
	52	$textstr =~ s/\s+//g;
	53	is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
	54	foreach my $l ( @lex ) {
fe77efe0	55	next unless $l->matches;
5271a011	56	next if $l->is_disambiguated;
	57	printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	58	join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
	59	$ambig++;
	60	}
	61	}
fe77efe0	62	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
5271a011	63	}
	64
	65	=end testing
	66
	67	=cut
	68
	69	sub lemmatize {
	70	my $tradition = shift;
	71	my %opts = (
	72	'language' => 'Latin',
	73	'callback' => sub { _perseus_lookup_tt( @_ ) }
	74	);
	75	return lemmatize_treetagger( $tradition, %opts );
	76	}
	77
	78	=head2 reading_lookup( $rdg[, $rdg, ...] )
	79
	80	Looks up one or more readings using the Perseus package, and returns the
	81	possible results. This skips the tree tagger / tokenizer, returning any
	82	match for the word string(s) in the morphology DB.
	83
	84	=cut
	85
	86	sub reading_lookup {
	87	my @words = @_;
	88	return map { _perseus_lookup_str( $_ ) } @words;
	89	}
	90
75ae2b25	91	=head2 morphology_tags
	92
	93	Return a data structure describing the available parts of speech and their attributes.
	94
	95	=cut
	96
	97	sub morphology_tags {
	98	try {
	99	load 'Morph::Perseus::Structure';
	100	} catch {
	101	warn "Not using Perseus Latin tags";
	102	}
	103	return lfs_morph_tags();
	104	}
	105
	106
5271a011	107	{
	108	my $morph;
	109
	110	sub _morph_connect {
	111	unless( $morph ) {
	112	try {
	113	load 'Morph::Perseus';
	114	load 'Morph::Perseus::Structure';
	115	$morph = Morph::Perseus->connect( 'Latin' );
	116	} catch {
	117	warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
	118	return;
	119	}
	120	}
	121	}
	122
fe77efe0	123	# TODO special case:
	124	# passive verbs (-or)
	125	# T sapientia -> sapientia
	126	# T primus -> unus
	127	# T occulta -> occultus (with occulo in next field, hmm...)
	128	# T carne -> carnis
	129	# T melius -> bonus
	130
	131
	132	my %excep = (
	133	'absens' => 'absum',
	134	'aperte' => 'apertus',
	135	'evolvo' => 'exvolvo',
	136	'inquiam' => 'inquam',
	137	'intelligo' => 'intellego',
	138	'itaque' => 'ita',
	139	'iuste' => 'iustus',
	140	'longe' => 'longus',
	141	'male' => 'malus\|malum',
	142	'multum' => 'multus',
	143	'nec' => 'neque',
	144	'nos' => 'ego',
	145	'occultum' => 'occultus',
	146	'peregrinans' => 'peregrinor',
	147	'perfectus' => 'perficio',
	148	'potius' => 'potis',
	149	'praesente' => 'praesens',
	150	'prius' => 'prior',
	151	'quotidianus' => 'cottidianus',
	152	'se' => 'sui',
	153	'septem' => 'septimus',
	154	'Spiritum' => 'spiritus',
	155	'viriliter' => 'virilis', # TODO special case -iter?
	156	'vos' => 'tu',
	157
	158	'datum' => 'do\|data\|datus',
	159	'forte' => 'fors\|fortis',
	160	'vere' => 'verum\|verus',
	161	);
	162
5271a011	163	sub _perseus_lookup_tt {
	164	my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
	165	_morph_connect();
	166	my $result = $morph->lookup( $orig );
	167	# Discard results that don't match the lemma, unless lemma is unknown
fe77efe0	168	my @orig = @{$result->{'objects'}};
5271a011	169	my @ret;
fe77efe0	170	unless( $lemma eq '<unknown>' \|\| $lemma =~ /^\W+$/ ) {
5271a011	171	# TODO Perseus lemma might have a number on the end, yuck.
fe77efe0	172	# multiple lemmata separated with \|
	173	$lemma =~ s/[^\w\|]//g;
	174	$lemma = $excep{$lemma} if exists $excep{$lemma};
	175	$lemma =~ s/j/i/g;
	176	my %lems;
	177	my @forms =
	178	map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\\|/, $lemma );
	179	# Now match the lemmata from Treetagger to the lemmata and alt_ls
	180	# from Perseus.
	181	@ret = grep {
	182	my $x = $_->lemma;
	183	$x =~ s/\d+$//;
	184	my $y = $_->alt_lex \|\| '';
	185	$y =~ s/\d+$//;
	186	$lems{$x} \|\| $lems{$y};
	187	} @orig;
5271a011	188	warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
fe77efe0	189	if @orig && !@ret;
5271a011	190	}
fe77efe0	191	@ret = @orig unless @ret;
5271a011	192
75ae2b25	193	my %unique_wordforms;
5271a011	194	foreach my $obj ( @ret ) {
75ae2b25	195	my $wf = _wordform_from_row( $obj );
75ae2b25	196	$unique_wordforms{$wf->to_string} = $wf;
5271a011	197	}
5271a011	198	## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
fe77efe0	199	# Discard results that don't match the given TreeTagger POS, unless
fe77efe0	200	# that leaves zero results
5271a011	201	# my $ttstruct = treetagger_struct( $pos );
	202	# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
	203	# unless( @ttmatch ) {
	204	# warn "TreeTagger POS $pos matched no results from Perseus for $orig";
	205	# @ttmatch = @wordforms;
	206	# }
	207	# return @ttmatch;
75ae2b25	208	return values( %unique_wordforms );
5271a011	209	}
	210
	211	sub _perseus_lookup_str {
	212	my( $orig ) = @_;
	213	_morph_connect();
	214	# Simple morph DB lookup, and return the results.
	215	my $result = $morph->lookup( $orig );
	216	return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
	217	}
	218
5271a011	219	}
5271a011	220
fe77efe0	221	sub _wordform_from_row {
	222	my( $rowobj ) = @_;
	223	my $mpstruct;
	224	try {
	225	$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
	226	} catch {
	227	warn "Could not create morphology structure from "
	228	. $rowobj->code . ": $!";
	229	}
fe77efe0	230	my $lemma = $rowobj->lemma;
	231	$lemma =~ s/^(\D+)\d*$/$1/;
	232	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	233	'language' => 'Latin',
	234	'lemma' => $lemma,
	235	'morphology' => $mpstruct,
	236	);
	237	return $wf;
	238	}
	239
5271a011	240	1;