[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm

package Text::Tradition::Language::Latin;

use strict;
use warnings;
use Module::Load;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct 
	lfs_morph_tags /;
use TryCatch;

=head1 NAME

Text::Tradition::Language::Latin - language-specific module for Latin

=head1 DESCRIPTION

Implements morphology lookup for French words in context.  This module
depends on the Morph::Perseus module for access to PhiloLogic database data.
It also depends on the TreeTagger software
(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
(for now) expected to be installed in $MORPHDIR/TreeTagger.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using the Flemm package, and returns the results.

=begin testing

use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );

eval "use Morph::Perseus";
my $err = $@;

SKIP: {
	skip "Package Morph::Perseus not found" if $err;

	my $trad = Text::Tradition->new(
		'language' => 'Latin',
		'file' => 't/data/legendfrag.xml',
		'input' => 'Self' );
	$trad->lemmatize();
	my $ambig = 0;
	foreach my $r ( $trad->collation->readings ) {
		next if $r->is_meta;
		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
		my @lex = $r->lexemes;
		my $lexstr = join( '', map { $_->string } @lex );
		my $textstr = $r->text;
		$textstr =~ s/\s+//g;
		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
		foreach my $l ( @lex ) {
			next unless $l->matches;
			next if $l->is_disambiguated;
	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
			$ambig++;
		}
	}
	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
}

=end testing

=cut

sub lemmatize {
	my $tradition = shift;
	my %opts = ( 
		'language' => 'Latin', 
		'callback' => sub { _perseus_lookup_tt( @_ ) } 
		);
	return lemmatize_treetagger( $tradition, %opts );
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Perseus package, and returns the
possible results.  This skips the tree tagger / tokenizer, returning any
match for the word string(s) in the morphology DB.

=cut

sub reading_lookup {
	my @words = @_;
	return map { _perseus_lookup_str( $_ ) } @words;
}

=head2 morphology_tags

Return a data structure describing the available parts of speech and their attributes.

=cut

sub morphology_tags {
	try {
		load 'Morph::Perseus::Structure';
	} catch {
		warn "Not using Perseus Latin tags";
	}
	return lfs_morph_tags();
}


{
	my $morph;
	
	sub _morph_connect {
		unless( $morph ) {
			try {
				load 'Morph::Perseus';
				load 'Morph::Perseus::Structure';
				$morph = Morph::Perseus->connect( 'Latin' );
			} catch {
				warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
				return;
			}
		}
	}
		
	# TODO special case:
	#  passive verbs (-or)
	#  T sapientia -> sapientia
	#  T primus -> unus
	#  T occulta -> occultus (with occulo in next field, hmm...)
	#  T carne -> carnis
	#  T melius -> bonus
	

	my %excep = (
		'absens' => 'absum',
		'aperte' => 'apertus',
		'evolvo' => 'exvolvo',
		'inquiam' => 'inquam',
		'intelligo' => 'intellego',
		'itaque' => 'ita',
		'iuste' => 'iustus',
		'longe' => 'longus',
		'male' => 'malus|malum',
		'multum' => 'multus',
		'nec' => 'neque',
		'nos' => 'ego',
		'occultum' => 'occultus',
		'peregrinans' => 'peregrinor',
		'perfectus' => 'perficio',
		'potius' => 'potis',
		'praesente' => 'praesens',
		'prius' => 'prior',
		'quotidianus' => 'cottidianus',
		'se' => 'sui',
		'septem' => 'septimus',
		'Spiritum' => 'spiritus',
		'viriliter' => 'virilis', # TODO special case -iter?
		'vos' => 'tu',
		
		'datum' => 'do|data|datus',
		'forte' => 'fors|fortis',
		'vere' => 'verum|verus',
		);
		
	sub _perseus_lookup_tt {
		my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
		_morph_connect();
		return unlesss $morph;
		my $result = $morph->lookup( $orig );
		# Discard results that don't match the lemma, unless lemma is unknown
		my @orig = @{$result->{'objects'}};
		my @ret;
		unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
			# TODO Perseus lemma might have a number on the end, yuck.
			#  multiple lemmata separated with |
			$lemma =~ s/[^\w|]//g;
			$lemma = $excep{$lemma} if exists $excep{$lemma};
			$lemma =~ s/j/i/g;
			my %lems;
			my @forms = 
			map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
			# Now match the lemmata from Treetagger to the lemmata and alt_ls 
			# from Perseus.
			@ret = grep { 
				my $x = $_->lemma; 
				$x =~ s/\d+$//;
				my $y = $_->alt_lex || '';
				$y =~ s/\d+$//;
				$lems{$x} || $lems{$y};
				} @orig;
			warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
				if @orig && !@ret;
		}
		@ret = @orig unless @ret;
		
		my %unique_wordforms;
		foreach my $obj ( @ret ) {
			my $wf = _wordform_from_row( $obj );
			$unique_wordforms{$wf->to_string} = $wf;
		}
		## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
		# Discard results that don't match the given TreeTagger POS, unless
		# that leaves zero results
# 		my $ttstruct = treetagger_struct( $pos );
# 		my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
# 		unless( @ttmatch ) {
# 			warn "TreeTagger POS $pos matched no results from Perseus for $orig";
# 			@ttmatch = @wordforms;
# 		}
# 		return @ttmatch;
		return values( %unique_wordforms );
	}
	
	sub _perseus_lookup_str {
		my( $orig ) = @_;
		_morph_connect();
		return unless $morph;
		# Simple morph DB lookup, and return the results.
		my $result = $morph->lookup( $orig );
		return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
	}
	
}

sub _wordform_from_row {
	my( $rowobj ) = @_;
	my $mpstruct;
	try {
		# M::P::St will be loaded already if we got here
		$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
	} catch {
		warn "Could not create morphology structure from "
			. $rowobj->code . ": $!";
	}
	my $lemma = $rowobj->lemma;
	$lemma =~ s/^(\D+)\d*$/$1/;
	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
		'language' => 'Latin',
		'lemma' => $lemma,
		'morphology' => $mpstruct,
		);
	return $wf;
}
	
1;
Commit	Line	Data
5271a011	1	package Text::Tradition::Language::Latin;
	2
	3	use strict;
	4	use warnings;
	5	use Module::Load;
75ae2b25	6	use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct
75ae2b25	7	lfs_morph_tags /;
5271a011	8	use TryCatch;
	9
	10	=head1 NAME
	11
	12	Text::Tradition::Language::Latin - language-specific module for Latin
	13
	14	=head1 DESCRIPTION
	15
	16	Implements morphology lookup for French words in context. This module
	17	depends on the Morph::Perseus module for access to PhiloLogic database data.
	18	It also depends on the TreeTagger software
	19	(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
	20	(for now) expected to be installed in $MORPHDIR/TreeTagger.
	21
	22	=head1 SUBROUTINES
	23
	24	=head2 lemmatize( $text )
	25
	26	Evaluates the string using the Flemm package, and returns the results.
	27
	28	=begin testing
	29
	30	use Text::Tradition;
	31	use_ok( 'Text::Tradition::Language::Latin' );
	32
	33	eval "use Morph::Perseus";
	34	my $err = $@;
	35
	36	SKIP: {
	37	skip "Package Morph::Perseus not found" if $err;
	38
	39	my $trad = Text::Tradition->new(
	40	'language' => 'Latin',
	41	'file' => 't/data/legendfrag.xml',
	42	'input' => 'Self' );
	43	$trad->lemmatize();
	44	my $ambig = 0;
	45	foreach my $r ( $trad->collation->readings ) {
	46	next if $r->is_meta;
	47	ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
	48	my @lex = $r->lexemes;
	49	my $lexstr = join( '', map { $_->string } @lex );
	50	my $textstr = $r->text;
	51	$textstr =~ s/\s+//g;
	52	is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
	53	foreach my $l ( @lex ) {
fe77efe0	54	next unless $l->matches;
5271a011	55	next if $l->is_disambiguated;
	56	printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	57	join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
	58	$ambig++;
	59	}
	60	}
fe77efe0	61	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
5271a011	62	}
	63
	64	=end testing
	65
	66	=cut
	67
	68	sub lemmatize {
	69	my $tradition = shift;
	70	my %opts = (
	71	'language' => 'Latin',
	72	'callback' => sub { _perseus_lookup_tt( @_ ) }
	73	);
	74	return lemmatize_treetagger( $tradition, %opts );
	75	}
	76
	77	=head2 reading_lookup( $rdg[, $rdg, ...] )
	78
	79	Looks up one or more readings using the Perseus package, and returns the
	80	possible results. This skips the tree tagger / tokenizer, returning any
	81	match for the word string(s) in the morphology DB.
	82
	83	=cut
	84
	85	sub reading_lookup {
	86	my @words = @_;
	87	return map { _perseus_lookup_str( $_ ) } @words;
	88	}
	89
75ae2b25	90	=head2 morphology_tags
	91
	92	Return a data structure describing the available parts of speech and their attributes.
	93
	94	=cut
	95
	96	sub morphology_tags {
	97	try {
	98	load 'Morph::Perseus::Structure';
	99	} catch {
	100	warn "Not using Perseus Latin tags";
	101	}
	102	return lfs_morph_tags();
	103	}
	104
	105
5271a011	106	{
	107	my $morph;
	108
	109	sub _morph_connect {
	110	unless( $morph ) {
	111	try {
	112	load 'Morph::Perseus';
	113	load 'Morph::Perseus::Structure';
	114	$morph = Morph::Perseus->connect( 'Latin' );
	115	} catch {
	116	warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
	117	return;
	118	}
	119	}
	120	}
	121
fe77efe0	122	# TODO special case:
	123	# passive verbs (-or)
	124	# T sapientia -> sapientia
	125	# T primus -> unus
	126	# T occulta -> occultus (with occulo in next field, hmm...)
	127	# T carne -> carnis
	128	# T melius -> bonus
	129
	130
	131	my %excep = (
	132	'absens' => 'absum',
	133	'aperte' => 'apertus',
	134	'evolvo' => 'exvolvo',
	135	'inquiam' => 'inquam',
	136	'intelligo' => 'intellego',
	137	'itaque' => 'ita',
	138	'iuste' => 'iustus',
	139	'longe' => 'longus',
	140	'male' => 'malus\|malum',
	141	'multum' => 'multus',
	142	'nec' => 'neque',
	143	'nos' => 'ego',
	144	'occultum' => 'occultus',
	145	'peregrinans' => 'peregrinor',
	146	'perfectus' => 'perficio',
	147	'potius' => 'potis',
	148	'praesente' => 'praesens',
	149	'prius' => 'prior',
	150	'quotidianus' => 'cottidianus',
	151	'se' => 'sui',
	152	'septem' => 'septimus',
	153	'Spiritum' => 'spiritus',
	154	'viriliter' => 'virilis', # TODO special case -iter?
	155	'vos' => 'tu',
	156
	157	'datum' => 'do\|data\|datus',
	158	'forte' => 'fors\|fortis',
	159	'vere' => 'verum\|verus',
	160	);
	161
5271a011	162	sub _perseus_lookup_tt {
	163	my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
	164	_morph_connect();
772edba8	165	return unlesss $morph;
5271a011	166	my $result = $morph->lookup( $orig );
5271a011	167	# Discard results that don't match the lemma, unless lemma is unknown
fe77efe0	168	my @orig = @{$result->{'objects'}};
5271a011	169	my @ret;
fe77efe0	170	unless( $lemma eq '<unknown>' \|\| $lemma =~ /^\W+$/ ) {
5271a011	171	# TODO Perseus lemma might have a number on the end, yuck.
fe77efe0	172	# multiple lemmata separated with \|
	173	$lemma =~ s/[^\w\|]//g;
	174	$lemma = $excep{$lemma} if exists $excep{$lemma};
	175	$lemma =~ s/j/i/g;
	176	my %lems;
	177	my @forms =
	178	map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\\|/, $lemma );
	179	# Now match the lemmata from Treetagger to the lemmata and alt_ls
	180	# from Perseus.
	181	@ret = grep {
	182	my $x = $_->lemma;
	183	$x =~ s/\d+$//;
	184	my $y = $_->alt_lex \|\| '';
	185	$y =~ s/\d+$//;
	186	$lems{$x} \|\| $lems{$y};
	187	} @orig;
5271a011	188	warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
fe77efe0	189	if @orig && !@ret;
5271a011	190	}
fe77efe0	191	@ret = @orig unless @ret;
5271a011	192
75ae2b25	193	my %unique_wordforms;
5271a011	194	foreach my $obj ( @ret ) {
75ae2b25	195	my $wf = _wordform_from_row( $obj );
75ae2b25	196	$unique_wordforms{$wf->to_string} = $wf;
5271a011	197	}
5271a011	198	## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
fe77efe0	199	# Discard results that don't match the given TreeTagger POS, unless
fe77efe0	200	# that leaves zero results
5271a011	201	# my $ttstruct = treetagger_struct( $pos );
	202	# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
	203	# unless( @ttmatch ) {
	204	# warn "TreeTagger POS $pos matched no results from Perseus for $orig";
	205	# @ttmatch = @wordforms;
	206	# }
	207	# return @ttmatch;
75ae2b25	208	return values( %unique_wordforms );
5271a011	209	}
	210
	211	sub _perseus_lookup_str {
	212	my( $orig ) = @_;
	213	_morph_connect();
772edba8	214	return unless $morph;
5271a011	215	# Simple morph DB lookup, and return the results.
	216	my $result = $morph->lookup( $orig );
	217	return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
	218	}
	219
5271a011	220	}
5271a011	221
fe77efe0	222	sub _wordform_from_row {
	223	my( $rowobj ) = @_;
	224	my $mpstruct;
	225	try {
772edba8	226	# M::P::St will be loaded already if we got here
fe77efe0	227	$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
	228	} catch {
	229	warn "Could not create morphology structure from "
	230	. $rowobj->code . ": $!";
	231	}
fe77efe0	232	my $lemma = $rowobj->lemma;
	233	$lemma =~ s/^(\D+)\d*$/$1/;
	234	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	235	'language' => 'Latin',
	236	'lemma' => $lemma,
	237	'morphology' => $mpstruct,
	238	);
	239	return $wf;
	240	}
	241
5271a011	242	1;