[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Language / Base.pm

package Text::Tradition::Language::Base;

use strict;
use warnings;
use Encode qw/ encode_utf8 decode_utf8 /;
use Exporter 'import';
use vars qw/ @EXPORT_OK /;
use IPC::Run qw/ run /;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
use TryCatch;
use Unicode::Normalize;

@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags 
	unicode_regularize /;

=head1 NAME

Text::Tradition::Language::Base - Base subroutines for lemmatization of words

=head1 DESCRIPTION

Common routines for applying morphological tagging to a Text::Tradition. Used
with callbacks from the named language packages.

=head1 SUBROUTINES

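=head2 lemmatize_treetagger( $tradition, %opts )

Clears any existing lexemes from the readings of the tradition, then runs
TreeTagger over each witness text (including the layered text of any layered
witness) and attaches the resulting lexemes to the readings. The options in
%opts (e.g. C<language> and C<callback>) are passed on to the tagging routine.
The results are stored on the readings; no meaningful value is returned.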

=cut

# Lemmatize a whole tradition, one witness text at a time.
#
# Arguments:
#   $tradition - the tradition object whose readings are to be tagged
#   %opts      - options handed on to _lemmatize_treetagger_sequence
#                (that routine reads 'language', 'callback' and 'replace')
#
# The lexemes are attached to the readings as a side effect; nothing is
# returned.
sub lemmatize_treetagger {
	my( $tradition, %opts ) = @_;

	# Wipe all existing lexemes first. The helper hands back the reading
	# sequence it had to compute for every witness sigil, so reuse those.
	my %witness_paths = _clear_reading_lexemes( $tradition );

	foreach my $sig ( keys %witness_paths ) {
		# %opts is listed after 'path', so a caller-supplied key of the
		# same name takes precedence in the callee's option hash.
		_lemmatize_treetagger_sequence(
			'path' => $witness_paths{$sig},
			%opts
			);
	}
	return;
}

# Remove the lexemes from every non-meta reading of every witness text, and
# return a hash of sigil => array ref of those readings, so that the caller
# does not have to compute the reading sequences a second time. A layered
# witness contributes a second entry keyed by its sigil plus the collation's
# ac_label.
sub _clear_reading_lexemes {
	my $tradition = shift;
	my $collation = $tradition->collation;

	my %path_for;
	for my $witness ( $tradition->witnesses ) {
		my @sigla = ( $witness->sigil );
		if( $witness->is_layered ) {
			push( @sigla, $witness->sigil . $collation->ac_label );
		}
		for my $sigil ( @sigla ) {
			my @readings = grep { !$_->is_meta } $collation->reading_sequence(
				$collation->start, $collation->end, $sigil );
			$_->clear_lexemes for @readings;
			$path_for{$sigil} = \@readings;
		}
	}
	return %path_for;
}

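=head2 reading_lookup_treetagger( %opts )

Runs TreeTagger over the sequence of readings given as an array reference in
the C<path> option, replacing any lexemes those readings already have. This
uses the same logic as C<lemmatize_treetagger> above for the entire
tradition, but can also be used to (re-)analyze individual readings. The
C<language> and C<callback> options are used as in that routine.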

=cut

# Re-tag the readings named in the 'path' option, forcing replacement of
# whatever lexemes they already carry.
sub reading_lookup_treetagger {
	my %caller_opts = @_;
	# 'replace' is appended last so that it overrides any value passed in.
	return _lemmatize_treetagger_sequence( %caller_opts, 'replace' => 1 );
}

# Tag one sequence of readings with TreeTagger and attach the resulting
# lexemes to those readings.
#
# Takes a hash of options:
#   path     - array ref of reading objects, in text order
#   language - passed to _treetag_string and stored on each new Lexeme
#   callback - code ref, called with each raw line of tagger output; the
#              list it returns becomes the Lexeme's 'wordform_matchlist'
#   replace  - passed through unchanged to _update_reading_lexemes
#
# Does nothing when the tagger output is false (e.g. the empty string).
# There is no explicit return value.
sub _lemmatize_treetagger_sequence {
	my %opts = @_;
	# Work on a copy of the path; it is consumed with shift below.
	my @path = @{$opts{'path'}};
	my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
	if( $tagresult ) {
		# Map the tagged words onto the original readings, splitting
		# them up into lexemes where necessary.
		# NOTE we can have multiple lexemes in a reading, but not
		# multiple readings to a lexeme.
		my @tags = split( /\n/, $tagresult );
		# NOTE(review): @lexemes is never used in this sub.
		my @lexemes;
		# The reading currently being filled, the lexemes collected for it
		# so far, and the tail of its text not yet covered by a lexeme.
		my $curr_rdg = shift @path;
		my @curr_lexemes;
		my $unused_rdg_part;
		foreach my $tag ( @tags ) {
			# Get the original word: the first tab-separated field of the
			# tagger line. NOTE(review): @rest is never used.
			my( $lexeme, @rest ) = split( /\t/, $tag );
			# Lemmatize the whole
			# TODO error trap this
			my @forms = $opts{'callback'}( $tag );

			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
				'string' => $lexeme, 'language' => $opts{'language'},
				'wordform_matchlist' => \@forms );
			# Find the next non-meta reading
			while( $curr_rdg && $curr_rdg->is_meta ) {
				$curr_rdg = shift @path;
			}
			unless( $curr_rdg ) {
				warn "Ran out of readings in sequence at $lexeme";
				last;
			}
			my $curr_rdg_text = $curr_rdg->normal_form;
			if( $unused_rdg_part &&
				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# Nth part of curr_rdg: the lexeme starts the leftover text,
				# so keep whatever follows it (minus whitespace) as the new
				# leftover.
				$unused_rdg_part = $2;
				push( @curr_lexemes, $lexobj );
			} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# The lexeme starts the current reading's normal form.
				# Flag an error if there is already an unused reading part.
				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
					if $unused_rdg_part;
				$unused_rdg_part = $2; # will be empty if the whole reading matched
				push( @curr_lexemes, $lexobj );
			} else {
				# We do not cope with the idea of a lexeme being
				# spread across multiple readings.
				warn "Word sequence changed unexpectedly in text";
				# See if we can find a matching reading further along the
				# path, remembering the readings we pass over.
				my @lookahead;
				my $matched;
				while( my $nr = shift @path ) {
					my $nrtext = $nr->normal_form;
					if( $nrtext =~ /^\Q$lexeme\E/ ) {
						# NOTE(review): on a match, $nr has already been
						# shifted off @path and is not put back, the
						# passed-over readings are not restored either, and
						# $lexobj is not added to @curr_lexemes. The "move"
						# below therefore advances to the reading after $nr.
						# Confirm that this is the intended recovery.
						$curr_rdg = $lookahead[-1] if @lookahead;
						$matched = 1;
						last;
					} else {
						push( @lookahead, $nr );
					}
				}
				# No match? Restore the state we had
				unless( $matched ) {
					unshift( @path, @lookahead );
				}
				# Trigger a move
				$unused_rdg_part = '';
			}
			
			# Nothing left over (undef, '' or any other false value) means
			# the current reading is complete.
			unless( $unused_rdg_part ) {
				# Record the lexemes for the given reading.
				#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
				#	join( ' ', map { $_->string } @curr_lexemes ),
				#	$curr_rdg->id, $curr_rdg->text );
				_update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
				$curr_rdg = shift @path;
				@curr_lexemes = ();
			}
		}
	}
}

# Store a list of lexemes on a reading.
#
#   $replace - if true, whatever the reading has is thrown away first
#   $reading - the reading object to update
#   @lexemes - the new lexeme objects, in order
#
# When the reading already has lexemes and $replace is false, the new list
# is merged into the old one, provided both lists have the same strings in
# the same order; otherwise the old list is replaced with a warning.
sub _update_reading_lexemes {
	my( $replace, $reading, @lexemes ) = @_;

	# Nothing to merge with, or told not to merge: just store the new list.
	unless( $reading->has_lexemes && !$replace ) {
		$reading->clear_lexemes if $replace;
		return $reading->add_lexeme( @lexemes );
	}

	# Compare the old and new layouts by their lexeme strings.
	my @existing = $reading->lexemes;
	my $old_layout = join( '||', map { $_->string } @existing );
	my $new_layout = join( '||', map { $_->string } @lexemes );
	unless( @existing == @lexemes && $old_layout eq $new_layout ) {
		warn "Lexeme layout for $reading changed; replacing the lot";
		$reading->clear_lexemes;
		return $reading->add_lexeme( @lexemes );
	}

	# Same layout: add to each old lexeme the word forms it did not have at
	# the start, and mark it as no longer disambiguated when that happens.
	for my $idx ( 0 .. $#lexemes ) {
		my( $kept, $incoming ) = ( $existing[$idx], $lexemes[$idx] );
		my %known = map { ( $_->to_string, 1 ) } $kept->matching_forms;
		for my $form ( $incoming->matching_forms ) {
			next if $known{ $form->to_string };
			$kept->add_matching_form( $form );
			$kept->is_disambiguated(0);
		}
	}
}

# Build the running text of a sequence of readings.
#
#   $normalize - true to use each reading's normal_form, false for its text
#   @path      - the reading objects, in order
#
# Readings are separated by a single space, except that no space is put
# before the first reading, before a reading with join_prior set, or after
# a reading with join_next set.
sub _text_from_path {
	my( $normalize, @path ) = @_;
	my $text = '';
	my $prev;
	for my $rdg ( @path ) {
		my $needs_space = !$rdg->join_prior && $prev && !$prev->join_next;
		my $word = $normalize ? $rdg->normal_form : $rdg->text;
		$text .= ( $needs_space ? ' ' : '' ) . $word;
		$prev = $rdg;
	}
	return $text;
}

# Utility function that actually calls the tree tagger.
#
#   $text - the text to tag
#   $lang - the language name handed to Lingua::TreeTagger
#
# Returns the tagger output as text, or the empty string (after a warning)
# when the Lingua::TreeTagger module cannot be loaded.
sub _treetag_string {
	my( $text, $lang ) = @_;
	# NOTE(review): a UTF-8 encoded copy of $text used to be computed here
	# but was never used; the string is passed to the tagger as received.
	# See if we have TreeTagger. The 'return' in the catch block is relied
	# on to leave this sub, as TryCatch documents.
	try {
		load( 'Lingua::TreeTagger' );
	} catch {
		warn "Cannot run TreeTagger without Lingua::TreeTagger module";
		return '';
	}
	# OK, we can run it then.
	# Switch on the tagger's UTF8 option for the languages listed here.
	my @utf8_supported = qw/ French Latin Greek /;
	my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
	if( grep { $_ eq $lang } @utf8_supported ) {
		$ttopts{'use_utf8'} = 1;
	}
	# Now instantiate and run the tagger.
	my $tagger = Lingua::TreeTagger->new( %ttopts );
	my $tagresult = $tagger->tag_text( \$text );

	# TODO maybe send the tokens back rather than the interpreted string...
	return $tagresult->as_text();
}

=head2 lfs_morph_tags

Returns a data structure describing the available parts of speech and their
attributes, as defined by the Lingua::Features::StructureType types currently
registered.

=cut

# Build a description of the tag set known to Lingua::Features.
#
# Returns a hash ref with two keys:
#   structures - array ref with one hash per structure type ('id', 'desc',
#                and 'use_features', the list of feature ids it uses), in
#                the order given by _by_structid
#   features   - hash ref of feature id => { 'id', 'values' }, where each
#                value is a hash with its 'short' and 'long' name
sub lfs_morph_tags {
	load('Lingua::Features::StructureType');
	my @structures;
	my %features;

	my @types = sort { _by_structid( $a->id, $b->id ) }
		Lingua::Features::StructureType->types();
	foreach my $stype ( @types ) {
		my $entry = { 'id' => $stype->id, 'desc' => $stype->desc, 'use_features' => [] };
		foreach my $fid ( Lingua::Features::StructureType->type($stype->id)->features ) {
			# Look on the structure type itself first, then on its base.
			my $ftype = $stype->feature_type( $fid );
			$ftype = $stype->base->feature_type( $fid )
				if !$ftype && $stype->base;
			next unless $ftype;

			push( @{$entry->{'use_features'}}, $fid );
			# Type values change according to category, so the 'type'
			# feature is stored once per structure under a qualified key.
			my $key = $fid eq 'type' ? $fid . " (" . $stype->id . ")" : $fid;
			my @values = map {
				+{ 'short' => $_, 'long' => $ftype->value_name( $_ ) }
			} $ftype->values;
			$features{$key} = { 'id' => $key, 'values' => \@values };
		}
		push( @structures, $entry );
	}
	return { 'structures' => \@structures, 'features' => \%features };
}

# Comparison function for structure ids: 'cat' sorts before everything
# else, and all other ids sort as plain strings. Returns -1, 0 or 1 like
# cmp does.
sub _by_structid {
	# Named $left/$right so as not to shadow the $a/$b used by sort.
	my( $left, $right ) = @_;
	# Equal ids (including 'cat' against 'cat') must compare as equal, or
	# the comparator is inconsistent.
	return 0 if $left eq $right;
	return -1 if $left eq 'cat';
	return 1 if $right eq 'cat';
	return $left cmp $right;
}

=head2 unicode_regularize( $word )

Returns a lowercased and accent-stripped version of the word.

=cut

# Reduce a word to a lowercased, accent-stripped form.
#
# Each character of the lowercased word is replaced by the first character
# of its compatibility decomposition (NFKD), and characters that are not
# alphanumeric are dropped. Returns the empty string for an undefined or
# empty word.
sub unicode_regularize {
	my $word = shift;
	return '' unless defined $word;
	# Upgrade our private copy so that lc applies Unicode rules to
	# characters in the 128-255 range as well (e.g. an upper-case E-acute).
	utf8::upgrade( $word );
	my @normalized;
	foreach my $char ( split( //, lc( $word ) ) ) {
		# Keep only the leading character of the decomposition, i.e. the
		# base letter without its combining marks.
		my $base = substr( NFKD( $char ), 0, 1 );
		# \p{Alnum} always matches with Unicode rules. The POSIX class
		# [[:alnum:]] would reject letters such as ae-ligature or o-slash
		# when they arrive as non-UTF8-flagged strings.
		next unless $base =~ /\p{Alnum}/; # toss out e.g. Greek underdots
		push( @normalized, $base );
	}
	return join( '', @normalized );
}

1;

=head2 TODO

=over

=item * Handle package dependencies more gracefully

=back
Commit	Line	Data
e0f6836a	1	package Text::Tradition::Language::Base;
	2
	3	use strict;
	4	use warnings;
	5	use Encode qw/ encode_utf8 decode_utf8 /;
	6	use Exporter 'import';
	7	use vars qw/ @EXPORT_OK /;
	8	use IPC::Run qw/ run /;
e0f6836a	9	use Module::Load;
	10	use Text::Tradition::Collation::Reading::Lexeme;
	11	use Text::Tradition::Collation::Reading::WordForm;
	12	use TryCatch;
307d8db9	13	use Unicode::Normalize;
e0f6836a	14
307d8db9	15	@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags
307d8db9	16	unicode_regularize /;
e0f6836a	17
	18	=head1 NAME
	19
	20	Text::Tradition::Language::Base - Base subroutines for lemmatization of words
	21
	22	=head1 DESCRIPTION
	23
	24	Common routines for applying morphological tagging to a Text::Tradition. Used
	25	with callbacks from the named language packages.
	26
	27	=head1 SUBROUTINES
	28
	29	=head2 lemmatize_treetagger( $tradition )
	30
	31	Evaluates the tradition with the given options, and returns the results.
	32
	33	=cut
	34
	35	sub lemmatize_treetagger {
	36	my( $tradition, %opts ) = @_;
	37
	38	# Given a tradition, lemmatize it witness by witness and see what we get.
	39	my $c = $tradition->collation;
	40	# First, clear out all existing lexemes from the readings.
	41	my %witness_paths = _clear_reading_lexemes( $tradition );
	42
	43	foreach my $sig ( keys %witness_paths ) {
	44	# Get the text as a sequence of readings and as a string
e0f6836a	45	my %witopts = (
	46	'path' => $witness_paths{$sig},
	47	%opts
	48	);
	49	_lemmatize_treetagger_sequence( %witopts );
	50	}
	51	}
	52
	53	sub _clear_reading_lexemes {
	54	my $tradition = shift;
	55	my $c = $tradition->collation;
	56	# Clear out all existing lexemes from the readings. Save the path as long
	57	# as we went to the trouble of generating it.
	58	my %witness_paths;
	59	foreach my $wit ( $tradition->witnesses ) {
	60	my @sigla = ( $wit->sigil );
	61	push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
	62	foreach my $sig ( @sigla ) {
	63	my @path = grep { !$_->is_meta }
	64	$c->reading_sequence( $c->start, $c->end, $sig );
	65	map { $_->clear_lexemes } @path;
	66	$witness_paths{$sig} = \@path;
	67	}
	68	}
	69	return %witness_paths;
	70	}
	71
	72	=head2 reading_lookup( $rdg[, $rdg, ...] )
	73
	74	Looks up one or more readings using the Flemm package, and returns the
	75	possible results. This uses the same logic as L<lemmatize> above for the
	76	entire tradition, but can also be used to (re-)analyze individual readings.
	77
	78	=cut
	79
	80	sub reading_lookup_treetagger {
	81	my %opts = @_;
	82	$opts{'replace'} = 1;
	83	return _lemmatize_treetagger_sequence( %opts );
	84	}
	85
	86	sub _lemmatize_treetagger_sequence {
	87	my %opts = @_;
	88	my @path = @{$opts{'path'}};
	89	my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
	90	if( $tagresult ) {
	91	# Map the tagged words onto the original readings, splitting
	92	# them up into lexemes where necessary.
	93	# NOTE we can have multiple lexemes in a reading, but not
	94	# multiple readings to a lexeme.
	95	my @tags = split( /\n/, $tagresult );
	96	my @lexemes;
	97	my $curr_rdg = shift @path;
	98	my @curr_lexemes;
	99	my $unused_rdg_part;
	100	foreach my $tag ( @tags ) {
	101	# Get the original word
	102	my( $lexeme, @rest ) = split( /\t/, $tag );
	103	# Lemmatize the whole
	104	# TODO error trap this
	105	my @forms = $opts{'callback'}( $tag );
	106
	107	my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
	108	'string' => $lexeme, 'language' => $opts{'language'},
109	'wordform_matchlist' => \@forms );
110	# Find the next non-meta reading
111	while( $curr_rdg && $curr_rdg->is_meta ) {
112	$curr_rdg = shift @path;
113	}
114	unless( $curr_rdg ) {
115	warn "Ran out of readings in sequence at $lexeme";
116	last;
117	}
367e901b	118	my $curr_rdg_text = $curr_rdg->normal_form;
e0f6836a	119	if( $unused_rdg_part &&
	120	$unused_rdg_part =~ /^\Q$lexeme\E(\s)(.)$/ ) {
	121	# Nth part of curr_rdg
	122	$unused_rdg_part = $2;
	123	push( @curr_lexemes, $lexobj );
	124	} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s)(.)$/ ) {
	125	# Flag an error if there is already an unused reading part.
	126	warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
	127	if $unused_rdg_part;
	128	$unused_rdg_part = $2; # will be empty if the whole reading matched
	129	push( @curr_lexemes, $lexobj );
	130	} else {
	131	# We do not cope with the idea of a lexeme being
	132	# spread across multiple readings.
	133	warn "Word sequence changed unexpectedly in text";
	134	# See if we can find a matching reading
	135	my @lookahead;
	136	my $matched;
	137	while( my $nr = shift @path ) {
367e901b	138	my $nrtext = $nr->normal_form;
e0f6836a	139	if( $nrtext =~ /^\Q$lexeme\E/ ) {
	140	$curr_rdg = $lookahead[-1] if @lookahead;
	141	$matched = 1;
	142	last;
	143	} else {
	144	push( @lookahead, $nr );
	145	}
	146	}
	147	# No match? Restore the state we had
	148	unless( $matched ) {
	149	unshift( @path, @lookahead );
	150	}
	151	# Trigger a move
	152	$unused_rdg_part = '';
	153	}
	154
	155	unless( $unused_rdg_part ) {
	156	# Record the lexemes for the given reading.
	157	#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
	158	# join( ' ', map { $_->string } @curr_lexemes ),
	159	# $curr_rdg->id, $curr_rdg->text );
	160	_update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
	161	$curr_rdg = shift @path;
	162	@curr_lexemes = ();
	163	}
	164	}
	165	}
	166	}
	167
	168	sub _update_reading_lexemes {
	169	my( $replace, $reading, @lexemes ) = @_;
	170	if( $reading->has_lexemes && !$replace ) {
	171	# We need to merge what is in @lexemes with what we have already.
	172	my @oldlex = $reading->lexemes;
	173	my $cmp1 = join( '\|\|', map { $_->string } @oldlex );
	174	my $cmp2 = join( '\|\|', map { $_->string } @lexemes );
	175	if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
	176	# The lexeme strings are the same, so merge the possible
	177	# word forms from new to old.
	178	foreach my $i ( 0 .. $#lexemes ) {
	179	my $ol = $oldlex[$i];
	180	my $nl = $lexemes[$i];
	181	my %ofw;
	182	map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
	183	foreach my $form ( $nl->matching_forms ) {
	184	unless( $ofw{$form->to_string} ) {
a3ef385d	185	# print STDERR "Adding form " . $form->to_string .
a3ef385d	186	# " to lexeme " . $nl->string . " at $reading\n";
e0f6836a	187	$ol->add_matching_form( $form );
	188	$ol->is_disambiguated(0);
	189	}
	190	}
	191	}
	192	} else {
	193	warn "Lexeme layout for $reading changed; replacing the lot";
	194	$reading->clear_lexemes;
	195	$reading->add_lexeme( @lexemes );
	196	}
	197	} else {
	198	$reading->clear_lexemes if $replace;
	199	$reading->add_lexeme( @lexemes );
	200	}
	201	}
	202
	203	# Utility function so that we can cheat and use it when we need both the path
	204	# and its text.
	205	sub _text_from_path {
	206	my( $normalize, @path ) = @_;
	207	my $pathtext = '';
	208	my $last;
	209	foreach my $r ( @path ) {
	210	unless ( $r->join_prior \|\| !$last \|\| $last->join_next ) {
	211	$pathtext .= ' ';
	212	}
367e901b	213	$pathtext .= $normalize ? $r->normal_form : $r->text;
e0f6836a	214	$last = $r;
	215	}
	216	return $pathtext;
	217	}
	218
	219	# Utility function that actually calls the tree tagger.
	220	sub _treetag_string {
	221	my( $text, $lang ) = @_;
	222	my $wittext = encode_utf8( $text );
	223	# Then see if we have TreeTagger
	224	try {
	225	load( 'Lingua::TreeTagger' );
	226	} catch {
	227	warn "Cannot run TreeTagger without Lingua::TreeTagger module";
	228	return '';
	229	}
	230	# OK, we can run it then.
	231	# First upgrade to UTF8 for necessary languages.
ad2a2c47	232	my @utf8_supported = qw/ French Latin Greek /;
a3ef385d	233	my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
e0f6836a	234	if( grep { $_ eq $lang } @utf8_supported ) {
a3ef385d	235	$ttopts{'use_utf8'} = 1;
e0f6836a	236	}
e0f6836a	237	# Now instantiate and run the tagger.
a3ef385d	238	my $tagger = Lingua::TreeTagger->new( %ttopts );
e0f6836a	239	my $tagresult = $tagger->tag_text( \$text );
	240
	241	# TODO maybe send the tokens back rather than the interpreted string...
e0f6836a	242	return $tagresult->as_text();
	243	}
	244
75ae2b25	245	=head2 lfs_morph_tags
	246
	247	Return a data structure describing the available parts of speech and their attributes
	248	from the Lingua::Features::Structure class currently defined.
	249
	250	=cut
	251
	252	sub lfs_morph_tags {
	253	load('Lingua::Features::StructureType');
	254	my $tagset = { 'structures' => [], 'features' => {} };
	255	foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
f8862b58	256	my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] };
75ae2b25	257	foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
75ae2b25	258	my $ftype = $lfs->feature_type( $ftid );
f8862b58	259	if( !$ftype && $lfs->base ) {
	260	$ftype = $lfs->base->feature_type( $ftid );
	261	}
	262	if( $ftype ) {
	263	push( @{$tsstruct->{'use_features'}}, $ftid );
	264	if( $ftid eq 'type' ) {
	265	# Type values change according to category
	266	$ftid .= " (" . $lfs->id . ")";
	267	}
	268	my $tfstruct = { 'id' => $ftid, 'values' => [] };
	269	foreach my $fval( $ftype->values ) {
	270	push( @{$tfstruct->{'values'}},
	271	{ 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
	272	}
	273	$tagset->{'features'}->{$ftid} = $tfstruct;
75ae2b25	274	}
75ae2b25	275	}
	276	push( @{$tagset->{'structures'}}, $tsstruct );
	277	}
	278	return $tagset;
	279	}
	280
	281	sub _by_structid {
	282	my( $a, $b ) = @_;
	283	return -1 if $a eq 'cat';
	284	return 1 if $b eq 'cat';
	285	return $a cmp $b;
	286	}
	287
307d8db9	288	=head2 unicode_regularize( $word )
	289
	290	Returns a lowercased and accent-stripped version of the word.
	291
	292	=cut
	293
	294	sub unicode_regularize {
	295	my $word = shift;
	296	my @normalized;
	297	my @letters = split( '', lc( $word ) );
	298	foreach my $l ( @letters ) {
	299	my $d = chr( ord( NFKD( $l ) ) );
	300	next unless $d =~ /[[:alnum:]]/; # toss out e.g. Greek underdots
	301	push( @normalized, $d );
	302	}
	303	return join( '', @normalized );
	304	}
	305
e0f6836a	306	1;
	307
	308	=head2 TODO
	309
	310	=over
	311
	312	=item * Handle package dependencies more gracefully
	313
	314	=back
	315