package Text::Tradition::Collation::Reading;
use Moose;
+use Module::Load;
use overload '""' => \&_stringify, 'fallback' => 1;
=head1 NAME
has 'language' => (
is => 'ro',
isa => 'Str',
- default => 'Default',
+ predicate => 'has_language',
);
has 'is_start' => (
A few methods to try to tack on morphological information.
-=head2 use_lexemes
+=head2 lexemes
-TBD
+=head2 has_lexemes
+
+=head2 clear_lexemes
+
+=head2 add_lexeme
+
+=head2 lemmatize
=cut
-# sub use_lexemes {
-# my( $self, @lexemes ) = @_;
-# # The lexemes need to be the same as $self->text.
-# my $cmpstr = $self->has_normal_form ? lc( $self->normal_form ) : lc( $self->text );
-# $cmpstr =~ s/[\s-]+//g;
-# my $lexstr = lc( join( '', @lexemes ) );
-# $lexstr =~ s/[\s-]+//g;
-# unless( $lexstr eq $cmpstr ) {
-# warn "Cannot split " . $self->text . " into " . join( '.', @lexemes );
-# return;
-# }
-# $self->_clear_morph;
-# map { $self->_add_morph( { $_ => [] } ) } @lexemes;
-# }
-#
-# sub add_morphological_tag {
-# my( $self, $lexeme, $opts ) = @_;
-# my $struct;
-# unless( $opts ) {
-# # No lexeme was passed; use reading text.
-# $opts = $lexeme;
-# $lexeme = $self->text;
-# $self->use_lexemes( $lexeme );
-# }
-# # Get the correct container
-# ( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes;
-# unless( $struct ) {
-# warn "No lexeme $lexeme exists in this reading";
-# return;
-# }
-# # Now make the morph object and add it to this lexeme.
-# my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts );
-# # TODO Check for existence
-# push( @{$struct->{$lexeme}}, $morph_obj );
-# }
+sub lemmatize {
+ my $self = shift;
+ unless( $self->has_language ) {
+ warn "Please set a language to lemmatize a tradition";
+ return;
+ }
+ my $mod = "Text::Tradition::Language::" . $self->language;
+ load( $mod );
+ $mod->can( 'reading_lookup' )->( $self );
+
+}
## Utility methods
no Moose;
__PACKAGE__->meta->make_immutable;
-###################################################
-### Morphology objects, to be attached to readings
-###################################################
-
-package Text::Tradition::Collation::Reading::Morphology;
-
-use Moose;
-
-has 'lemma' => (
- is => 'ro',
- isa => 'Str',
- required => 1,
- );
-
-has 'code' => (
- is => 'ro',
- isa => 'Str',
- required => 1,
- );
-
-has 'language' => (
- is => 'ro',
- isa => 'Str',
- required => 1,
- );
-
-## Transmute codes into comparison arrays for our various languages.
-
-around BUILDARGS => sub {
- my $orig = shift;
- my $class = shift;
- my $args;
- if( @_ == 1 && ref( $_[0] ) ) {
- $args = shift;
- } else {
- $args = { @_ };
- }
- if( exists( $args->{'serial'} ) ) {
- my( $lemma, $code ) = split( /!!/, delete $args->{'serial'} );
- $args->{'lemma'} = $lemma;
- $args->{'code'} = $code;
- }
- $class->$orig( $args );
-};
-
-sub serialization {
- my $self = shift;
- return join( '!!', $self->lemma, $self->code );
-};
-
-sub comparison_array {
- my $self = shift;
- if( $self->language eq 'French' ) {
- my @array;
- my @bits = split( /\+/, $self->code );
- # First push the non k/v parts.
- while( @bits && $bits[0] !~ /=/ ) {
- push( @array, shift @bits );
- }
- while( @array < 2 ) {
- push( @array, undef );
- }
- # Now push the k/v parts in a known order.
- my @fields = qw/ Pers Nb Temps Genre Spec Fonc /;
- my %props;
- map { my( $k, $v ) = split( /=/, $_ ); $props{$k} = $v; } @bits;
- foreach my $k ( @fields ) {
- push( @array, $props{$k} );
- }
- # Give the answer.
- return @array;
- } elsif( $self->language eq 'English' ) {
- # Do something as yet undetermined
- } else {
- # Latin or Greek or Armenian, just split the chars
- return split( '', $self->code );
- }
-};
-
-no Moose;
-__PACKAGE__->meta->make_immutable;
-
1;
-
package Text::Tradition::Language::French;
use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run binary /;
+use IPC::Run qw/ run /;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
=head1 NAME
-Text::Tradition::Language::French - language-specific modules for French
+Text::Tradition::Language::French - language-specific module for French
=head1 DESCRIPTION
-Implements morphology lookup for French words in context.
+Implements morphology lookup for French words in context. This module
+depends on the Flemm module for French lemmatization
+(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
+the TreeTagger software
+(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
+(for now) expected to be installed in $MORPHDIR/TreeTagger.
=head1 SUBROUTINES
Evaluates the string using the Flemm package, and returns the results.
+=begin testing
+
+binmode STDOUT, ':utf8';
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::French' );
+
+eval "use Flemm";
+my $err = $@;
+
+SKIP: {
+ skip "Package Flemm not found" if $err;
+ my $tf = Text::Tradition->new(
+ 'input' => 'Self',
+ 'file' => 't/data/besoin.xml',
+ 'language' => 'French' );
+
+ is( $tf->language, 'French', "Set language okay" );
+ $tf->lemmatize();
+ # Test the lemmatization. How many readings now have morphological info?
+ # Do the lexemes match the reading?
+ my $ambig = 0;
+ foreach my $r ( $tf->collation->readings ) {
+ next if $r->is_meta;
+ ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+ my @lex = $r->lexemes;
+ my $lexstr = join( '', map { $_->string } @lex );
+ my $textstr = $r->text;
+ $textstr =~ s/\s+//g;
+ is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+ foreach my $l ( @lex ) {
+ next if $l->is_disambiguated;
+ # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+ # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+ $ambig++;
+ }
+ }
+ is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+
+ # Try setting the normal form of a reading and re-analyzing
+ my $mr = $tf->collation->reading('99,2');
+ is( $mr->text, 'minspire', "Picked correct test reading" );
+ is( $mr->language, 'French', "Reading has correct language setting" );
+ $mr->normal_form( "m'inspire" );
+ $mr->lemmatize;
+ is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+}
+
+=end testing
+
=cut
sub lemmatize {
foreach my $sig ( keys %witness_paths ) {
# Get the text as a sequence of readings and as a string
print STDERR "Morphologizing witness $sig\n";
- my @path = @{$witness_paths{$sig}};
- my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
- if( $tagresult ) {
- # Map the tagged words onto the original readings, splitting
- # them up into lexemes where necessary.
- # NOTE we can have multiple lexemes in a reading, but not
- # multiple readings to a lexeme.
- my @tags = split( /\n/, $tagresult );
- my @lexemes;
- my $curr_rdg = shift @path;
- my @curr_lexemes;
- my $unused_rdg_part;
- foreach my $tag ( @tags ) {
- # Get the original word
- my( $lexeme, @rest ) = split( /\t/, $tag );
- # Lemmatize the whole
- my @forms = _parse_wordform( _flemm_lookup( $tag ) );
- my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
- 'string' => $lexeme, 'language' => 'French',
- 'wordform_matchlist' => \@forms );
- # Find the next non-meta reading
- while( $curr_rdg->is_meta ) {
- $curr_rdg = shift @path;
- }
- unless( $curr_rdg ) {
- warn "Ran out of readings in sequence for " . $wit->sigil
- . " at $lexeme";
- last;
- }
- if( $unused_rdg_part &&
- $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
- # Nth part of curr_rdg
- $unused_rdg_part = $2;
- push( @curr_lexemes, $lexobj );
- } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
- # Flag an error if there is already an unused reading part.
- warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
- if $unused_rdg_part;
- $unused_rdg_part = $2; # will be empty if the whole reading matched
- push( @curr_lexemes, $lexobj );
- } else {
- # We do not cope with the idea of a lexeme being
- # spread across multiple readings.
- warn "Word sequence changed unexpectedly in text";
- # See if we can find a matching reading
- my @lookahead;
- my $matched;
- while( my $nr = shift @path ) {
- if( $nr->text =~ /^\Q$lexeme\E/ ) {
- $curr_rdg = $lookahead[-1] if @lookahead;
- $matched = 1;
- last;
- } else {
- push( @lookahead, $nr );
- }
- }
- # No match? Restore the state we had
- unless( $matched ) {
- unshift( @path, @lookahead );
- }
- # Trigger a move
- $unused_rdg_part = '';
- }
-
- unless( $unused_rdg_part ) {
- # Record the lexemes for the given reading.
- #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
- # join( ' ', map { $_->string } @curr_lexemes ),
- # $curr_rdg->id, $curr_rdg->text );
- _update_reading_lexemes( $curr_rdg, @curr_lexemes );
- $curr_rdg = shift @path;
- @curr_lexemes = ();
- }
- }
- }
+ _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
}
}
sub _update_reading_lexemes {
- my( $reading, @lexemes ) = @_;
- if( $reading->has_lexemes ) {
+ my( $replace, $reading, @lexemes ) = @_;
+ if( $reading->has_lexemes && !$replace ) {
# We need to merge what is in @lexemes with what we have already.
my @oldlex = $reading->lexemes;
my $cmp1 = join( '||', map { $_->string } @oldlex );
my $cmp2 = join( '||', map { $_->string } @lexemes );
- if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
+ if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
# The lexeme strings are the same, so merge the possible
# word forms from new to old.
foreach my $i ( 0 .. $#lexemes ) {
my $ol = $oldlex[$i];
my $nl = $lexemes[$i];
my %ofw;
- map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
+ map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
foreach my $form ( $nl->matching_forms ) {
- unless( $ofw{$form->_stringify} ) {
- print STDERR "Adding form " . $form->_stringify .
+ unless( $ofw{$form->to_string} ) {
+ print STDERR "Adding form " . $form->to_string .
" to lexeme " . $nl->string . " at $reading\n";
$ol->add_matching_form( $form );
$ol->is_disambiguated(0);
}
}
} else {
- $DB::single = 1;
warn "Lexeme layout for $reading changed; replacing the lot";
$reading->clear_lexemes;
$reading->add_lexeme( @lexemes );
}
} else {
+ $reading->clear_lexemes if $replace;
$reading->add_lexeme( @lexemes );
}
}
-=head2 word_lookup( $word )
+=head2 reading_lookup( $rdg[, $rdg, ...] )
-Looks up a word using the Flemm package, and returns the possible results.
-It is better to use L<lemmatize> for context sensitivity.
+Looks up one or more readings using the Flemm package, and returns the
+possible results. This uses the same logic as L<lemmatize> above for the
+entire tradition, but can also be used to (re-)analyze individual readings.
=cut
-sub word_lookup {
- my $word = shift;
- my $tagresult = _treetag_string( $word );
- my $lemmatizer;
- try {
- load 'Flemm';
- $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
- } catch {
- warn "Cannot do French word lemmatization without Flemm: @_";
- return;
+sub reading_lookup {
+ return _lemmatize_sequence( 1, @_ );
+}
+
+sub _lemmatize_sequence {
+ my( $replace, @path ) = @_;
+ $DB::single = 1 if $replace;
+ my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
+ if( $tagresult ) {
+ # Map the tagged words onto the original readings, splitting
+ # them up into lexemes where necessary.
+ # NOTE we can have multiple lexemes in a reading, but not
+ # multiple readings to a lexeme.
+ my @tags = split( /\n/, $tagresult );
+ my @lexemes;
+ my $curr_rdg = shift @path;
+ my @curr_lexemes;
+ my $unused_rdg_part;
+ foreach my $tag ( @tags ) {
+ # Get the original word
+ my( $lexeme, @rest ) = split( /\t/, $tag );
+ # Lemmatize the whole
+ my @forms = _parse_wordform( _flemm_lookup( $tag ) );
+ my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
+ 'string' => $lexeme, 'language' => 'French',
+ 'wordform_matchlist' => \@forms );
+ # Find the next non-meta reading
+ while( $curr_rdg && $curr_rdg->is_meta ) {
+ $curr_rdg = shift @path;
+ }
+ unless( $curr_rdg ) {
+ warn "Ran out of readings in sequence for " . $wit->sigil
+ . " at $lexeme";
+ last;
+ }
+ my $curr_rdg_text = $curr_rdg->has_normal_form
+ ? $curr_rdg->normal_form : $curr_rdg->text;
+ if( $unused_rdg_part &&
+ $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+ # Nth part of curr_rdg
+ $unused_rdg_part = $2;
+ push( @curr_lexemes, $lexobj );
+ } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+ # Flag an error if there is already an unused reading part.
+ warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
+ if $unused_rdg_part;
+ $unused_rdg_part = $2; # will be empty if the whole reading matched
+ push( @curr_lexemes, $lexobj );
+ } else {
+ # We do not cope with the idea of a lexeme being
+ # spread across multiple readings.
+ warn "Word sequence changed unexpectedly in text";
+ # See if we can find a matching reading
+ my @lookahead;
+ my $matched;
+ while( my $nr = shift @path ) {
+ my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
+ if( $nrtext =~ /^\Q$lexeme\E/ ) {
+ $curr_rdg = $lookahead[-1] if @lookahead;
+ $matched = 1;
+ last;
+ } else {
+ push( @lookahead, $nr );
+ }
+ }
+ # No match? Restore the state we had
+ unless( $matched ) {
+ unshift( @path, @lookahead );
+ }
+ # Trigger a move
+ $unused_rdg_part = '';
+ }
+
+ unless( $unused_rdg_part ) {
+ # Record the lexemes for the given reading.
+ #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
+ # join( ' ', map { $_->string } @curr_lexemes ),
+ # $curr_rdg->id, $curr_rdg->text );
+ _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
+ $curr_rdg = shift @path;
+ @curr_lexemes = ();
+ }
+ }
+ }
+}
+
+# Utility function so that we can cheat and use it when we need both the path
+# and its text.
+sub _text_from_path {
+ my( $normalize, @path ) = @_;
+ my $pathtext = '';
+ my $last;
+ foreach my $r ( @path ) {
+ unless ( $r->join_prior || !$last || $last->join_next ) {
+ $pathtext .= ' ';
+ }
+ $pathtext .= ( $normalize && $r->has_normal_form )
+ ? $r->normal_form : $r->text;
+ $last = $r;
}
- return _parse_wordform( _flemm_lookup( $tagresult ) );
+ return $pathtext;
}
# Utility function that actually calls the tree tagger.
foreach ( @results ) {
my( $orig, $tag, $lemma ) = split( /\t/, $_ );
my( $pos, $morph ) = split( /:/, $tag );
- my $wf = Text::Tradition::Collation::Reading::WordForm->new(
- 'language' => 'French',
- 'lemma' => $lemma,
- 'morphology' => [ split( //, $morph ) ],
- );
- push( @forms, $wf );
+ my $morphobj;
+ if( $morph ) {
+ $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
+ } else {
+ # Use the TreeTagger info if there is no Flemm morphology.
+ $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
+ }
+ if( $morphobj ) {
+ my $wf = Text::Tradition::Collation::Reading::WordForm->new(
+ 'language' => 'French',
+ 'lemma' => $lemma,
+ 'morphology' => $morphobj,
+ );
+ push( @forms, $wf );
+ } else {
+ warn "No morphology found for word: $_";
+ }
}
return @forms;
}
1;
+=head2 TODO
+
+=over
+
+=item * Handle package dependencies more gracefully
+
+=back
+
=head1 LICENSE
This package is free software and is provided "as is" without express