From: Tara L Andrews Date: Mon, 23 Apr 2012 14:06:57 +0000 (+0200) Subject: start implementing morphology on readings X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=cca4f996c756a6989b0c38aa13f974b31f3da54a;p=scpubgit%2Fstemmatology.git start implementing morphology on readings --- diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index a57e68f..47bd0f7 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -137,20 +137,16 @@ has 'normal_form' => ( predicate => 'has_normal_form', ); -has 'lemma' => ( - is => 'rw', - isa => 'Str', - predicate => 'has_lemma', - ); - -has 'morphology' => ( +# Holds the word form. If is_disambiguated is true, the form at index zero +# is the correct one. +has 'lexemes' => ( traits => ['Array'], - isa => 'ArrayRef[HashRef[ArrayRef[Text::Tradition::Collation::Reading::Morphology]]]', + isa => 'ArrayRef[Text::Tradition::Collation::Lexeme]', handles => { lexemes => 'elements', - has_morphology => 'count', - _clear_morph => 'clear', - _add_morph => 'push', + has_lexemes => 'count', + _clear_lexemes => 'clear', + _add_lexeme => 'push', }, ); @@ -280,88 +276,47 @@ sub _stringify { A few methods to try to tack on morphological information. -=head2 is_disambiguated - -Returns true if there is only one tag per lexeme in this reading. - =head2 use_lexemes TBD -=head2 add_morphological_tag - -TBD - -=head2 disambiguate - -TBD - =cut -sub use_lexemes { - my( $self, @lexemes ) = @_; - # The lexemes need to be the same as $self->text. - my $cmpstr = $self->has_normal_form ? lc( $self->normal_form ) : lc( $self->text ); - $cmpstr =~ s/[\s-]+//g; - my $lexstr = lc( join( '', @lexemes ) ); - $lexstr =~ s/[\s-]+//g; - unless( $lexstr eq $cmpstr ) { - warn "Cannot split " . $self->text . " into " . join( '.', @lexemes ); - return; - } - $self->_clear_morph; - map { $self->_add_morph( { $_ => [] } ) } @lexemes; -} - -sub add_morphological_tag { - my( $self, $lexeme, $opts ) = @_; - my $struct; - unless( $opts ) { - # No lexeme was passed; use reading text. - $opts = $lexeme; - $lexeme = $self->text; - $self->use_lexemes( $lexeme ); - } - # Get the correct container - ( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes; - unless( $struct ) { - warn "No lexeme $lexeme exists in this reading"; - return; - } - # Now make the morph object and add it to this lexeme. - my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts ); - # TODO Check for existence - push( @{$struct->{$lexeme}}, $morph_obj ); -} - -sub disambiguate { - my( $self, $lexeme, $index ) = @_; - my $struct; - unless( $index ) { - # No lexeme was passed; use reading text. - $index = $lexeme; - $lexeme = $self->text; - } - # Get the correct container - ( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes; - unless( $struct ) { - warn "No lexeme $lexeme exists in this reading"; - return; - } - # Keep the object at the selected index - my $selected = $struct->{$lexeme}->[$index]; - $struct->{$lexeme} = [ $selected ]; -} - -sub is_disambiguated { - my $self = shift; - return undef unless $self->has_morphology; - foreach my $lexeme ( $self->lexemes ) { - my( $key ) = keys %$lexeme; # will be only one - return undef unless @{$lexeme->{$key}} == 1; - } - return 1; -} +# sub use_lexemes { +# my( $self, @lexemes ) = @_; +# # The lexemes need to be the same as $self->text. +# my $cmpstr = $self->has_normal_form ? lc( $self->normal_form ) : lc( $self->text ); +# $cmpstr =~ s/[\s-]+//g; +# my $lexstr = lc( join( '', @lexemes ) ); +# $lexstr =~ s/[\s-]+//g; +# unless( $lexstr eq $cmpstr ) { +# warn "Cannot split " . $self->text . " into " . join( '.', @lexemes ); +# return; +# } +# $self->_clear_morph; +# map { $self->_add_morph( { $_ => [] } ) } @lexemes; +# } +# +# sub add_morphological_tag { +# my( $self, $lexeme, $opts ) = @_; +# my $struct; +# unless( $opts ) { +# # No lexeme was passed; use reading text. +# $opts = $lexeme; +# $lexeme = $self->text; +# $self->use_lexemes( $lexeme ); +# } +# # Get the correct container +# ( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes; +# unless( $struct ) { +# warn "No lexeme $lexeme exists in this reading"; +# return; +# } +# # Now make the morph object and add it to this lexeme. +# my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts ); +# # TODO Check for existence +# push( @{$struct->{$lexeme}}, $morph_obj ); +# } ## Utility methods diff --git a/lib/Text/Tradition/Collation/Reading/Lexeme.pm b/lib/Text/Tradition/Collation/Reading/Lexeme.pm new file mode 100644 index 0000000..4ea9bf3 --- /dev/null +++ b/lib/Text/Tradition/Collation/Reading/Lexeme.pm @@ -0,0 +1,148 @@ +package Text::Tradition::Collation::Reading::Lexeme; + +use Moose; +use Module::Load; + +=head1 NAME + +Text::Tradition::Collation::Reading::Lexeme - represents the components of +a Reading. + +=head1 DESCRIPTION + +Text::Tradition is a library for representation and analysis of collated +texts, particularly medieval ones. A word form is used for the analysis of +Reading objects; it consists of a lemma, a language, and a code to +represent its part of speech. In general the word forms for a particular +language should be read from / written to some morphological database. + +=head1 METHODS + +=head2 new + +Creates a new lexeme from the passed options. + +=head2 language + +Returns the language to which this lexeme belongs. + +=head2 normalized + +Returns the canonical string version of this lexeme. + +=head2 matches + +Returns the number of possible word forms for this lexeme, as drawn from +the appropriate database. + +=head2 matching_forms + +Returns an array of the possible word forms for this lexeme. + +=head2 matching_form( $index ) + +Returns the form at $index in the list of matching forms. + +=head2 is_disambiguated + +Returns true if a single wordform has been picked as 'correct' for this +lexeme in its context. + +=head2 form + +Returns the correct word form (if any has been selected) for the lexeme in +its context. + +=cut + +# TODO need to be able to populate this from DB +has 'language' => ( + is => 'ro', + isa => 'Str', + required => 1, + ); + +has 'string' => ( + is => 'rw', + isa => 'Str', + required => 1, + ); + +has 'wordform_matchlist' => ( + isa => 'ArrayRef[Text::Tradition::Collation::Reading::WordForm]', + traits => ['Array'], + handles => { + 'matches' => 'count', + 'matching_forms' => 'elements', + 'matching_form' => 'get', + 'add_matching_form' => 'push', + ); + +has 'is_disambiguated' => ( + is => 'ro', + isa => 'Bool', + default => undef, + writer => '_set_disambiguated', + ); + +has 'form' => ( + is => 'ro', + isa => 'Text::Tradition::Collation::Reading::WordForm', + writer => '_set_form', + ); + + +=head2 disambiguate( $index ) + +Selects the word form at $index in the list of matching forms, and asserts +that this is the correct form for the lexeme. + +=cut + +sub disambiguate { + my( $self, $idx ) = @_; + my $form = $self->matching_form( $idx ); + throw( "There is no candidate wordform at index $idx" ) + unless $form; + $self->_set_form( $form ); + $self->_set_disambiguated( 1 ); +} + +=head2 lookup + +Uses the module for the declared language to look up the lexeme in the +language database (if any.) Sets the returned morphological matches in +matching_forms, and returns the list as an array of WordForm objects. + +=cut + +sub lookup { + my $self = shift; + my $lang = $self->language; + my @answers; + try { + my $langmod = "Text::Tradition::Language::$lang"; + load( $langmod ); + @answers = $langmod->can( 'word_lookup' )->( $self->string ); + } catch { + throw( "No language module for $lang, or the module has no word_lookup functionality" ); + } + $self->clear_matching_forms; + $self->add_matching_form( @answers ); + return @answers; +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/lib/Text/Tradition/Collation/Reading/WordForm.pm b/lib/Text/Tradition/Collation/Reading/WordForm.pm new file mode 100644 index 0000000..a56cd2b --- /dev/null +++ b/lib/Text/Tradition/Collation/Reading/WordForm.pm @@ -0,0 +1,94 @@ +package Text::Tradition::Collation::Reading::WordForm; + +use Moose; + +=head1 NAME + +Text::Tradition::Collation::Reading::WordForm - represents a +language/lemma/morphology triplet that can be associated with a Reading. + +=head1 DESCRIPTION + +Text::Tradition is a library for representation and analysis of collated +texts, particularly medieval ones. A word form is used for the analysis of +Reading objects; it consists of a lemma, a language, and a code to +represent its part of speech. In general the word forms for a particular +language should be read from / written to some morphological database. + +=head1 METHODS + +=head2 new + +Creates a new word form from the passed options. + +=head2 language + +Returns the language to which this word form belongs. + +=head2 lemma + +Returns the lemma for the word form. + +=head2 morphology + +Returns an array representing this word's morphology. The contents of the +array depend on the language being used. + +=cut + +has 'language' => ( + is => 'ro', + isa => 'Str', + required => 1, + ); + +# TODO do we need this? +has 'form' => ( + is => 'ro', + isa => 'Str', + # required => 1, + ); + +has 'lemma' => ( + is => 'ro', + isa => 'Str', + required => 1, + ); + +has 'morphology' => ( + is => 'ro', + isa => 'ArrayRef', + required => 1, + ); + +around BUILDARGS => sub { + my $orig = shift; + my $class = shift; + my %args = @_ == 1 ? %{$_[0]} : @_; + unless( ref( $args{'morphology'} ) ) { + my @morph = split( '', $args{'morphology'} ); + $args{'morphology'} = \@morph; + } + $class->$orig( %args ); +}; + +sub _stringify { + my $self = shift; + return sprintf( "%s//%s//%s", $self->language, $self->lemma, + join( '', $self->morphology ) ); +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/lib/Text/Tradition/Error.pm b/lib/Text/Tradition/Error.pm index 42a9997..0910175 100644 --- a/lib/Text/Tradition/Error.pm +++ b/lib/Text/Tradition/Error.pm @@ -19,7 +19,7 @@ __PACKAGE__->meta->make_immutable( inline_constructor => 0 ); =head1 NAME -Text::Tradition::Error - throwable error class for CollateX package +Text::Tradition::Error - throwable error class for Tradition package =head1 DESCRIPTION diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm new file mode 100644 index 0000000..d103bc0 --- /dev/null +++ b/lib/Text/Tradition/Language/French.pm @@ -0,0 +1,48 @@ +package Text::Tradition::Language::French; + +#use Flemm; +use Text::Tradition::Collation::Reading::WordForm; + +=head1 NAME + +Text::Tradition::Language::French - language-specific modules for French + +=head1 DESCRIPTION + +Implements morphology lookup for French words in context. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using the Flemm package, and returns the results. + +=cut + +sub lemmatize { + my $text = shift; + + +} + +=head2 word_lookup( $word ) + +Looks up a word using the Flemm package, and returns the possible results. +It is better to use L for context sensitivity. + +=cut + +sub word_lookup { + my $word = shift; + +} + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE