From: Tara L Andrews Date: Thu, 27 Sep 2012 11:51:23 +0000 (+0200) Subject: distinguish between collation fuzzy normalization and orthographic regularization X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=58f9c2b92303a12ae9811463875daa4220e05722;p=scpubgit%2Fstemmatology.git distinguish between collation fuzzy normalization and orthographic regularization --- diff --git a/morphology/lib/Text/Tradition/Language/Armenian.pm b/morphology/lib/Text/Tradition/Language/Armenian.pm index da962ef..a28204f 100644 --- a/morphology/lib/Text/Tradition/Language/Armenian.pm +++ b/morphology/lib/Text/Tradition/Language/Armenian.pm @@ -76,16 +76,12 @@ sub reading_lookup { =head2 regularize( $text ) -Returns a regularized form of the reading for the purposes of collation. +Returns an orthographically regular form of the reading. =cut sub regularize { my( $word ) = @_; - # We don't really distinguish between commas and semicolons properly - # in the manuscript. Make them the same. - $word =~ s/\./\,/g; - # Get rid of accent marks. $word =~ s/՛//g; # Get rid of hyphen. @@ -96,7 +92,6 @@ sub regularize { $word =~ s/աւ/օ/g; # for easy vocalic comparison to ո $word =~ s/և/եւ/g; - # TODO split off suspected prefix/suffix markers? # Downcase the word. $word = lc( $word ); return $word; diff --git a/morphology/lib/Text/Tradition/Language/Base.pm b/morphology/lib/Text/Tradition/Language/Base.pm index fa57723..5f46b9b 100644 --- a/morphology/lib/Text/Tradition/Language/Base.pm +++ b/morphology/lib/Text/Tradition/Language/Base.pm @@ -303,6 +303,17 @@ sub unicode_regularize { return join( '', @normalized ); } +=head2 regularize( $word ) + +Fallback function for reading regularization in the event that it is not +implemented by a module. Currently just lowercases the string. + +=cut + +sub regularize { + return lc( $_[0] ); +} + 1; =head2 TODO @@ -311,5 +322,7 @@ sub unicode_regularize { =item * Handle package dependencies more gracefully +=item * Make this some kind of real base class for the language modules + =back diff --git a/morphology/lib/Text/Tradition/Language/English.pm b/morphology/lib/Text/Tradition/Language/English.pm index bb62fb5..e772b83 100644 --- a/morphology/lib/Text/Tradition/Language/English.pm +++ b/morphology/lib/Text/Tradition/Language/English.pm @@ -94,12 +94,12 @@ sub _parse_wordform { =head2 regularize( $text ) -Returns a regularized form of the reading for the purposes of collation. +Returns an orthographically regular form of the reading. =cut sub regularize { - return unicode_regularize( @_ ); + return Text::Tradition::Language::Base::regularize( @_ ); } =head2 TODO diff --git a/morphology/lib/Text/Tradition/Language/French.pm b/morphology/lib/Text/Tradition/Language/French.pm index 7af89c9..4974729 100644 --- a/morphology/lib/Text/Tradition/Language/French.pm +++ b/morphology/lib/Text/Tradition/Language/French.pm @@ -176,11 +176,21 @@ sub _parse_wordform { =head2 regularize( $text ) -Returns a regularized form of the reading for the purposes of collation. +Returns an orthographically regular form of the reading. =cut sub regularize { + return Text::Tradition::Language::Base::regularize( @_ ); +} + +=head2 collation_normalize( $text ) + +Returns a normalized form of the reading for the purposes of collation. + +=cut + +sub collation_normalize { return unicode_regularize( @_ ); } diff --git a/morphology/lib/Text/Tradition/Language/Greek.pm b/morphology/lib/Text/Tradition/Language/Greek.pm index e4159b2..c3d52be 100644 --- a/morphology/lib/Text/Tradition/Language/Greek.pm +++ b/morphology/lib/Text/Tradition/Language/Greek.pm @@ -77,11 +77,21 @@ sub reading_lookup { =head2 regularize( $text ) -Returns a regularized form of the reading for the purposes of collation. +Returns an orthographically regular form of the reading. =cut sub regularize { + return Text::Tradition::Language::Base::regularize( @_ ); +} + +=head2 collation_normalize( $text ) + +Returns a normalized form of the reading for the purposes of collation. + +=cut + +sub collation_normalize { return unicode_regularize( @_ ); } diff --git a/morphology/lib/Text/Tradition/Language/Latin.pm b/morphology/lib/Text/Tradition/Language/Latin.pm index c2b89f8..a102923 100644 --- a/morphology/lib/Text/Tradition/Language/Latin.pm +++ b/morphology/lib/Text/Tradition/Language/Latin.pm @@ -87,16 +87,33 @@ sub reading_lookup { =head2 regularize( $text ) -Returns a regularized form of the reading for the purposes of collation. +Returns an orthographically regular form of the reading. =cut +# TODO Check this against Perseus regularization standards + sub regularize { my( $word ) = @_; $word = lc( $word ); $word =~ s/v/u/g; $word =~ s/w/u/g; $word =~ s/j/i/g; + return $word; +} + +=head2 collation_normalize( $text ) + +Returns a normalized form of the reading for the purposes of collation. + +=cut + +sub collation_normalize { + my( $word ) = @_; + $word = lc( $word ); + $word =~ s/v/u/g; + $word =~ s/w/u/g; + $word =~ s/j/i/g; $word =~ s/ci/ti/g; $word =~ s/cha/ca/g; return $word; diff --git a/morphology/lib/Text/Tradition/Morphology.pm b/morphology/lib/Text/Tradition/Morphology.pm index a0581f2..7f8a93a 100644 --- a/morphology/lib/Text/Tradition/Morphology.pm +++ b/morphology/lib/Text/Tradition/Morphology.pm @@ -29,6 +29,12 @@ morphology object structure. requires 'is_identical', 'is_combinable', '_combine'; +has 'language' => ( + is => 'ro', + isa => 'Str', + predicate => 'has_language', + ); + has 'grammar_invalid' => ( is => 'rw', isa => 'Bool', @@ -166,6 +172,31 @@ sub filter_lexemes { } } +=head2 regularize + +Call the 'regularize' function of the appropriate language model on our +own reading text. This is a rules-based function distinct from 'normal_form', +which can be set to any arbitrary string. + +=cut + +# TODO Test this stuff + +sub regularize { + my $self = shift; + if( $self->has_language ) { + # If we do have a language, regularize the tokens in $answer. + my $mod = 'Text::Tradition::Language::' . $self->language; + my $rsub; + eval { load( $mod ); }; + # If a module doesn't exist for our language, use the base routine + $mod = 'Text::Tradition::Language::Base' if $@; + return $mod->can( 'regularize' )->( $self->text ); + } else { + return $self->text; + } +} + around 'is_identical' => sub { my $orig = shift; my $self = shift; diff --git a/morphology/lib/Text/Tradition/WitLanguage.pm b/morphology/lib/Text/Tradition/WitLanguage.pm index 7008c40..28d8a3c 100644 --- a/morphology/lib/Text/Tradition/WitLanguage.pm +++ b/morphology/lib/Text/Tradition/WitLanguage.pm @@ -47,7 +47,7 @@ around 'language' => sub { try { load( "Text::Tradition::Language::".$_[0] ); } catch ( $e ) { - warn( "Cannot load language module for @_: $e" ); + print STDERR "No language module defined for @_\n"; } } elsif( !$self->has_language && $self->tradition->has_language ) { return $self->tradition->language; @@ -61,16 +61,16 @@ around 'export_as_json' => sub { my $answer = $self->$orig( @_ ); if( $self->has_language || $self->tradition->has_language ) { # If we do have a language, regularize the tokens in $answer. - my $mod = "Text::Tradition::Language::" . $self->language; - load( $mod ); - my $rsub = $mod->can( 'regularize' ); + my $mod = 'Text::Tradition::Language::' . $self->language; + eval { load( $mod ); }; + # If a module doesn't exist for our language, use the base routine + $mod = 'Text::Tradition::Language::Base' if $@; + my $rsub = $mod->can( 'collation_normalize' ) || $mod->can( 'regularize' ); map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{tokens}}; if( exists $answer->{layertokens} ) { map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{layertokens}}; } - } else { - warn "Please set a language to regularize a tradition"; - } + } return $answer; };