distinguish between collation fuzzy normalization and orthographic regularization

diff --git a/morphology/lib/Text/Tradition/Language/Armenian.pm b/morphology/lib/Text/Tradition/Language/Armenian.pm

index da962ef..a28204f 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Armenian.pm
+++ b/morphology/lib/Text/Tradition/Language/Armenian.pm
@@ -76,16 +76,12 @@ sub reading_lookup {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
        my( $word ) = @_;
-       # We don't really distinguish between commas and semicolons properly
-       # in the manuscript.  Make them the same.
-       $word =~ s/\./\,/g;
-
        # Get rid of accent marks.
        $word =~ s/՛//g;
        # Get rid of hyphen.
@@ -96,7 +92,6 @@ sub regularize {
        $word =~ s/աւ/օ/g;   # for easy vocalic comparison to ո
        $word =~ s/և/եւ/g;
        
-       # TODO split off suspected prefix/suffix markers?
        # Downcase the word.
        $word = lc( $word );
        return $word;
diff --git a/morphology/lib/Text/Tradition/Language/Base.pm b/morphology/lib/Text/Tradition/Language/Base.pm

index fa57723..5f46b9b 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Base.pm
+++ b/morphology/lib/Text/Tradition/Language/Base.pm
@@ -303,6 +303,17 @@ sub unicode_regularize {
        return join( '', @normalized );
 }
 
+=head2 regularize( $word )
+
+Fallback function for reading regularization in the event that it is not
+implemented by a module. Currently just lowercases the string.
+
+=cut
+
+sub regularize { 
+       return lc( $_[0] );
+}
+
 1;
 
 =head2 TODO
@@ -311,5 +322,7 @@ sub unicode_regularize {
 
 =item * Handle package dependencies more gracefully
 
+=item * Make this some kind of real base class for the language modules
+
 =back
 
diff --git a/morphology/lib/Text/Tradition/Language/English.pm b/morphology/lib/Text/Tradition/Language/English.pm

index bb62fb5..e772b83 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/English.pm
+++ b/morphology/lib/Text/Tradition/Language/English.pm
@@ -94,12 +94,12 @@ sub _parse_wordform {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
-       return unicode_regularize( @_ );
+       return Text::Tradition::Language::Base::regularize( @_ );
 }
 
 =head2 TODO
diff --git a/morphology/lib/Text/Tradition/Language/French.pm b/morphology/lib/Text/Tradition/Language/French.pm

index 7af89c9..4974729 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/French.pm
+++ b/morphology/lib/Text/Tradition/Language/French.pm
@@ -176,11 +176,21 @@ sub _parse_wordform {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
+       return Text::Tradition::Language::Base::regularize( @_ );
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
        return unicode_regularize( @_ );
 }
 
diff --git a/morphology/lib/Text/Tradition/Language/Greek.pm b/morphology/lib/Text/Tradition/Language/Greek.pm

index e4159b2..c3d52be 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Greek.pm
+++ b/morphology/lib/Text/Tradition/Language/Greek.pm
@@ -77,11 +77,21 @@ sub reading_lookup {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
+       return Text::Tradition::Language::Base::regularize( @_ );
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
        return unicode_regularize( @_ );
 }
 
diff --git a/morphology/lib/Text/Tradition/Language/Latin.pm b/morphology/lib/Text/Tradition/Language/Latin.pm

index c2b89f8..a102923 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Latin.pm
+++ b/morphology/lib/Text/Tradition/Language/Latin.pm
@@ -87,16 +87,33 @@ sub reading_lookup {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
+# TODO Check this against Perseus regularization standards
+
 sub regularize {
        my( $word ) = @_;
        $word = lc( $word );
        $word =~ s/v/u/g;
        $word =~ s/w/u/g;
        $word =~ s/j/i/g;
+       return $word;
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
+       my( $word ) = @_;
+       $word = lc( $word );
+       $word =~ s/v/u/g;
+       $word =~ s/w/u/g;
+       $word =~ s/j/i/g;
        $word =~ s/ci/ti/g;
        $word =~ s/cha/ca/g;
        return $word;
diff --git a/morphology/lib/Text/Tradition/Morphology.pm b/morphology/lib/Text/Tradition/Morphology.pm

index a0581f2..7f8a93a 100644 (file)
--- a/morphology/lib/Text/Tradition/Morphology.pm
+++ b/morphology/lib/Text/Tradition/Morphology.pm
@@ -29,6 +29,12 @@ morphology object structure.
 
 requires 'is_identical', 'is_combinable', '_combine';
 
+has 'language' => (
+       is => 'ro',
+       isa => 'Str',
+       predicate => 'has_language',
+       );
+       
 has 'grammar_invalid' => (
        is => 'rw',
        isa => 'Bool',
@@ -166,6 +172,31 @@ sub filter_lexemes {
        }
 }
 
+=head2 regularize
+
+Call the 'regularize' function of the appropriate language model on our
+own reading text. This is a rules-based function distinct from 'normal_form',
+which can be set to any arbitrary string.
+
+=cut
+
+# TODO Test this stuff
+
+sub regularize {
+       my $self = shift;
+       if( $self->has_language ) {
+               # If we do have a language, regularize the tokens in $answer.
+               my $mod = 'Text::Tradition::Language::' . $self->language;
+               my $rsub;
+               eval { load( $mod ); };
+               # If a module doesn't exist for our language, use the base routine
+               $mod = 'Text::Tradition::Language::Base' if $@;
+               return $mod->can( 'regularize' )->( $self->text );
+       } else {
+               return $self->text;
+       }
+}
+
 around 'is_identical' => sub {
        my $orig = shift;
        my $self = shift;
diff --git a/morphology/lib/Text/Tradition/WitLanguage.pm b/morphology/lib/Text/Tradition/WitLanguage.pm

index 7008c40..28d8a3c 100644 (file)
--- a/morphology/lib/Text/Tradition/WitLanguage.pm
+++ b/morphology/lib/Text/Tradition/WitLanguage.pm
@@ -47,7 +47,7 @@ around 'language' => sub {
                try {
                        load( "Text::Tradition::Language::".$_[0] );
                } catch ( $e ) {
-                       warn( "Cannot load language module for @_: $e" );
+                       print STDERR "No language module defined for @_\n";
                }
        } elsif( !$self->has_language && $self->tradition->has_language ) {
                return $self->tradition->language;
@@ -61,16 +61,16 @@ around 'export_as_json' => sub {
        my $answer = $self->$orig( @_ );
        if( $self->has_language || $self->tradition->has_language ) {
                # If we do have a language, regularize the tokens in $answer.
-               my $mod = "Text::Tradition::Language::" . $self->language;
-               load( $mod );
-               my $rsub = $mod->can( 'regularize' );
+               my $mod = 'Text::Tradition::Language::' . $self->language;
+               eval { load( $mod ); };
+               # If a module doesn't exist for our language, use the base routine
+               $mod = 'Text::Tradition::Language::Base' if $@;
+               my $rsub = $mod->can( 'collation_normalize' ) || $mod->can( 'regularize' );
                map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{tokens}};
                if( exists $answer->{layertokens} ) {
                        map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{layertokens}};
                }
-       } else {
-               warn "Please set a language to regularize a tradition";
-       }
+       } 
        return $answer;
 };
morphology/lib/Text/Tradition/Language/Armenian.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/Base.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/English.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/French.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/Greek.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/Latin.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Morphology.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/WitLanguage.pm		patch \| blob \| blame \| history