From: Tara L Andrews <tla@mit.edu>
Date: Thu, 27 Sep 2012 11:51:23 +0000 (+0200)
Subject: distinguish between collation fuzzy normalization and orthographic regularization
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=58f9c2b92303a12ae9811463875daa4220e05722;p=scpubgit%2Fstemmatology.git

distinguish between collation fuzzy normalization and orthographic regularization
---

diff --git a/morphology/lib/Text/Tradition/Language/Armenian.pm b/morphology/lib/Text/Tradition/Language/Armenian.pm
index da962ef..a28204f 100644
--- a/morphology/lib/Text/Tradition/Language/Armenian.pm
+++ b/morphology/lib/Text/Tradition/Language/Armenian.pm
@@ -76,16 +76,12 @@ sub reading_lookup {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
 	my( $word ) = @_;
-	# We don't really distinguish between commas and semicolons properly
-	# in the manuscript.  Make them the same.
-	$word =~ s/\./\,/g;
-
 	# Get rid of accent marks.
 	$word =~ s/Õ//g;
 	# Get rid of hyphen.
@@ -96,7 +92,6 @@ sub regularize {
 	$word =~ s/Õ¡Ö/Ö/g;	# for easy vocalic comparison to Õ¸
 	$word =~ s/Ö/Õ¥Ö/g;
 	
-	# TODO split off suspected prefix/suffix markers?
 	# Downcase the word.
 	$word = lc( $word );
 	return $word;
diff --git a/morphology/lib/Text/Tradition/Language/Base.pm b/morphology/lib/Text/Tradition/Language/Base.pm
index fa57723..5f46b9b 100644
--- a/morphology/lib/Text/Tradition/Language/Base.pm
+++ b/morphology/lib/Text/Tradition/Language/Base.pm
@@ -303,6 +303,17 @@ sub unicode_regularize {
 	return join( '', @normalized );
 }
 
+=head2 regularize( $word )
+
+Fallback function for reading regularization in the event that it is not
+implemented by a module. Currently just lowercases the string.
+
+=cut
+
+sub regularize { 
+	return lc( $_[0] );
+}
+
 1;
 
 =head2 TODO
@@ -311,5 +322,7 @@ sub unicode_regularize {
 
 =item * Handle package dependencies more gracefully
 
+=item * Make this some kind of real base class for the language modules
+
 =back
 
diff --git a/morphology/lib/Text/Tradition/Language/English.pm b/morphology/lib/Text/Tradition/Language/English.pm
index bb62fb5..e772b83 100644
--- a/morphology/lib/Text/Tradition/Language/English.pm
+++ b/morphology/lib/Text/Tradition/Language/English.pm
@@ -94,12 +94,12 @@ sub _parse_wordform {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
-	return unicode_regularize( @_ );
+	return Text::Tradition::Language::Base::regularize( @_ );
 }
 
 =head2 TODO
diff --git a/morphology/lib/Text/Tradition/Language/French.pm b/morphology/lib/Text/Tradition/Language/French.pm
index 7af89c9..4974729 100644
--- a/morphology/lib/Text/Tradition/Language/French.pm
+++ b/morphology/lib/Text/Tradition/Language/French.pm
@@ -176,11 +176,21 @@ sub _parse_wordform {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
+	return Text::Tradition::Language::Base::regularize( @_ );
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
 	return unicode_regularize( @_ );
 }
 
diff --git a/morphology/lib/Text/Tradition/Language/Greek.pm b/morphology/lib/Text/Tradition/Language/Greek.pm
index e4159b2..c3d52be 100644
--- a/morphology/lib/Text/Tradition/Language/Greek.pm
+++ b/morphology/lib/Text/Tradition/Language/Greek.pm
@@ -77,11 +77,21 @@ sub reading_lookup {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
 sub regularize {
+	return Text::Tradition::Language::Base::regularize( @_ );
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
 	return unicode_regularize( @_ );
 }
 
diff --git a/morphology/lib/Text/Tradition/Language/Latin.pm b/morphology/lib/Text/Tradition/Language/Latin.pm
index c2b89f8..a102923 100644
--- a/morphology/lib/Text/Tradition/Language/Latin.pm
+++ b/morphology/lib/Text/Tradition/Language/Latin.pm
@@ -87,16 +87,33 @@ sub reading_lookup {
 
 =head2 regularize( $text )
 
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
 
 =cut
 
+# TODO Check this against Perseus regularization standards
+
 sub regularize {
 	my( $word ) = @_;
 	$word = lc( $word );
 	$word =~ s/v/u/g;
 	$word =~ s/w/u/g;
 	$word =~ s/j/i/g;
+	return $word;
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
+	my( $word ) = @_;
+	$word = lc( $word );
+	$word =~ s/v/u/g;
+	$word =~ s/w/u/g;
+	$word =~ s/j/i/g;
 	$word =~ s/ci/ti/g;
 	$word =~ s/cha/ca/g;
 	return $word;
diff --git a/morphology/lib/Text/Tradition/Morphology.pm b/morphology/lib/Text/Tradition/Morphology.pm
index a0581f2..7f8a93a 100644
--- a/morphology/lib/Text/Tradition/Morphology.pm
+++ b/morphology/lib/Text/Tradition/Morphology.pm
@@ -29,6 +29,12 @@ morphology object structure.
 
 requires 'is_identical', 'is_combinable', '_combine';
 
+has 'language' => (
+	is => 'ro',
+	isa => 'Str',
+	predicate => 'has_language',
+	);
+	
 has 'grammar_invalid' => (
 	is => 'rw',
 	isa => 'Bool',
@@ -166,6 +172,31 @@ sub filter_lexemes {
 	}
 }
 
+=head2 regularize
+
+Call the 'regularize' function of the appropriate language model on our
+own reading text. This is a rules-based function distinct from 'normal_form',
+which can be set to any arbitrary string.
+
+=cut
+
+# TODO Test this stuff
+
+sub regularize {
+	my $self = shift;
+	if( $self->has_language ) {
+		# If we do have a language, regularize the tokens in $answer.
+		my $mod = 'Text::Tradition::Language::' . $self->language;
+		my $rsub;
+		eval { load( $mod ); };
+		# If a module doesn't exist for our language, use the base routine
+		$mod = 'Text::Tradition::Language::Base' if $@;
+		return $mod->can( 'regularize' )->( $self->text );
+	} else {
+		return $self->text;
+	}
+}
+
 around 'is_identical' => sub {
 	my $orig = shift;
 	my $self = shift;
diff --git a/morphology/lib/Text/Tradition/WitLanguage.pm b/morphology/lib/Text/Tradition/WitLanguage.pm
index 7008c40..28d8a3c 100644
--- a/morphology/lib/Text/Tradition/WitLanguage.pm
+++ b/morphology/lib/Text/Tradition/WitLanguage.pm
@@ -47,7 +47,7 @@ around 'language' => sub {
 		try {
 			load( "Text::Tradition::Language::".$_[0] );
 		} catch ( $e ) {
-			warn( "Cannot load language module for @_: $e" );
+			print STDERR "No language module defined for @_\n";
 		}
 	} elsif( !$self->has_language && $self->tradition->has_language ) {
 		return $self->tradition->language;
@@ -61,16 +61,16 @@ around 'export_as_json' => sub {
 	my $answer = $self->$orig( @_ );
 	if( $self->has_language || $self->tradition->has_language ) {
 		# If we do have a language, regularize the tokens in $answer.
-		my $mod = "Text::Tradition::Language::" . $self->language;
-		load( $mod );
-		my $rsub = $mod->can( 'regularize' );
+		my $mod = 'Text::Tradition::Language::' . $self->language;
+		eval { load( $mod ); };
+		# If a module doesn't exist for our language, use the base routine
+		$mod = 'Text::Tradition::Language::Base' if $@;
+		my $rsub = $mod->can( 'collation_normalize' ) || $mod->can( 'regularize' );
 		map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{tokens}};
 		if( exists $answer->{layertokens} ) {
 			map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{layertokens}};
 		}
-	} else {
-		warn "Please set a language to regularize a tradition";
-	}
+	} 
 	return $answer;
 };