=head2 regularize( $text )
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
=cut
sub regularize {
my( $word ) = @_;
- # We don't really distinguish between commas and semicolons properly
- # in the manuscript. Make them the same.
- $word =~ s/\./\,/g;
-
# Get rid of accent marks.
$word =~ s/՛//g;
# Get rid of hyphen.
$word =~ s/աւ/օ/g; # for easy vocalic comparison to ո
$word =~ s/և/եւ/g;
- # TODO split off suspected prefix/suffix markers?
# Downcase the word.
$word = lc( $word );
return $word;
return join( '', @normalized );
}
+=head2 regularize( $word )
+
+Fallback function for reading regularization in the event that it is not
+implemented by a module. Currently just lowercases the string.
+
+=cut
+
+sub regularize {
+ return lc( $_[0] );
+}
+
1;
=head2 TODO
=item * Handle package dependencies more gracefully
+=item * Make this some kind of real base class for the language modules
+
=back
=head2 regularize( $text )
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
=cut
sub regularize {
- return unicode_regularize( @_ );
+ return Text::Tradition::Language::Base::regularize( @_ );
}
=head2 TODO
=head2 regularize( $text )
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
=cut
sub regularize {
+ return Text::Tradition::Language::Base::regularize( @_ );
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
return unicode_regularize( @_ );
}
=head2 regularize( $text )
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
=cut
sub regularize {
+ return Text::Tradition::Language::Base::regularize( @_ );
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
return unicode_regularize( @_ );
}
=head2 regularize( $text )
-Returns a regularized form of the reading for the purposes of collation.
+Returns an orthographically regular form of the reading.
=cut
+# TODO Check this against Perseus regularization standards
+
sub regularize {
my( $word ) = @_;
$word = lc( $word );
$word =~ s/v/u/g;
$word =~ s/w/u/g;
$word =~ s/j/i/g;
+ return $word;
+}
+
+=head2 collation_normalize( $text )
+
+Returns a normalized form of the reading for the purposes of collation.
+
+=cut
+
+sub collation_normalize {
+ my( $word ) = @_;
+ $word = lc( $word );
+ $word =~ s/v/u/g;
+ $word =~ s/w/u/g;
+ $word =~ s/j/i/g;
$word =~ s/ci/ti/g;
$word =~ s/cha/ca/g;
return $word;
requires 'is_identical', 'is_combinable', '_combine';
+has 'language' => (
+ is => 'ro',
+ isa => 'Str',
+ predicate => 'has_language',
+ );
+
has 'grammar_invalid' => (
is => 'rw',
isa => 'Bool',
}
}
+=head2 regularize
+
+Call the 'regularize' function of the appropriate language model on our
+own reading text. This is a rules-based function distinct from 'normal_form',
+which can be set to any arbitrary string.
+
+=cut
+
+# TODO Test this stuff
+
+sub regularize {
+ my $self = shift;
+ if( $self->has_language ) {
+ # If we do have a language, regularize the tokens in $answer.
+ my $mod = 'Text::Tradition::Language::' . $self->language;
+ my $rsub;
+ eval { load( $mod ); };
+ # If a module doesn't exist for our language, use the base routine
+ $mod = 'Text::Tradition::Language::Base' if $@;
+ return $mod->can( 'regularize' )->( $self->text );
+ } else {
+ return $self->text;
+ }
+}
+
around 'is_identical' => sub {
my $orig = shift;
my $self = shift;
try {
load( "Text::Tradition::Language::".$_[0] );
} catch ( $e ) {
- warn( "Cannot load language module for @_: $e" );
+ print STDERR "No language module defined for @_\n";
}
} elsif( !$self->has_language && $self->tradition->has_language ) {
return $self->tradition->language;
my $answer = $self->$orig( @_ );
if( $self->has_language || $self->tradition->has_language ) {
# If we do have a language, regularize the tokens in $answer.
- my $mod = "Text::Tradition::Language::" . $self->language;
- load( $mod );
- my $rsub = $mod->can( 'regularize' );
+ my $mod = 'Text::Tradition::Language::' . $self->language;
+ eval { load( $mod ); };
+ # If a module doesn't exist for our language, use the base routine
+ $mod = 'Text::Tradition::Language::Base' if $@;
+ my $rsub = $mod->can( 'collation_normalize' ) || $mod->can( 'regularize' );
map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{tokens}};
if( exists $answer->{layertokens} ) {
map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{layertokens}};
}
- } else {
- warn "Please set a language to regularize a tradition";
- }
+ }
return $answer;
};