Evaluates the string using Treetagger and Perseus, and returns the results.
+=head2 reading_lookup( $word )
+
+Returns a single-word morphological lookup of the given word using Perseus.
+
=begin testing
use Text::Tradition;
return __PACKAGE__->perseus_reading_lookup( @_ );
}
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+ my( $word ) = @_;
+ # We don't really distinguish between commas and semicolons properly
+ # in the manuscript. Make them the same.
+ $word =~ s/\./\,/g;
+
+ # Get rid of accent marks.
+ $word =~ s/՛//g;
+ # Get rid of hyphen.
+ $word =~ s/֊//g;
+ # Get rid of any backtick that falls mid-word.
+ $word =~ s/՝(.)/$1/g;
+ # Standardize ligatures.
+ $word =~ s/աւ/օ/g; # for easy vocalic comparison to ո
+ $word =~ s/և/եւ/g;
+
+ # TODO split off suspected prefix/suffix markers?
+ # Downcase the word.
+ $word = lc( $word );
+ return $word;
+}
+
1;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
use TryCatch;
+use Unicode::Normalize;
-@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags
+ unicode_regularize /;
=head1 NAME
return $a cmp $b;
}
+=head2 unicode_regularize( $word )
+
+Returns a lowercased and accent-stripped version of the word.
+
+=cut
+
+sub unicode_regularize {
+ my $word = shift;
+ my @normalized;
+ my @letters = split( '', lc( $word ) );
+ foreach my $l ( @letters ) {
+ my $d = chr( ord( NFKD( $l ) ) );
+ next unless $d =~ /[[:alnum:]]/; # toss out e.g. Greek underdots
+ push( @normalized, $d );
+ }
+ return join( '', @normalized );
+}
+
1;
=head2 TODO
use warnings;
use Lingua::TagSet::TreeTagger::English;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
- lfs_morph_tags /;
+ lfs_morph_tags unicode_regularize /;
use TryCatch;
=head1 NAME
1;
+
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+ return unicode_regularize( @_ );
+}
+
=head2 TODO
=over
use Lingua::TagSet::Multext;
use Lingua::TagSet::TreeTagger::French;
use Module::Load qw/ load /;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
+ lfs_morph_tags unicode_regularize /;
use TryCatch;
=head1 NAME
return @forms;
}
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+ return unicode_regularize( @_ );
+}
+
1;
=head2 TODO
use warnings;
use Module::Load;
use parent qw/ Text::Tradition::Language::Perseus /;
+use Text::Tradition::Language::Base qw/ unicode_regularize /;
=head1 NAME
Evaluates the string using Treetagger and Perseus, and returns the results.
+=head2 reading_lookup( $word )
+
+Returns a single-word morphological lookup of the given word using Perseus.
+
=begin testing
use Text::Tradition;
return __PACKAGE__->perseus_reading_lookup( @_ );
}
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+ return unicode_regularize( @_ );
+}
+
1;
Evaluates the string using Treetagger and Perseus, and returns the results.
+=head2 reading_lookup( $word )
+
+Returns a single-word morphological lookup of the given word using Perseus.
+
=begin testing
use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );
+my $trad = Text::Tradition->new(
+ 'language' => 'Latin',
+ 'file' => 't/data/legendfrag.xml',
+ 'input' => 'Self' );
+
eval "use Lingua::Morph::Perseus";
my $err = $@;
-
SKIP: {
skip "Package Lingua::Morph::Perseus not found" if $err;
- my $trad = Text::Tradition->new(
- 'language' => 'Latin',
- 'file' => 't/data/legendfrag.xml',
- 'input' => 'Self' );
$trad->lemmatize();
my $ambig = 0;
foreach my $r ( $trad->collation->readings ) {
}
is( $ambig, 4, "Found 4 ambiguous forms as expected" );
}
+
+# Try exporting some witnesses
+my $e_v = 'in suetia uenerabilis pontifex beatus henricus in anglia oriundus';
+my $struct_v = $trad->witness('V')->export_as_json;
+my $g_v = join( ' ', map { $_->{'n'} } @{$struct_v->{'tokens'}} );
+is( $g_v, $e_v, "Got expected regularization of witness V" );
+my $e_n = 'in suetia beatus henricus uenerabilis pontifex de anglia oriundus';
+my $struct_n = $trad->witness('N')->export_as_json;
+my $g_n = join( ' ', map { $_->{'n'} } @{$struct_n->{'tokens'}} );
+is( $g_n, $e_n, "Got expected regularization of witness N" );
+
=end testing
return __PACKAGE__->perseus_reading_lookup( @_ );
}
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+ my( $word ) = @_;
+ $word = lc( $word );
+ $word =~ s/v/u/g;
+ $word =~ s/w/u/g;
+ $word =~ s/j/i/g;
+ $word =~ s/ci/ti/g;
+ $word =~ s/cha/ca/g;
+ return $word;
+}
+
1;
--- /dev/null
+package Text::Tradition::WitLanguage;
+
+use strict;
+use warnings;
+use Module::Load;
+use Moose::Role;
+use TryCatch;
+
+=head1 NAME
+
+Text::Tradition::WitLanguage - add-on role to enable language awareness and
+morphology functions to a Text::Tradition::Witness object. Please see
+L<Text::Tradition::Morphology> for more information on the morphology
+add-on distribution.
+
+=head1 METHODS
+
+=head2 language
+
+Accessor for the primary language of the tradition. Must correspond to one
+of the Text::Tradition::Language::* modules in this package. Used for JSON
+export of a language-regularized witness text.
+
+=begin testing
+
+use Test::Warn;
+use TryCatch;
+use_ok( 'Text::Tradition' ); # with Language
+use_ok( 'Text::Tradition::Witness' ); # with WitLanguage
+
+=end testing
+
+=cut
+
+has 'language' => (
+ is => 'rw',
+ isa => 'Str',
+ predicate => 'has_language',
+ );
+
+around 'language' => sub {
+ my $orig = shift;
+ my $self = shift;
+ if( @_ && $_[0] ne 'Default' ) {
+ # We are trying to set the language; check that the corresponding
+ # module exists.
+ try {
+ load( "Text::Tradition::Language::".$_[0] );
+ } catch ( $e ) {
+ warn( "Cannot load language module for @_: $e" );
+ }
+ } elsif( !$self->has_language && $self->tradition->has_language ) {
+ return $self->tradition->language;
+ }
+ $self->$orig( @_ );
+};
+
+around 'export_as_json' => sub {
+ my $orig = shift;
+ my $self = shift;
+ my $answer = $self->$orig( @_ );
+ if( $self->has_language || $self->tradition->has_language ) {
+ # If we do have a language, regularize the tokens in $answer.
+ my $mod = "Text::Tradition::Language::" . $self->language;
+ load( $mod );
+ my $rsub = $mod->can( 'regularize' );
+ map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{tokens}};
+ if( exists $answer->{layertokens} ) {
+ map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{layertokens}};
+ }
+ } else {
+ warn "Please set a language to regularize a tradition";
+ }
+ return $answer;
+};
+
+1;
+
use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );
+my $trad = Text::Tradition->new(
+ 'language' => 'Latin',
+ 'file' => 't/data/legendfrag.xml',
+ 'input' => 'Self' );
+
eval "use Lingua::Morph::Perseus";
my $err = $@;
-
SKIP: {
skip "Package Lingua::Morph::Perseus not found" if $err;
- my $trad = Text::Tradition->new(
- 'language' => 'Latin',
- 'file' => 't/data/legendfrag.xml',
- 'input' => 'Self' );
$trad->lemmatize();
my $ambig = 0;
foreach my $r ( $trad->collation->readings ) {
}
is( $ambig, 4, "Found 4 ambiguous forms as expected" );
}
+
+# Try exporting some witnesses
+my $e_v = 'in suetia uenerabilis pontifex beatus henricus in anglia oriundus';
+my $struct_v = $trad->witness('V')->export_as_json;
+my $g_v = join( ' ', map { $_->{'n'} } @{$struct_v->{'tokens'}} );
+is( $g_v, $e_v, "Got expected regularization of witness V" );
+my $e_n = 'in suetia beatus henricus uenerabilis pontifex de anglia oriundus';
+my $struct_n = $trad->witness('N')->export_as_json;
+my $g_n = join( ' ', map { $_->{'n'} } @{$struct_n->{'tokens'}} );
+is( $g_n, $e_n, "Got expected regularization of witness N" );
}
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Test::Warn;
+use TryCatch;
+use_ok( 'Text::Tradition' ); # with Language
+use_ok( 'Text::Tradition::Witness' ); # with WitLanguage
+}
+
+
+
+
+1;