add witness role for collation regularization

diff --git a/morphology/lib/Text/Tradition/Language/Armenian.pm b/morphology/lib/Text/Tradition/Language/Armenian.pm

index 3030f0f..da962ef 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Armenian.pm
+++ b/morphology/lib/Text/Tradition/Language/Armenian.pm
@@ -20,6 +20,10 @@ depends on the Lingua::Morph::Perseus module for access to PhiloLogic database d
 
 Evaluates the string using Treetagger and Perseus, and returns the results.
 
+=head2 reading_lookup( $word )
+
+Returns a single-word morphological lookup of the given word using Perseus.
+
 =begin testing
 
 use Text::Tradition;
@@ -70,5 +74,33 @@ sub reading_lookup {
        return __PACKAGE__->perseus_reading_lookup( @_ );
 }
 
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+       my( $word ) = @_;
+       # We don't really distinguish between commas and semicolons properly
+       # in the manuscript.  Make them the same.
+       $word =~ s/\./\,/g;
+
+       # Get rid of accent marks.
+       $word =~ s/՛//g;
+       # Get rid of hyphen.
+       $word =~ s/֊//g;
+       # Get rid of any backtick that falls mid-word.
+       $word =~ s/՝(.)/$1/g;
+       # Standardize ligatures.
+       $word =~ s/աւ/օ/g;   # for easy vocalic comparison to ո
+       $word =~ s/և/եւ/g;
+       
+       # TODO split off suspected prefix/suffix markers?
+       # Downcase the word.
+       $word = lc( $word );
+       return $word;
+}
+
 1;
 
diff --git a/morphology/lib/Text/Tradition/Language/Base.pm b/morphology/lib/Text/Tradition/Language/Base.pm

index fd56c04..fa57723 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Base.pm
+++ b/morphology/lib/Text/Tradition/Language/Base.pm
@@ -10,8 +10,10 @@ use Module::Load;
 use Text::Tradition::Collation::Reading::Lexeme;
 use Text::Tradition::Collation::Reading::WordForm;
 use TryCatch;
+use Unicode::Normalize;
 
-@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags 
+       unicode_regularize /;
 
 =head1 NAME
 
@@ -283,6 +285,24 @@ sub _by_structid {
        return $a cmp $b;
 }
 
+=head2 unicode_regularize( $word )
+
+Returns a lowercased and accent-stripped version of the word.
+
+=cut
+
+sub unicode_regularize {
+       my $word = shift;
+       my @normalized;
+       my @letters = split( '', lc( $word ) );
+       foreach my $l ( @letters ) {
+               my $d = chr( ord( NFKD( $l ) ) );
+               next unless $d =~ /[[:alnum:]]/; # toss out e.g. Greek underdots
+               push( @normalized, $d );
+       }
+       return join( '', @normalized );
+}
+
 1;
 
 =head2 TODO
diff --git a/morphology/lib/Text/Tradition/Language/English.pm b/morphology/lib/Text/Tradition/Language/English.pm

index 4277d62..bb62fb5 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/English.pm
+++ b/morphology/lib/Text/Tradition/Language/English.pm
@@ -4,7 +4,7 @@ use strict;
 use warnings;
 use Lingua::TagSet::TreeTagger::English;
 use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
-       lfs_morph_tags /;
+       lfs_morph_tags unicode_regularize /;
 use TryCatch;
 
 =head1 NAME
@@ -91,6 +91,17 @@ sub _parse_wordform {
 
 1;
 
+
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+       return unicode_regularize( @_ );
+}
+
 =head2 TODO
 
 =over
diff --git a/morphology/lib/Text/Tradition/Language/French.pm b/morphology/lib/Text/Tradition/Language/French.pm

index 13ad869..7af89c9 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/French.pm
+++ b/morphology/lib/Text/Tradition/Language/French.pm
@@ -5,7 +5,8 @@ use warnings;
 use Lingua::TagSet::Multext;
 use Lingua::TagSet::TreeTagger::French;
 use Module::Load qw/ load /;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger 
+       lfs_morph_tags unicode_regularize /;
 use TryCatch;
 
 =head1 NAME
@@ -173,6 +174,16 @@ sub _parse_wordform {
        return @forms;
 }
 
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+       return unicode_regularize( @_ );
+}
+
 1;
 
 =head2 TODO
diff --git a/morphology/lib/Text/Tradition/Language/Greek.pm b/morphology/lib/Text/Tradition/Language/Greek.pm

index 8106ec6..e4159b2 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Greek.pm
+++ b/morphology/lib/Text/Tradition/Language/Greek.pm
@@ -4,6 +4,7 @@ use strict;
 use warnings;
 use Module::Load;
 use parent qw/ Text::Tradition::Language::Perseus /;
+use Text::Tradition::Language::Base qw/ unicode_regularize /;
 
 =head1 NAME
 
@@ -20,6 +21,10 @@ depends on the Lingua::Morph::Perseus module for access to PhiloLogic database d
 
 Evaluates the string using Treetagger and Perseus, and returns the results.
 
+=head2 reading_lookup( $word )
+
+Returns a single-word morphological lookup of the given word using Perseus.
+
 =begin testing
 
 use Text::Tradition;
@@ -70,5 +75,15 @@ sub reading_lookup {
        return __PACKAGE__->perseus_reading_lookup( @_ );
 }
 
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+       return unicode_regularize( @_ );
+}
+
 1;
 
diff --git a/morphology/lib/Text/Tradition/Language/Latin.pm b/morphology/lib/Text/Tradition/Language/Latin.pm

index ad36208..c2b89f8 100644 (file)
--- a/morphology/lib/Text/Tradition/Language/Latin.pm
+++ b/morphology/lib/Text/Tradition/Language/Latin.pm
@@ -20,21 +20,25 @@ depends on the Lingua::Morph::Perseus module for access to PhiloLogic database d
 
 Evaluates the string using Treetagger and Perseus, and returns the results.
 
+=head2 reading_lookup( $word )
+
+Returns a single-word morphological lookup of the given word using Perseus.
+
 =begin testing
 
 use Text::Tradition;
 use_ok( 'Text::Tradition::Language::Latin' );
 
+my $trad = Text::Tradition->new(
+       'language' => 'Latin',
+       'file' => 't/data/legendfrag.xml',
+       'input' => 'Self' );
+
 eval "use Lingua::Morph::Perseus";
 my $err = $@;
-
 SKIP: {
        skip "Package Lingua::Morph::Perseus not found" if $err;
 
-       my $trad = Text::Tradition->new(
-               'language' => 'Latin',
-               'file' => 't/data/legendfrag.xml',
-               'input' => 'Self' );
        $trad->lemmatize();
        my $ambig = 0;
        foreach my $r ( $trad->collation->readings ) {
@@ -55,6 +59,17 @@ SKIP: {
        }
        is( $ambig, 4, "Found 4 ambiguous forms as expected" );
 }
+       
+# Try exporting some witnesses
+my $e_v = 'in suetia uenerabilis pontifex beatus henricus in anglia oriundus';
+my $struct_v = $trad->witness('V')->export_as_json;
+my $g_v = join( ' ', map { $_->{'n'} } @{$struct_v->{'tokens'}} );
+is( $g_v, $e_v, "Got expected regularization of witness V" );
+my $e_n = 'in suetia beatus henricus uenerabilis pontifex de anglia oriundus';
+my $struct_n = $trad->witness('N')->export_as_json;
+my $g_n = join( ' ', map { $_->{'n'} } @{$struct_n->{'tokens'}} );
+is( $g_n, $e_n, "Got expected regularization of witness N" );
+
 
 =end testing
 
@@ -70,5 +85,22 @@ sub reading_lookup {
        return __PACKAGE__->perseus_reading_lookup( @_ );
 }
 
+=head2 regularize( $text )
+
+Returns a regularized form of the reading for the purposes of collation.
+
+=cut
+
+sub regularize {
+       my( $word ) = @_;
+       $word = lc( $word );
+       $word =~ s/v/u/g;
+       $word =~ s/w/u/g;
+       $word =~ s/j/i/g;
+       $word =~ s/ci/ti/g;
+       $word =~ s/cha/ca/g;
+       return $word;
+}
+
 1;
 
diff --git a/morphology/lib/Text/Tradition/WitLanguage.pm b/morphology/lib/Text/Tradition/WitLanguage.pm

new file mode 100644 (file)

index 0000000..7008c40
--- /dev/null
+++ b/morphology/lib/Text/Tradition/WitLanguage.pm
@@ -0,0 +1,78 @@
+package Text::Tradition::WitLanguage;
+
+use strict;
+use warnings;
+use Module::Load;
+use Moose::Role;
+use TryCatch;
+
+=head1 NAME
+
+Text::Tradition::WitLanguage - add-on role to enable language awareness and 
+morphology functions to a Text::Tradition::Witness object.  Please see
+L<Text::Tradition::Morphology> for more information on the morphology 
+add-on distribution.
+
+=head1 METHODS
+
+=head2 language
+
+Accessor for the primary language of the tradition. Must correspond to one
+of the Text::Tradition::Language::* modules in this package. Used for JSON
+export of a language-regularized witness text.
+
+=begin testing
+
+use Test::Warn;
+use TryCatch;
+use_ok( 'Text::Tradition' ); # with Language
+use_ok( 'Text::Tradition::Witness' ); # with WitLanguage
+
+=end testing
+
+=cut
+
+has 'language' => (
+       is => 'rw',
+       isa => 'Str',
+       predicate => 'has_language',
+       );
+       
+around 'language' => sub {
+       my $orig = shift;
+       my $self = shift;
+       if( @_ && $_[0] ne 'Default' ) {
+               # We are trying to set the language; check that the corresponding
+               # module exists.
+               try {
+                       load( "Text::Tradition::Language::".$_[0] );
+               } catch ( $e ) {
+                       warn( "Cannot load language module for @_: $e" );
+               }
+       } elsif( !$self->has_language && $self->tradition->has_language ) {
+               return $self->tradition->language;
+       }
+       $self->$orig( @_ );
+};
+    
+around 'export_as_json' => sub {
+       my $orig = shift;
+       my $self = shift;
+       my $answer = $self->$orig( @_ );
+       if( $self->has_language || $self->tradition->has_language ) {
+               # If we do have a language, regularize the tokens in $answer.
+               my $mod = "Text::Tradition::Language::" . $self->language;
+               load( $mod );
+               my $rsub = $mod->can( 'regularize' );
+               map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{tokens}};
+               if( exists $answer->{layertokens} ) {
+                       map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{layertokens}};
+               }
+       } else {
+               warn "Please set a language to regularize a tradition";
+       }
+       return $answer;
+};
+
+1;
+
diff --git a/morphology/t/text_tradition_language_latin.t b/morphology/t/text_tradition_language_latin.t

index 4b948f6..ac2a877 100644 (file)
--- a/morphology/t/text_tradition_language_latin.t
+++ b/morphology/t/text_tradition_language_latin.t
@@ -11,16 +11,16 @@ $| = 1;
 use Text::Tradition;
 use_ok( 'Text::Tradition::Language::Latin' );
 
+my $trad = Text::Tradition->new(
+       'language' => 'Latin',
+       'file' => 't/data/legendfrag.xml',
+       'input' => 'Self' );
+
 eval "use Lingua::Morph::Perseus";
 my $err = $@;
-
 SKIP: {
        skip "Package Lingua::Morph::Perseus not found" if $err;
 
-       my $trad = Text::Tradition->new(
-               'language' => 'Latin',
-               'file' => 't/data/legendfrag.xml',
-               'input' => 'Self' );
        $trad->lemmatize();
        my $ambig = 0;
        foreach my $r ( $trad->collation->readings ) {
@@ -41,6 +41,16 @@ SKIP: {
        }
        is( $ambig, 4, "Found 4 ambiguous forms as expected" );
 }
+       
+# Try exporting some witnesses
+my $e_v = 'in suetia uenerabilis pontifex beatus henricus in anglia oriundus';
+my $struct_v = $trad->witness('V')->export_as_json;
+my $g_v = join( ' ', map { $_->{'n'} } @{$struct_v->{'tokens'}} );
+is( $g_v, $e_v, "Got expected regularization of witness V" );
+my $e_n = 'in suetia beatus henricus uenerabilis pontifex de anglia oriundus';
+my $struct_n = $trad->witness('N')->export_as_json;
+my $g_n = join( ' ', map { $_->{'n'} } @{$struct_n->{'tokens'}} );
+is( $g_n, $e_n, "Got expected regularization of witness N" );
 }
 
 
diff --git a/morphology/t/text_tradition_witlanguage.t b/morphology/t/text_tradition_witlanguage.t

new file mode 100644 (file)

index 0000000..97349b5
--- /dev/null
+++ b/morphology/t/text_tradition_witlanguage.t
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Test::Warn;
+use TryCatch;
+use_ok( 'Text::Tradition' ); # with Language
+use_ok( 'Text::Tradition::Witness' ); # with WitLanguage
+}
+
+
+
+
+1;
morphology/lib/Text/Tradition/Language/Armenian.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/Base.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/English.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/French.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/Greek.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/Language/Latin.pm		patch \| blob \| blame \| history
morphology/lib/Text/Tradition/WitLanguage.pm	[new file with mode: 0644]	patch \| blob
morphology/t/text_tradition_language_latin.t		patch \| blob \| blame \| history
morphology/t/text_tradition_witlanguage.t	[new file with mode: 0644]	patch \| blob