From: Tara L Andrews Date: Mon, 24 Sep 2012 08:47:18 +0000 (+0200) Subject: add witness role for collation regularization X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=307d8db9928c8b6fe865ab45986aa7e572b679bd;p=scpubgit%2Fstemmatology.git add witness role for collation regularization --- diff --git a/morphology/lib/Text/Tradition/Language/Armenian.pm b/morphology/lib/Text/Tradition/Language/Armenian.pm index 3030f0f..da962ef 100644 --- a/morphology/lib/Text/Tradition/Language/Armenian.pm +++ b/morphology/lib/Text/Tradition/Language/Armenian.pm @@ -20,6 +20,10 @@ depends on the Lingua::Morph::Perseus module for access to PhiloLogic database d Evaluates the string using Treetagger and Perseus, and returns the results. +=head2 reading_lookup( $word ) + +Returns a single-word morphological lookup of the given word using Perseus. + =begin testing use Text::Tradition; @@ -70,5 +74,33 @@ sub reading_lookup { return __PACKAGE__->perseus_reading_lookup( @_ ); } +=head2 regularize( $text ) + +Returns a regularized form of the reading for the purposes of collation. + +=cut + +sub regularize { + my( $word ) = @_; + # We don't really distinguish between commas and semicolons properly + # in the manuscript. Make them the same. + $word =~ s/\./\,/g; + + # Get rid of accent marks. + $word =~ s/՛//g; + # Get rid of hyphen. + $word =~ s/֊//g; + # Get rid of any backtick that falls mid-word. + $word =~ s/՝(.)/$1/g; + # Standardize ligatures. + $word =~ s/աւ/օ/g; # for easy vocalic comparison to ո + $word =~ s/և/եւ/g; + + # TODO split off suspected prefix/suffix markers? + # Downcase the word. + $word = lc( $word ); + return $word; +} + 1; diff --git a/morphology/lib/Text/Tradition/Language/Base.pm b/morphology/lib/Text/Tradition/Language/Base.pm index fd56c04..fa57723 100644 --- a/morphology/lib/Text/Tradition/Language/Base.pm +++ b/morphology/lib/Text/Tradition/Language/Base.pm @@ -10,8 +10,10 @@ use Module::Load; use Text::Tradition::Collation::Reading::Lexeme; use Text::Tradition::Collation::Reading::WordForm; use TryCatch; +use Unicode::Normalize; -@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; +@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags + unicode_regularize /; =head1 NAME @@ -283,6 +285,24 @@ sub _by_structid { return $a cmp $b; } +=head2 unicode_regularize( $word ) + +Returns a lowercased and accent-stripped version of the word. + +=cut + +sub unicode_regularize { + my $word = shift; + my @normalized; + my @letters = split( '', lc( $word ) ); + foreach my $l ( @letters ) { + my $d = chr( ord( NFKD( $l ) ) ); + next unless $d =~ /[[:alnum:]]/; # toss out e.g. Greek underdots + push( @normalized, $d ); + } + return join( '', @normalized ); +} + 1; =head2 TODO diff --git a/morphology/lib/Text/Tradition/Language/English.pm b/morphology/lib/Text/Tradition/Language/English.pm index 4277d62..bb62fb5 100644 --- a/morphology/lib/Text/Tradition/Language/English.pm +++ b/morphology/lib/Text/Tradition/Language/English.pm @@ -4,7 +4,7 @@ use strict; use warnings; use Lingua::TagSet::TreeTagger::English; use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger - lfs_morph_tags /; + lfs_morph_tags unicode_regularize /; use TryCatch; =head1 NAME @@ -91,6 +91,17 @@ sub _parse_wordform { 1; + +=head2 regularize( $text ) + +Returns a regularized form of the reading for the purposes of collation. + +=cut + +sub regularize { + return unicode_regularize( @_ ); +} + =head2 TODO =over diff --git a/morphology/lib/Text/Tradition/Language/French.pm b/morphology/lib/Text/Tradition/Language/French.pm index 13ad869..7af89c9 100644 --- a/morphology/lib/Text/Tradition/Language/French.pm +++ b/morphology/lib/Text/Tradition/Language/French.pm @@ -5,7 +5,8 @@ use warnings; use Lingua::TagSet::Multext; use Lingua::TagSet::TreeTagger::French; use Module::Load qw/ load /; -use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger + lfs_morph_tags unicode_regularize /; use TryCatch; =head1 NAME @@ -173,6 +174,16 @@ sub _parse_wordform { return @forms; } +=head2 regularize( $text ) + +Returns a regularized form of the reading for the purposes of collation. + +=cut + +sub regularize { + return unicode_regularize( @_ ); +} + 1; =head2 TODO diff --git a/morphology/lib/Text/Tradition/Language/Greek.pm b/morphology/lib/Text/Tradition/Language/Greek.pm index 8106ec6..e4159b2 100644 --- a/morphology/lib/Text/Tradition/Language/Greek.pm +++ b/morphology/lib/Text/Tradition/Language/Greek.pm @@ -4,6 +4,7 @@ use strict; use warnings; use Module::Load; use parent qw/ Text::Tradition::Language::Perseus /; +use Text::Tradition::Language::Base qw/ unicode_regularize /; =head1 NAME @@ -20,6 +21,10 @@ depends on the Lingua::Morph::Perseus module for access to PhiloLogic database d Evaluates the string using Treetagger and Perseus, and returns the results. +=head2 reading_lookup( $word ) + +Returns a single-word morphological lookup of the given word using Perseus. + =begin testing use Text::Tradition; @@ -70,5 +75,15 @@ sub reading_lookup { return __PACKAGE__->perseus_reading_lookup( @_ ); } +=head2 regularize( $text ) + +Returns a regularized form of the reading for the purposes of collation. + +=cut + +sub regularize { + return unicode_regularize( @_ ); +} + 1; diff --git a/morphology/lib/Text/Tradition/Language/Latin.pm b/morphology/lib/Text/Tradition/Language/Latin.pm index ad36208..c2b89f8 100644 --- a/morphology/lib/Text/Tradition/Language/Latin.pm +++ b/morphology/lib/Text/Tradition/Language/Latin.pm @@ -20,21 +20,25 @@ depends on the Lingua::Morph::Perseus module for access to PhiloLogic database d Evaluates the string using Treetagger and Perseus, and returns the results. +=head2 reading_lookup( $word ) + +Returns a single-word morphological lookup of the given word using Perseus. + =begin testing use Text::Tradition; use_ok( 'Text::Tradition::Language::Latin' ); +my $trad = Text::Tradition->new( + 'language' => 'Latin', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + eval "use Lingua::Morph::Perseus"; my $err = $@; - SKIP: { skip "Package Lingua::Morph::Perseus not found" if $err; - my $trad = Text::Tradition->new( - 'language' => 'Latin', - 'file' => 't/data/legendfrag.xml', - 'input' => 'Self' ); $trad->lemmatize(); my $ambig = 0; foreach my $r ( $trad->collation->readings ) { @@ -55,6 +59,17 @@ SKIP: { } is( $ambig, 4, "Found 4 ambiguous forms as expected" ); } + +# Try exporting some witnesses +my $e_v = 'in suetia uenerabilis pontifex beatus henricus in anglia oriundus'; +my $struct_v = $trad->witness('V')->export_as_json; +my $g_v = join( ' ', map { $_->{'n'} } @{$struct_v->{'tokens'}} ); +is( $g_v, $e_v, "Got expected regularization of witness V" ); +my $e_n = 'in suetia beatus henricus uenerabilis pontifex de anglia oriundus'; +my $struct_n = $trad->witness('N')->export_as_json; +my $g_n = join( ' ', map { $_->{'n'} } @{$struct_n->{'tokens'}} ); +is( $g_n, $e_n, "Got expected regularization of witness N" ); + =end testing @@ -70,5 +85,22 @@ sub reading_lookup { return __PACKAGE__->perseus_reading_lookup( @_ ); } +=head2 regularize( $text ) + +Returns a regularized form of the reading for the purposes of collation. + +=cut + +sub regularize { + my( $word ) = @_; + $word = lc( $word ); + $word =~ s/v/u/g; + $word =~ s/w/u/g; + $word =~ s/j/i/g; + $word =~ s/ci/ti/g; + $word =~ s/cha/ca/g; + return $word; +} + 1; diff --git a/morphology/lib/Text/Tradition/WitLanguage.pm b/morphology/lib/Text/Tradition/WitLanguage.pm new file mode 100644 index 0000000..7008c40 --- /dev/null +++ b/morphology/lib/Text/Tradition/WitLanguage.pm @@ -0,0 +1,78 @@ +package Text::Tradition::WitLanguage; + +use strict; +use warnings; +use Module::Load; +use Moose::Role; +use TryCatch; + +=head1 NAME + +Text::Tradition::WitLanguage - add-on role to enable language awareness and +morphology functions to a Text::Tradition::Witness object. Please see +L for more information on the morphology +add-on distribution. + +=head1 METHODS + +=head2 language + +Accessor for the primary language of the tradition. Must correspond to one +of the Text::Tradition::Language::* modules in this package. Used for JSON +export of a language-regularized witness text. + +=begin testing + +use Test::Warn; +use TryCatch; +use_ok( 'Text::Tradition' ); # with Language +use_ok( 'Text::Tradition::Witness' ); # with WitLanguage + +=end testing + +=cut + +has 'language' => ( + is => 'rw', + isa => 'Str', + predicate => 'has_language', + ); + +around 'language' => sub { + my $orig = shift; + my $self = shift; + if( @_ && $_[0] ne 'Default' ) { + # We are trying to set the language; check that the corresponding + # module exists. + try { + load( "Text::Tradition::Language::".$_[0] ); + } catch ( $e ) { + warn( "Cannot load language module for @_: $e" ); + } + } elsif( !$self->has_language && $self->tradition->has_language ) { + return $self->tradition->language; + } + $self->$orig( @_ ); +}; + +around 'export_as_json' => sub { + my $orig = shift; + my $self = shift; + my $answer = $self->$orig( @_ ); + if( $self->has_language || $self->tradition->has_language ) { + # If we do have a language, regularize the tokens in $answer. + my $mod = "Text::Tradition::Language::" . $self->language; + load( $mod ); + my $rsub = $mod->can( 'regularize' ); + map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{tokens}}; + if( exists $answer->{layertokens} ) { + map { $_->{'n'} = $rsub->( $_->{'t'} ) } @{$answer->{layertokens}}; + } + } else { + warn "Please set a language to regularize a tradition"; + } + return $answer; +}; + +1; + diff --git a/morphology/t/text_tradition_language_latin.t b/morphology/t/text_tradition_language_latin.t index 4b948f6..ac2a877 100644 --- a/morphology/t/text_tradition_language_latin.t +++ b/morphology/t/text_tradition_language_latin.t @@ -11,16 +11,16 @@ $| = 1; use Text::Tradition; use_ok( 'Text::Tradition::Language::Latin' ); +my $trad = Text::Tradition->new( + 'language' => 'Latin', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + eval "use Lingua::Morph::Perseus"; my $err = $@; - SKIP: { skip "Package Lingua::Morph::Perseus not found" if $err; - my $trad = Text::Tradition->new( - 'language' => 'Latin', - 'file' => 't/data/legendfrag.xml', - 'input' => 'Self' ); $trad->lemmatize(); my $ambig = 0; foreach my $r ( $trad->collation->readings ) { @@ -41,6 +41,16 @@ SKIP: { } is( $ambig, 4, "Found 4 ambiguous forms as expected" ); } + +# Try exporting some witnesses +my $e_v = 'in suetia uenerabilis pontifex beatus henricus in anglia oriundus'; +my $struct_v = $trad->witness('V')->export_as_json; +my $g_v = join( ' ', map { $_->{'n'} } @{$struct_v->{'tokens'}} ); +is( $g_v, $e_v, "Got expected regularization of witness V" ); +my $e_n = 'in suetia beatus henricus uenerabilis pontifex de anglia oriundus'; +my $struct_n = $trad->witness('N')->export_as_json; +my $g_n = join( ' ', map { $_->{'n'} } @{$struct_n->{'tokens'}} ); +is( $g_n, $e_n, "Got expected regularization of witness N" ); } diff --git a/morphology/t/text_tradition_witlanguage.t b/morphology/t/text_tradition_witlanguage.t new file mode 100644 index 0000000..97349b5 --- /dev/null +++ b/morphology/t/text_tradition_witlanguage.t @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Test::Warn; +use TryCatch; +use_ok( 'Text::Tradition' ); # with Language +use_ok( 'Text::Tradition::Witness' ); # with WitLanguage +} + + + + +1;