Revision history for Perl extension Unicode::Normalize.
+0.20 Sun Mar 02 13:29:25 2003
+ - decompose Hangul syllables in a decomposition mapping.
+
+0.18 ... unreleased
+ - synchronization with bleadperl.
+ - Change 16262: by me
+
0.17 Sun Apr 28 23:13:32 2002
- now normalize('NFC',$1) should work.
- Some croak()'s are added in mkheader.
use warnings;
use Carp;
-our $VERSION = '0.17';
+our $VERSION = '0.20';
our $PACKAGE = __PACKAGE__;
require Exporter;
=head2 Quick Check
-(see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)
+(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
The following functions check whether the string is in that normalization form.
Returns a boolean whether the character of the specified codepoint is
a singleton.
-=item C<$is_non_startar_decomposition = isNonStDecomp($codepoint)>
+=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
Returns a boolean whether the canonical decomposition
of the character of the specified codepoint
http://homepage1.nifty.com/nomenclator/perl/
- Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
+ Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
- This program is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself.
+ This module is free software; you can redistribute it
+ and/or modify it under the same terms as Perl itself.
=head1 SEE ALSO
/* Perl 5.6.1 ? */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8 uv_to_utf8
-#endif /* uvuni_to_utf8 */
+#endif /* uvuni_to_utf8 */
/* Perl 5.6.1 ? */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni utf8_to_uv
-#endif /* utf8n_to_uvuni */
+#endif /* utf8n_to_uvuni */
/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX (0x10ffff)
STRLEN pos; /* position */
} UNF_cc;
-int compare_cc(const void *a, const void *b)
+int compare_cc (const void *a, const void *b)
{
int ret_cc;
- ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
+ ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
if (ret_cc)
return ret_cc;
- return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
+
+ return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
+ - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
}
U8* dec_canonical (UV uv)
isMAYBE = TRUE;
else if (ix) {
char *canon, *compat;
- /*
- * NFKC_NO when having compatibility mapping;
- * i.e. dec_compat(uv) defined & different with dec_canonical(uv).
- */
+ /* NFKC_NO when having compatibility mapping. */
canon = (char *) dec_canonical(uv);
compat = (char *) dec_compat(uv);
- if (compat && (!canon || strNE(canon, compat)))
+ if (compat && !(canon && strEQ(canon, compat)))
XSRETURN_NO;
} /* end of get NFC/NFKC property */
-Unicode/Normalize version 0.17
+Unicode/Normalize version 0.20
===================================
Unicode::Normalize - Unicode Normalization Forms
$NFKD_string = NFKD($string); # Normalization Form KD
$NFKC_string = NFKC($string); # Normalization Form KC
-
or
-
use Unicode::Normalize 'normalize';
$NFD_string = normalize('D', $string); # Normalization Form D
(1) In the perl-current, unicore/CompExcl.txt
is renamed unicore/CompositionExclusions.txt.
-(2) When these unicore/*.* files are updated;
+(2) After these unicore/*.* files are updated.
- in the case of an XS version:
+ In the case of an XS version:
You must rebuild the module,
as the data will be compiled on building.
- in the case of a NoXS version:
+ In the case of a NoXS version:
Rebuilding is not necessary,
as the data will be read on requirement.
http://homepage1.nifty.com/nomenclator/perl/
- Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
+ Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
- This program is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself.
+ This module is free software; you can redistribute it
+ and/or modify it under the same terms as Perl itself.
use strict;
use warnings;
use Carp;
+use File::Spec;
+
+our $IsEBCDIC = ord("A") != 0x41;
our $PACKAGE = 'Unicode::Normalize, mkheader';
|| croak "$PACKAGE: Decomposition.pl not found";
our %Combin; # $codepoint => $number : combination class
-our %Canon; # $codepoint => $hexstring : canonical decomp.
-our %Compat; # $codepoint => $hexstring : compat. decomp.
+our %Canon; # $codepoint => \@codepoints : canonical decomp.
+our %Compat; # $codepoint => \@codepoints : compat. decomp.
+# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
our %Exclus; # $codepoint => 1 : composition exclusions
our %Single; # $codepoint => 1 : singletons
our %NonStD; # $codepoint => 1 : non-starter decompositions
our $prefix = "UNF_";
our $structname = "${prefix}complist";
+########## definition of Hangul constants ##########
+use constant SBase => 0xAC00;
+use constant SFinal => 0xD7A3; # SBase -1 + SCount
+use constant SCount => 11172; # LCount * NCount
+use constant NCount => 588; # VCount * TCount
+use constant LBase => 0x1100;
+use constant LFinal => 0x1112;
+use constant LCount => 19;
+use constant VBase => 0x1161;
+use constant VFinal => 0x1175;
+use constant VCount => 21;
+use constant TBase => 0x11A7;
+use constant TFinal => 0x11C2;
+use constant TCount => 28;
+
+sub decomposeHangul {
+ my $SIndex = $_[0] - SBase;
+ my $LIndex = int( $SIndex / NCount);
+ my $VIndex = int(($SIndex % NCount) / TCount);
+ my $TIndex = $SIndex % TCount;
+ my @ret = (
+ LBase + $LIndex,
+ VBase + $VIndex,
+ $TIndex ? (TBase + $TIndex) : (),
+ );
+ wantarray ? @ret : pack('U*', @ret);
+ # any element in @ret greater than 0xFF, so no need of u2n conversion.
+}
+
+########## getting full decomposion ##########
{
my($f, $fh);
foreach my $d (@INC) {
- use File::Spec;
$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
last if open($fh, $f);
$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
sub getCanonList {
my @src = @_;
- my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
+ my @dec = map {
+ (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
+ : $Canon{$_} ? @{ $Canon{$_} } : $_
+ } @src;
return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
# condition @src == @dec is not ok.
}
sub getCompatList {
my @src = @_;
- my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
+ my @dec = map {
+ (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
+ : $Compat{$_} ? @{ $Compat{$_} } : $_
+ } @src;
return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
# condition @src == @dec is not ok.
}
sub _U_stringify {
sprintf '"%s"', join '',
- map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
+ map sprintf("\\x%02x", $_), unpack 'C*',
+ $IsEBCDIC
+ ? pack('U*', map utf8::unicode_to_native($_), @_)
+ : pack('U*', @_);
}
-# Do we need say <pack 'U*', map utf8::unicode_to_native($_),>
-# instead of <pack 'U*',> for EBCDIC?
foreach my $hash (\%Canon, \%Compat) {
foreach my $key (keys %$hash) {
}
}
-####################################
+########## writing header files ##########
my @boolfunc = (
{
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
BEGIN {
if (ord("A") == 193) {
}
}
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = qw(../lib);
+ }
+}
+
#########################
use Test;
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
+our $IsEBCDIC = ord("A") != 0x41;
+
+sub _pack_U {
+ return $IsEBCDIC
+ ? pack('U*', map utf8::unicode_to_native($_), @_)
+ : pack('U*', @_);
+}
+
+sub _unpack_U {
+ return $IsEBCDIC
+ ? map(utf8::native_to_unicode($_), unpack 'U*', shift)
+ : unpack('U*', shift);
+}
+
#########################
print getCombinClass( 0) == 0
print ! defined getCanon( 0)
&& ! defined getCanon(41)
- && getCanon(0x00C0) eq pack('U*', 0x0041, 0x0300)
- && getCanon(0x00EF) eq pack('U*', 0x0069, 0x0308)
- && getCanon(0x304C) eq pack('U*', 0x304B, 0x3099)
- && getCanon(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301)
+ && getCanon(0x00C0) eq _pack_U(0x0041, 0x0300)
+ && getCanon(0x00EF) eq _pack_U(0x0069, 0x0308)
+ && getCanon(0x304C) eq _pack_U(0x304B, 0x3099)
+ && getCanon(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301)
&& getCanon(0x1F82) eq "\x{03B1}\x{0313}\x{0300}\x{0345}"
- && getCanon(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345)
- && getCanon(0xAC00) eq pack('U*', 0x1100, 0x1161)
- && getCanon(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF)
+ && getCanon(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)
+ && getCanon(0xAC00) eq _pack_U(0x1100, 0x1161)
+ && getCanon(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF)
&& ! defined getCanon(0x212C)
&& ! defined getCanon(0x3243)
- && getCanon(0xFA2D) eq pack('U*', 0x9DB4)
+ && getCanon(0xFA2D) eq _pack_U(0x9DB4)
? "ok" : "not ok", " 3\n";
print ! defined getCompat( 0)
&& ! defined getCompat(41)
- && getCompat(0x00C0) eq pack('U*', 0x0041, 0x0300)
- && getCompat(0x00EF) eq pack('U*', 0x0069, 0x0308)
- && getCompat(0x304C) eq pack('U*', 0x304B, 0x3099)
- && getCompat(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301)
- && getCompat(0x1F82) eq pack('U*', 0x03B1, 0x0313, 0x0300, 0x0345)
- && getCompat(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345)
- && getCompat(0x212C) eq pack('U*', 0x0042)
- && getCompat(0x3243) eq pack('U*', 0x0028, 0x81F3, 0x0029)
- && getCompat(0xAC00) eq pack('U*', 0x1100, 0x1161)
- && getCompat(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF)
- && getCompat(0xFA2D) eq pack('U*', 0x9DB4)
+ && getCompat(0x00C0) eq _pack_U(0x0041, 0x0300)
+ && getCompat(0x00EF) eq _pack_U(0x0069, 0x0308)
+ && getCompat(0x304C) eq _pack_U(0x304B, 0x3099)
+ && getCompat(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301)
+ && getCompat(0x1F82) eq _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)
+ && getCompat(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)
+ && getCompat(0x212C) eq _pack_U(0x0042)
+ && getCompat(0x3243) eq _pack_U(0x0028, 0x81F3, 0x0029)
+ && getCompat(0xAC00) eq _pack_U(0x1100, 0x1161)
+ && getCompat(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF)
+ && getCompat(0xFA2D) eq _pack_U(0x9DB4)
? "ok" : "not ok", " 4\n";
print ! defined getComposite( 0, 0)
? "ok" : "not ok", " 7\n";
print reorder("") eq ""
- && reorder(pack("U*", 0x0041, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061))
- eq pack("U*", 0x0041, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061)
- && reorder(pack("U*", 0x00C1, 0x0300, 0x0315, 0x0313, 0x031b,
+ && reorder(_pack_U(0x0041, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061))
+ eq _pack_U(0x0041, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061)
+ && reorder(_pack_U(0x00C1, 0x0300, 0x0315, 0x0313, 0x031b,
0x0061, 0x309A, 0x3099))
- eq pack("U*", 0x00C1, 0x031b, 0x0300, 0x0313, 0x0315,
+ eq _pack_U(0x00C1, 0x031b, 0x0300, 0x0313, 0x0315,
0x0061, 0x309A, 0x3099)
? "ok" : "not ok", " 8\n";
? "ok" : "not ok", " 9\n";
print 1
- && answer(checkNFD(NFD(pack('U*', 0xC1, 0x1100, 0x1173, 0x11AF)))) eq "YES"
- && answer(checkNFD(pack('U*', 0x20, 0xC1, 0x1100, 0x1173, 0x11AF))) eq "NO"
- && answer(checkNFC(pack('U*', 0x20, 0xC1, 0x1173, 0x11AF))) eq "MAYBE"
- && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xAE00, 0x1100))) eq "YES"
- && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xAE00, 0x1100, 0x300))) eq "MAYBE"
- && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "YES"
- && answer(check("NFC", pack('U*', 0x20, 0xC1, 0x212B, 0x300))) eq "NO"
- && answer(checkNFKD(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "NO"
- && answer(checkNFKC(pack('U*', 0x20, 0xC1, 0xAE00, 0x2025))) eq "NO"
+ && answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))) eq "YES"
+ && answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))) eq "NO"
+ && answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))) eq "MAYBE"
+ && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))) eq "YES"
+ && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))) eq "MAYBE"
+ && answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "YES"
+ && answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))) eq "NO"
+ && answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "NO"
+ && answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))) eq "NO"
? "ok" : "not ok", " 10\n";
"012ABC" =~ /(\d+)(\w+)/;
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
BEGIN {
if (ord("A") == 193) {
}
}
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = qw(../lib);
+ }
+}
+
#########################
use Test;
use Unicode::Normalize qw(normalize);
ok(1); # If we made it this far, we're ok.
+our $IsEBCDIC = ord("A") != 0x41;
+
+sub _pack_U {
+ return $IsEBCDIC
+ ? pack('U*', map utf8::unicode_to_native($_), @_)
+ : pack('U*', @_);
+}
+
+sub _unpack_U {
+ return $IsEBCDIC
+ ? map(utf8::native_to_unicode($_), unpack 'U*', shift)
+ : unpack('U*', shift);
+}
+
#########################
ok(normalize('C', ""), "");
sub hexNFC {
join " ", map sprintf("%04X", $_),
- unpack 'U*', normalize 'C', pack 'U*', map hex(), split ' ', shift;
+ _unpack_U normalize 'C', _pack_U map hex, split ' ', shift;
}
sub hexNFD {
join " ", map sprintf("%04X", $_),
- unpack 'U*', normalize 'NFD', pack 'U*', map hex(), split ' ', shift;
+ _unpack_U normalize 'D', _pack_U map hex, split ' ', shift;
}
ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062");
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
BEGIN {
if (ord("A") == 193) {
}
}
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = qw(../lib);
+ }
+}
+
#########################
use Test;
use Unicode::Normalize;
ok(1); # If we made it this far, we're ok.
+our $IsEBCDIC = ord("A") != 0x41;
+
+sub _pack_U {
+ return $IsEBCDIC
+ ? pack('U*', map utf8::unicode_to_native($_), @_)
+ : pack('U*', @_);
+}
+
+sub _unpack_U {
+ return $IsEBCDIC
+ ? map(utf8::native_to_unicode($_), unpack 'U*', shift)
+ : unpack('U*', shift);
+}
+
#########################
ok(NFC(""), "");
sub hexNFC {
join " ", map sprintf("%04X", $_),
- unpack 'U*', NFC pack 'U*', map hex(), split ' ', shift;
+ _unpack_U NFC _pack_U map hex, split ' ', shift;
}
sub hexNFD {
join " ", map sprintf("%04X", $_),
- unpack 'U*', NFD pack 'U*', map hex(), split ' ', shift;
+ _unpack_U NFD _pack_U map hex, split ' ', shift;
}
ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062");
ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
# should be unary.
-my $str11 = pack('U*', 0x41, 0x0302, 0x0301, 0x62);
-my $str12 = pack('U*', 0x1EA4, 0x62);
+my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62);
+my $str12 = _pack_U(0x1EA4, 0x62);
ok(NFC $str11 eq $str12);
-my $str21 = pack('U*', 0xE0, 0xAC00);
-my $str22 = pack('U*', 0x61, 0x0300, 0x1100, 0x1161);
+my $str21 = _pack_U(0xE0, 0xAC00);
+my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161);
ok(NFD $str21 eq $str22);
+