Revision history for Perl extension Unicode::Normalize.
+0.12 Wed Nov 29 22:49:02 2001
+ - documentation in .pod is appended to .pm and the .pod is removed.
+ (only POD in NON-XS refers to Lingua::KO::Hangul::Util.)
+
+0.11 Sat Nov 24 10:18:38 2001
+ - documentation of some functions for character data.
+ - Change 12909: by Jarkko Hietaniemi.
+ - Change 13228: by Peter Prymmer.
+
0.10 Sat Nov 03 16:30:20 2001
- The XS version is now independent of Lingua::KO::Hangul::Util.
(though the Non-XS version still requires that.)
use warnings;
use Carp;
-our $VERSION = '0.10';
+our $VERSION = '0.12';
our $PACKAGE = __PACKAGE__;
require Exporter;
our @ISA = qw(Exporter DynaLoader);
our @EXPORT = qw( NFC NFD NFKC NFKD );
-our @EXPORT_OK = qw( normalize decompose reorder compose
- getCanon getCompat getComposite getCombinClass getExclusion);
+our @EXPORT_OK = qw(
+ normalize decompose reorder compose
+ getCanon getCompat getComposite getCombinClass isExclusion
+);
our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
bootstrap Unicode::Normalize $VERSION;
use constant CANON => 0;
use constant COMPAT => 1;
-sub NFD ($) { reorder(decompose($_[0], CANON)) }
-
+sub NFD ($) { reorder(decompose($_[0], CANON )) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
-sub NFC ($) { compose(reorder(decompose($_[0], CANON))) }
-
+sub NFC ($) { compose(reorder(decompose($_[0], CANON ))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
+# normalize($form, $string): returns $string in the normalization form
+# named by $form, dispatching to NFD, NFC, NFKD or NFKC.
+# Croaks when $form is not one of the accepted form names.
sub normalize($$)
{
my $form = shift;
- $form eq 'D' || $form eq 'NFD' ? NFD ($_[0]) :
- $form eq 'C' || $form eq 'NFC' ? NFC ($_[0]) :
- $form eq 'KD' || $form eq 'NFKD' ? NFKD($_[0]) :
- $form eq 'KC' || $form eq 'NFKC' ? NFKC($_[0]) :
+ # Strip only a leading "NF", so that exactly 'D'/'NFD', 'C'/'NFC',
+ # 'KD'/'NFKD' and 'KC'/'NFKC' are accepted; an unanchored match
+ # would also let names such as 'DNF' or 'KNFC' through.
+ $form =~ s/^NF//;
+ $form eq 'D' ? NFD ($_[0]) :
+ $form eq 'C' ? NFC ($_[0]) :
+ $form eq 'KD' ? NFKD($_[0]) :
+ $form eq 'KC' ? NFKC($_[0]) :
croak $PACKAGE."::normalize: invalid form name: $form";
}
1;
__END__
+
+=head1 NAME
+
+Unicode::Normalize - normalized forms of Unicode text
+
+=head1 SYNOPSIS
+
+ use Unicode::Normalize;
+
+ $string_NFD = NFD($raw_string); # Normalization Form D
+ $string_NFC = NFC($raw_string); # Normalization Form C
+ $string_NFKD = NFKD($raw_string); # Normalization Form KD
+ $string_NFKC = NFKC($raw_string); # Normalization Form KC
+
+ or
+
+ use Unicode::Normalize 'normalize';
+
+ $string_NFD = normalize('D', $raw_string); # Normalization Form D
+ $string_NFC = normalize('C', $raw_string); # Normalization Form C
+ $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
+ $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
+
+=head1 DESCRIPTION
+
+=head2 Normalization
+
+=over 4
+
+=item C<$string_NFD = NFD($raw_string)>
+
+returns the Normalization Form D (formed by canonical decomposition).
+
+
+=item C<$string_NFC = NFC($raw_string)>
+
+returns the Normalization Form C (formed by canonical decomposition
+followed by canonical composition).
+
+=item C<$string_NFKD = NFKD($raw_string)>
+
+returns the Normalization Form KD (formed by compatibility decomposition).
+
+=item C<$string_NFKC = NFKC($raw_string)>
+
+returns the Normalization Form KC (formed by compatibility decomposition
+followed by B<canonical> composition).
+
+=item C<$normalized_string = normalize($form_name, $raw_string)>
+
+As C<$form_name>, one of the following names must be given.
+
+ 'C' or 'NFC' for Normalization Form C
+ 'D' or 'NFD' for Normalization Form D
+ 'KC' or 'NFKC' for Normalization Form KC
+ 'KD' or 'NFKD' for Normalization Form KD
+
+=back
+
+=head2 Character Data
+
+These functions are interfaces to the character data used internally.
+If you only want to get Unicode normalization forms,
+you do not need to call them yourself.
+
+=over 4
+
+=item C<$canonical_decomposed = getCanon($codepoint)>
+
+=item C<$compatibility_decomposed = getCompat($codepoint)>
+
+If the character of the specified codepoint is canonically or
+compatibility decomposable (including Hangul Syllables),
+returns the B<completely decomposed> string equivalent to it.
+
+If it is not decomposable, returns undef.
+
+=item C<$uv_composite = getComposite($uv_here, $uv_next)>
+
+If the pair of characters here and next (as codepoints) is composable
+(including Hangul Jamo/Syllables and Exclusions),
+returns the codepoint of the composite.
+
+If they are not composable, returns undef.
+
+=item C<$combining_class = getCombinClass($codepoint)>
+
+Returns the combining class of the character as an integer.
+
+=item C<$is_exclusion = isExclusion($codepoint)>
+
+Returns a boolean whether the character of the specified codepoint is
+a composition exclusion.
+
+=back
+
+=head2 EXPORT
+
+C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
+
+C<normalize> and some other functions: on request.
+
+=head1 AUTHOR
+
+SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
+
+ http://homepage1.nifty.com/nomenclator/perl/
+
+ Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+=over 4
+
+=item http://www.unicode.org/unicode/reports/tr15/
+
+Unicode Normalization Forms - UAX #15
+
+=back
+
+=cut
+
#define Hangul_TCount 28
#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
-#define Hangul_IsN(u) (! (((u) - Hangul_SBase) % Hangul_TCount))
+#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
return row ? row[uv & 0xff] : NULL;
}
-UV getComposite (UV uv, UV uv2)
+UV composite_uv (UV uv, UV uv2)
{
UNF_complist ***plane, **row, *cell, *i;
}
if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
uv2 -= Hangul_TBase; /* tindex */
- return (uv + uv2);
+ return(uv + uv2);
}
plane = UNF_compos[uv >> 16];
if(! plane) return 0;
row = plane[(uv >> 8) & 0xff];
- if(! row) return 0;
+ if(! row) return 0;
cell = row[uv & 0xff];
- if(! cell) return 0;
+ if(! cell) return 0;
for(i = cell; i->nextchar; i++) {
if(uv2 == i->nextchar) return i->composite;
}
void sv_cat_decompHangul (SV* sv, UV uv)
{
UV sindex, lindex, vindex, tindex;
- U8 *t, temp[3 * UTF8_MAXLEN + 1];
+ U8 *t, tmp[3 * UTF8_MAXLEN + 1];
if(! Hangul_IsS(uv)) return;
vindex = (sindex % Hangul_NCount) / Hangul_TCount;
tindex = sindex % Hangul_TCount;
- t = temp;
+ t = tmp;
t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
*t = '\0';
- sv_catpvn(sv, (char *)temp, strlen((char *)temp));
+ sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
}
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
-
SV*
decompose(arg, compat)
SV * arg
SV * compat
PROTOTYPE: $
PREINIT:
+ UV uv;
SV *src, *dst;
STRLEN srclen, dstlen, retlen;
U8 *s, *e, *p, *d, *r;
- UV uv;
bool iscompat;
CODE:
if(SvUTF8(arg)) {
src = sv_mortalcopy(arg);
sv_utf8_upgrade(src);
}
-
iscompat = SvTRUE(compat);
dst = newSV(1);
s = (U8*)SvPV(src,srclen);
e = s + srclen;
+
for(p = s; p < e;){
U8 *cc_in;
STRLEN cc_len, cc_iter, cc_pos;
uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
- p += retlen;
- cc_pos = 0;
curCC = getCombinClass(uv);
+ p += retlen;
+
if(! (curCC && p < e)) continue; else cc_in = p - retlen;
+ cc_pos = 0;
stk_cc[cc_pos].cc = curCC;
stk_cc[cc_pos].uv = uv;
stk_cc[cc_pos].pos = cc_pos;
-void
+SV*
compose(arg)
SV * arg
PROTOTYPE: $
SV *src, *dst, *tmp;
U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
UV uv, uvS, uvComp;
- STRLEN srclen, dstlen, tmplen, dstcur, retlen;
+ STRLEN srclen, dstlen, tmplen, retlen;
bool beginning = TRUE;
- PPCODE:
+ CODE:
if(SvUTF8(arg)) {
src = arg;
} else {
src = sv_mortalcopy(arg);
sv_utf8_upgrade(src);
}
+
s = (U8*)SvPV(src, srclen);
e = s + srclen;
dstlen = srclen + 1; /* equal or shorter, XXX */
- dst = sv_2mortal(newSV(dstlen));
+ dst = newSV(dstlen);
(void)SvPOK_only(dst);
SvUTF8_on(dst);
d = (U8*)SvPVX(dst);
preCC = curCC;
t = uvuni_to_utf8(t, uv);
} else {
- uvComp = getComposite(uvS, uv);
+ uvComp = composite_uv(uvS, uv);
/* S + C + S => S-S + C would be also blocked. */
- if( uvComp && ! getExclusion(uvComp) && preCC <= curCC)
+ if( uvComp && ! isExclusion(uvComp) && preCC <= curCC)
{
/* preCC not changed to curCC */
uvS = uvComp;
}
}
}
- d = uvuni_to_utf8(d, uvS); /* composed char */
+ d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
if(tmplen = t - tmp_start) { /* uncomposed combining char */
t = (U8*)SvPVX(tmp);
while(tmplen--) *d++ = *t++;
}
uvS = uv;
} /* for */
- dstcur = d - (U8*)SvPVX(dst);
- SvCUR_set(dst, dstcur);
- XPUSHs(dst);
+ e = d; /* end of dst */
+ d = (U8*)SvPVX(dst);
+ SvCUR_set(dst, e - d);
+ RETVAL = dst;
+ OUTPUT:
+ RETVAL
UV uv
bool
-getExclusion(uv)
+isExclusion(uv)
UV uv
-UV
+SV*
getComposite(uv, uv2)
UV uv
UV uv2
+ PROTOTYPE: $$
+ PREINIT:
+ UV comp;
+ CODE:
+ comp = composite_uv(uv, uv2);
+ RETVAL = comp ? newSVuv(comp) : &PL_sv_undef;
+ OUTPUT:
+ RETVAL
SV*
getCanon(uv)
-Unicode/Normalize version 0.10
+Unicode/Normalize version 0.12
===================================
Unicode::Normalize - normalized forms of Unicode text
Exporter
File::Copy
File::Spec
-Lingua::KO::Hangul::Util 0.06
unicore/CombiningClass.pl or unicode/CombiningClass.pl
unicore/Decomposition.pl or unicode/Decomposition.pl
unicore/CompExcl.txt or unicode/CompExcl.txt
+and for the Non-XS version, in addition to the above,
+Lingua::KO::Hangul::Util 0.06
+
COPYRIGHT AND LICENCE
SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
}
+# Returns the number of bytes (1 to 5) that the code point $uv is counted
+# as in UTF-EBCDIC. Croaks for a code point of 0x110000 or above.
+sub utfebcdiclen {
+ my $uv = shift;
+ return $uv < 0xA0 ? 1 :
+ $uv < 0x400 ? 2 :
+ $uv < 0x4000 ? 3 :
+ $uv < 0x40000 ? 4 :
+ $uv < 0x110000 ? 5 :
+ # Name the encoding this routine measures, so the failure can be told
+ # apart from the one raised by the UTF-8 length routine.
+ croak "$PACKAGE: illegal char in the composite. utf-ebcdic max is 0x10ffff.";
+}
+
my $prefix = "UNF_";
my $structname = "${prefix}complist";
our (%Comp1st, %CompList);
+# sprintf template for the fatal error raised when a composite takes more
+# bytes than the pair it is composed from (the checks fire when the summed
+# length of the pair is less than the length of the composite).
+# Arguments: encoding name, first code point, second code point, composite.
+my $errExpand = "$PACKAGE: A composable pair in %s "
+ . "is shorter than the composite in bytes!\n"
+ . "%d + %d => %d\nQuit. Please inform the author...";
+
foreach(sort keys %Compos) {
my @a = unpack('U*', $_);
my $val = $Compos{$_};
$CompList{ $name }{ $a[1] } = $val;
if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) {
- croak "$PACKAGE: "
- . "composable pair is longer than the composite in bytes!\n"
- . sprintf("%d + %d => %d", $a[0], $a[1], $val);
+ croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val);
+ }
+ if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) {
+ croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val);
}
}
open FH, ">$file" or croak "$PACKAGE: $file can't be made";
binmode FH; select FH;
-print "bool getExclusion (UV uv) \n{\nreturn\n\t";
+print "bool isExclusion (UV uv) \n{\nreturn\n\t";
while(@Exclus) {
my $cur = shift @Exclus;
&& getCanon(0x00EF) eq pack('U*', 0x0069, 0x0308)
&& getCanon(0x304C) eq pack('U*', 0x304B, 0x3099)
&& getCanon(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301)
+ && getCanon(0x1F82) eq "\x{03B1}\x{0313}\x{0300}\x{0345}"
&& getCanon(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345)
&& getCanon(0xAC00) eq pack('U*', 0x1100, 0x1161)
&& getCanon(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF)
&& getCompat(0x00EF) eq pack('U*', 0x0069, 0x0308)
&& getCompat(0x304C) eq pack('U*', 0x304B, 0x3099)
&& getCompat(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301)
+ && getCompat(0x1F82) eq pack('U*', 0x03B1, 0x0313, 0x0300, 0x0345)
&& getCompat(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345)
&& getCompat(0x212C) eq pack('U*', 0x0042)
&& getCompat(0x3243) eq pack('U*', 0x0028, 0x81F3, 0x0029)
&& getCompat(0xFA2D) eq pack('U*', 0x9DB4)
? "ok" : "not ok", " 4\n";
-print ! getComposite( 0, 0)
- && ! getComposite( 0, 41)
- && ! getComposite(41, 0)
- && ! getComposite(41, 41)
- && ! getComposite(12, 0x0300)
- && ! getComposite(0x0055, 0xFF00)
+print ! defined getComposite( 0, 0)
+ && ! defined getComposite( 0, 41)
+ && ! defined getComposite(41, 0)
+ && ! defined getComposite(41, 41)
+ && ! defined getComposite(12, 0x0300)
+ && ! defined getComposite(0x0055, 0xFF00)
+ && 0x00C0 == getComposite(0x0041, 0x0300)
&& 0x00D9 == getComposite(0x0055, 0x0300)
&& 0x1E14 == getComposite(0x0112, 0x0300)
&& 0xAC00 == getComposite(0x1100, 0x1161)
&& 0xAE00 == getComposite(0xADF8, 0x11AF)
? "ok" : "not ok", " 5\n";
-print ! getExclusion( 0)
- && ! getExclusion(41)
- && getExclusion(2392)
- && getExclusion(3907)
- && getExclusion(64334)
+print ! isExclusion( 0)
+ && ! isExclusion(41)
+ && isExclusion(2392)
+ && isExclusion(3907)
+ && isExclusion(64334)
? "ok" : "not ok", " 6\n";
+
}
sub hexNFD {
join " ", map sprintf("%04X", $_),
- unpack 'U*', normalize 'D', pack 'U*', map hex(), split ' ', shift;
+ unpack 'U*', normalize 'NFD', pack 'U*', map hex(), split ' ', shift;
}
ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062");
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 18 };
+BEGIN { plan tests => 20 };
use Unicode::Normalize;
ok(1); # If we made it this far, we're ok.
ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000");
ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
+# should be unary.
+ok(NFC "\x{41}\x{0302}\x{0301}\x62" eq "\x{1EA4}\x62");
+ok(NFD "\x{E0}\x{AC00}" eq "\x{61}\x{0300}\x{1100}\x{1161}");