From: SADAHIRO Tomoyuki Date: Thu, 10 Jan 2002 01:08:56 +0000 (+0900) Subject: [Patch @14129] fixes Unicode::Normalize X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d85850a777b87feb3a01d2328aa281372642877e;p=p5sagit%2Fp5-mst-13.2.git [Patch @14129] fixes Unicode::Normalize Message-Id: <20020110010110.690B.BQW10602@nifty.com> p4raw-id: //depot/perl@14156 --- diff --git a/MANIFEST b/MANIFEST index 28418dd..b07674b 100644 --- a/MANIFEST +++ b/MANIFEST @@ -625,7 +625,6 @@ ext/Unicode/Normalize/Changes Unicode::Normalize ext/Unicode/Normalize/Makefile.PL Unicode::Normalize ext/Unicode/Normalize/mkheader Unicode::Normalize ext/Unicode/Normalize/Normalize.pm Unicode::Normalize -ext/Unicode/Normalize/Normalize.pod Unicode::Normalize ext/Unicode/Normalize/Normalize.xs Unicode::Normalize ext/Unicode/Normalize/README Unicode::Normalize ext/Unicode/Normalize/t/func.t Unicode::Normalize diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL index 88ab9b7..2b834d7 100644 --- a/ext/Unicode/Normalize/Makefile.PL +++ b/ext/Unicode/Normalize/Makefile.PL @@ -9,7 +9,7 @@ WriteMakefile( 'NAME' => 'Unicode::Normalize', 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION ($] >= 5.005 ? ## Add these new keywords supported since 5.005 - (ABSTRACT_FROM => 'Normalize.pod', # retrieve abstract from module + (ABSTRACT_FROM => 'Normalize.pm', # retrieve abstract from module AUTHOR => 'SADAHIRO Tomoyuki ') : ()), clean => {FILES=> 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h'}, ); diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index 40d326f..819fbc4 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -5,7 +5,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.12'; +our $VERSION = '0.13'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -22,24 +22,24 @@ our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); bootstrap Unicode::Normalize $VERSION; -use constant CANON => 0; use constant COMPAT => 1; -sub NFD ($) { reorder(decompose($_[0], CANON )) } +sub NFD ($) { reorder(decompose($_[0])) } sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } -sub NFC ($) { compose(reorder(decompose($_[0], CANON ))) } +sub NFC ($) { compose(reorder(decompose($_[0]))) } sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } sub normalize($$) { - my $form = shift; - $form =~ s/^NF//; - $form eq 'D' ? NFD ($_[0]) : - $form eq 'C' ? NFC ($_[0]) : - $form eq 'KD' ? NFKD($_[0]) : - $form eq 'KC' ? NFKC($_[0]) : - croak $PACKAGE."::normalize: invalid form name: $form"; + my $form = shift; + $form =~ s/^NF//; + return + $form eq 'D' ? NFD ($_[0]) : + $form eq 'C' ? NFC ($_[0]) : + $form eq 'KD' ? NFKD($_[0]) : + $form eq 'KC' ? NFKC($_[0]) : + croak $PACKAGE."::normalize: invalid form name: $form"; } 1; @@ -69,7 +69,7 @@ Unicode::Normalize - normalized forms of Unicode text =head1 DESCRIPTION -=head2 Normalization +=head2 Normalization Forms =over 4 @@ -107,7 +107,7 @@ As C<$form_name>, one of the following names must be given. These functions are interface of character data used internally. If you want only to get unicode normalization forms, -you need not to call them by yourself. +you doesn't need call them by yourself. =over 4 @@ -123,7 +123,7 @@ If it is not decomposable, returns undef. =item C<$uv_composite = getComposite($uv_here, $uv_next)> -If the couple of two characters here and next (as codepoints) is composable +If two characters here and next (as codepoints) are composable (including Hangul Jamo/Syllables and Exclusions), returns the codepoint of the composite. diff --git a/ext/Unicode/Normalize/Normalize.pod b/ext/Unicode/Normalize/Normalize.pod deleted file mode 100644 index 4ac8966..0000000 --- a/ext/Unicode/Normalize/Normalize.pod +++ /dev/null @@ -1,89 +0,0 @@ - -=head1 NAME - -Unicode::Normalize - normalized forms of Unicode text - -=head1 SYNOPSIS - - use Unicode::Normalize; - - $string_NFD = NFD($raw_string); # Normalization Form D - $string_NFC = NFC($raw_string); # Normalization Form C - $string_NFKD = NFKD($raw_string); # Normalization Form KD - $string_NFKC = NFKC($raw_string); # Normalization Form KC - - or - - use Unicode::Normalize 'normalize'; - - $string_NFD = normalize('D', $raw_string); # Normalization Form D - $string_NFC = normalize('C', $raw_string); # Normalization Form C - $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD - $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC - -=head1 DESCRIPTION - -=over 4 - -=item C<$string_NFD = NFD($raw_string)> - -returns the Normalization Form D (formed by canonical decomposition). - - -=item C<$string_NFC = NFC($raw_string)> - -returns the Normalization Form C (formed by canonical decomposition -followed by canonical composition). - -=item C<$string_NFKD = NFKD($raw_string)> - -returns the Normalization Form KD (formed by compatibility decomposition). - -=item C<$string_NFKC = NFKC($raw_string)> - -returns the Normalization Form KC (formed by compatibility decomposition -followed by B composition). - -=item C<$normalized_string = normalize($form_name, $raw_string)> - -As C<$form_name>, one of the following names must be given. - - 'C' or 'NFC' for Normalization Form C - 'D' or 'NFD' for Normalization Form D - 'KC' or 'NFKC' for Normalization Form KC - 'KD' or 'NFKD' for Normalization Form KD - -=back - -=head2 EXPORT - -C, C, C, C: by default. - -C: on request. - -=head1 AUTHOR - -SADAHIRO Tomoyuki, ESADAHIRO@cpan.orgE - - http://homepage1.nifty.com/nomenclator/perl/ - - Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. - - This program is free software; you can redistribute it and/or - modify it under the same terms as Perl itself. - -=head1 SEE ALSO - -=over 4 - -=item L - -utility functions for Hangul Syllables - -=item http://www.unicode.org/unicode/reports/tr15/ - -Unicode Normalization Forms - UAX #15 - -=back - -=cut diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index 7adad7e..0b0809e 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -146,10 +146,10 @@ void sv_cat_decompHangul (SV* sv, UV uv) MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize SV* -decompose(arg, compat) +decompose(arg, compat = &PL_sv_no) SV * arg SV * compat - PROTOTYPE: $ + PROTOTYPE: $;$ PREINIT: UV uv; SV *src, *dst; @@ -275,7 +275,7 @@ compose(arg) s = (U8*)SvPV(src, srclen); e = s + srclen; - dstlen = srclen + 1; /* equal or shorter, XXX */ + dstlen = srclen + 1; dst = newSV(dstlen); (void)SvPOK_only(dst); SvUTF8_on(dst); @@ -317,6 +317,16 @@ compose(arg) /* S + C + S => S-S + C would be also blocked. */ if( uvComp && ! isExclusion(uvComp) && preCC <= curCC) { + STRLEN leftcur, rightcur, dstcur; + leftcur = UNISKIP(uvComp); + rightcur = UNISKIP(uvS) + UNISKIP(uv); + + if (leftcur > rightcur) { + dstcur = d - (U8*)SvPVX(dst); + dstlen += leftcur - rightcur; + d = (U8*)SvGROW(dst,dstlen) + dstcur; + } + /* preCC not changed to curCC */ uvS = uvComp; } else if (! curCC && p < e) { /* blocked */ @@ -328,15 +338,15 @@ compose(arg) } } d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ - if((tmplen = t - tmp_start)) { /* uncomposed combining char */ + tmplen = t - tmp_start; + if (tmplen) { /* uncomposed combining char */ t = (U8*)SvPVX(tmp); while(tmplen--) *d++ = *t++; } uvS = uv; } /* for */ - e = d; /* end of dst */ - d = (U8*)SvPVX(dst); - SvCUR_set(dst, e - d); + + SvCUR_set(dst, d - (U8*)SvPVX(dst)); RETVAL = dst; OUTPUT: RETVAL diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader index 5793e4a..aa6a153 100644 --- a/ext/Unicode/Normalize/mkheader +++ b/ext/Unicode/Normalize/mkheader @@ -111,7 +111,7 @@ sub _getHexArray { sub _U_stringify { sprintf '"%s"', join '', - map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_; + map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_; } foreach my $hash (\%Canon, \%Compat) { @@ -120,48 +120,18 @@ foreach my $hash (\%Canon, \%Compat) { } } -sub utf8len { - my $uv = shift; - return $uv < 0x80 ? 1 : - $uv < 0x800 ? 2 : - $uv < 0x10000 ? 3 : - $uv < 0x110000 ? 4 : - croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff."; -} - -sub utfebcdiclen { - my $uv = shift; - return $uv < 0xA0 ? 1 : - $uv < 0x400 ? 2 : - $uv < 0x4000 ? 3 : - $uv < 0x40000 ? 4 : - $uv < 0x110000 ? 5 : - croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff."; -} - my $prefix = "UNF_"; my $structname = "${prefix}complist"; our (%Comp1st, %CompList); -my $errExpand = "$PACKAGE: A composable pair in %s " - . "is longer than the composite in bytes!\n" - . "%d + %d => %d\nQuit. Please inform the author..."; - foreach(sort keys %Compos) { my @a = unpack('U*', $_); my $val = $Compos{$_}; my $name = sprintf "${structname}_%06x", $a[0]; $Comp1st{ $a[0] } = $name; $CompList{ $name }{ $a[1] } = $val; - - if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) { - croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val); - } - if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) { - croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val); - } } my $compinit =