ext/Unicode/Normalize/Makefile.PL Unicode::Normalize
ext/Unicode/Normalize/mkheader Unicode::Normalize
ext/Unicode/Normalize/Normalize.pm Unicode::Normalize
-ext/Unicode/Normalize/Normalize.pod Unicode::Normalize
ext/Unicode/Normalize/Normalize.xs Unicode::Normalize
ext/Unicode/Normalize/README Unicode::Normalize
ext/Unicode/Normalize/t/func.t Unicode::Normalize
'NAME' => 'Unicode::Normalize',
'VERSION_FROM' => 'Normalize.pm', # finds $VERSION
($] >= 5.005 ? ## Add these new keywords supported since 5.005
- (ABSTRACT_FROM => 'Normalize.pod', # retrieve abstract from module
+ (ABSTRACT_FROM => 'Normalize.pm', # retrieve abstract from module
AUTHOR => 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>') : ()),
clean => {FILES=> 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h'},
);
use warnings;
use Carp;
-our $VERSION = '0.12';
+our $VERSION = '0.13';
our $PACKAGE = __PACKAGE__;
require Exporter;
bootstrap Unicode::Normalize $VERSION;
-use constant CANON => 0;
use constant COMPAT => 1;
-sub NFD ($) { reorder(decompose($_[0], CANON )) }
+sub NFD ($) { reorder(decompose($_[0])) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
-sub NFC ($) { compose(reorder(decompose($_[0], CANON ))) }
+sub NFC ($) { compose(reorder(decompose($_[0]))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
sub normalize($$)
{
- my $form = shift;
- $form =~ s/^NF//;
- $form eq 'D' ? NFD ($_[0]) :
- $form eq 'C' ? NFC ($_[0]) :
- $form eq 'KD' ? NFKD($_[0]) :
- $form eq 'KC' ? NFKC($_[0]) :
- croak $PACKAGE."::normalize: invalid form name: $form";
+ my $form = shift;
+ $form =~ s/^NF//;
+ return
+ $form eq 'D' ? NFD ($_[0]) :
+ $form eq 'C' ? NFC ($_[0]) :
+ $form eq 'KD' ? NFKD($_[0]) :
+ $form eq 'KC' ? NFKC($_[0]) :
+ croak $PACKAGE."::normalize: invalid form name: $form";
}
1;
=head1 DESCRIPTION
-=head2 Normalization
+=head2 Normalization Forms
=over 4
These functions are interface of character data used internally.
If you want only to get unicode normalization forms,
-you need not to call them by yourself.
+you doesn't need call them by yourself.
=over 4
=item C<$uv_composite = getComposite($uv_here, $uv_next)>
-If the couple of two characters here and next (as codepoints) is composable
+If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Exclusions),
returns the codepoint of the composite.
+++ /dev/null
-
-=head1 NAME
-
-Unicode::Normalize - normalized forms of Unicode text
-
-=head1 SYNOPSIS
-
- use Unicode::Normalize;
-
- $string_NFD = NFD($raw_string); # Normalization Form D
- $string_NFC = NFC($raw_string); # Normalization Form C
- $string_NFKD = NFKD($raw_string); # Normalization Form KD
- $string_NFKC = NFKC($raw_string); # Normalization Form KC
-
- or
-
- use Unicode::Normalize 'normalize';
-
- $string_NFD = normalize('D', $raw_string); # Normalization Form D
- $string_NFC = normalize('C', $raw_string); # Normalization Form C
- $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
- $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
-
-=head1 DESCRIPTION
-
-=over 4
-
-=item C<$string_NFD = NFD($raw_string)>
-
-returns the Normalization Form D (formed by canonical decomposition).
-
-
-=item C<$string_NFC = NFC($raw_string)>
-
-returns the Normalization Form C (formed by canonical decomposition
-followed by canonical composition).
-
-=item C<$string_NFKD = NFKD($raw_string)>
-
-returns the Normalization Form KD (formed by compatibility decomposition).
-
-=item C<$string_NFKC = NFKC($raw_string)>
-
-returns the Normalization Form KC (formed by compatibility decomposition
-followed by B<canonical> composition).
-
-=item C<$normalized_string = normalize($form_name, $raw_string)>
-
-As C<$form_name>, one of the following names must be given.
-
- 'C' or 'NFC' for Normalization Form C
- 'D' or 'NFD' for Normalization Form D
- 'KC' or 'NFKC' for Normalization Form KC
- 'KD' or 'NFKD' for Normalization Form KD
-
-=back
-
-=head2 EXPORT
-
-C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
-
-C<normalize>: on request.
-
-=head1 AUTHOR
-
-SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
-
- http://homepage1.nifty.com/nomenclator/perl/
-
- Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
-
- This program is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself.
-
-=head1 SEE ALSO
-
-=over 4
-
-=item L<Lingua::KO::Hangul::Util>
-
-utility functions for Hangul Syllables
-
-=item http://www.unicode.org/unicode/reports/tr15/
-
-Unicode Normalization Forms - UAX #15
-
-=back
-
-=cut
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
SV*
-decompose(arg, compat)
+decompose(arg, compat = &PL_sv_no)
SV * arg
SV * compat
- PROTOTYPE: $
+ PROTOTYPE: $;$
PREINIT:
UV uv;
SV *src, *dst;
s = (U8*)SvPV(src, srclen);
e = s + srclen;
- dstlen = srclen + 1; /* equal or shorter, XXX */
+ dstlen = srclen + 1;
dst = newSV(dstlen);
(void)SvPOK_only(dst);
SvUTF8_on(dst);
/* S + C + S => S-S + C would be also blocked. */
if( uvComp && ! isExclusion(uvComp) && preCC <= curCC)
{
+ STRLEN leftcur, rightcur, dstcur;
+ leftcur = UNISKIP(uvComp);
+ rightcur = UNISKIP(uvS) + UNISKIP(uv);
+
+ if (leftcur > rightcur) {
+ dstcur = d - (U8*)SvPVX(dst);
+ dstlen += leftcur - rightcur;
+ d = (U8*)SvGROW(dst,dstlen) + dstcur;
+ }
+
/* preCC not changed to curCC */
uvS = uvComp;
} else if (! curCC && p < e) { /* blocked */
}
}
d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
- if((tmplen = t - tmp_start)) { /* uncomposed combining char */
+ tmplen = t - tmp_start;
+ if (tmplen) { /* uncomposed combining char */
t = (U8*)SvPVX(tmp);
while(tmplen--) *d++ = *t++;
}
uvS = uv;
} /* for */
- e = d; /* end of dst */
- d = (U8*)SvPVX(dst);
- SvCUR_set(dst, e - d);
+
+ SvCUR_set(dst, d - (U8*)SvPVX(dst));
RETVAL = dst;
OUTPUT:
RETVAL
sub _U_stringify {
sprintf '"%s"', join '',
- map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_;
+ map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
}
foreach my $hash (\%Canon, \%Compat) {
}
}
-sub utf8len {
- my $uv = shift;
- return $uv < 0x80 ? 1 :
- $uv < 0x800 ? 2 :
- $uv < 0x10000 ? 3 :
- $uv < 0x110000 ? 4 :
- croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
-}
-
-sub utfebcdiclen {
- my $uv = shift;
- return $uv < 0xA0 ? 1 :
- $uv < 0x400 ? 2 :
- $uv < 0x4000 ? 3 :
- $uv < 0x40000 ? 4 :
- $uv < 0x110000 ? 5 :
- croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
-}
-
my $prefix = "UNF_";
my $structname = "${prefix}complist";
our (%Comp1st, %CompList);
-my $errExpand = "$PACKAGE: A composable pair in %s "
- . "is longer than the composite in bytes!\n"
- . "%d + %d => %d\nQuit. Please inform the author...";
-
foreach(sort keys %Compos) {
my @a = unpack('U*', $_);
my $val = $Compos{$_};
my $name = sprintf "${structname}_%06x", $a[0];
$Comp1st{ $a[0] } = $name;
$CompList{ $name }{ $a[1] } = $val;
-
- if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) {
- croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val);
- }
- if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) {
- croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val);
- }
}
my $compinit =