from SADAHIRO Tomoyuki.
p4raw-id: //depot/perl@16260
Revision history for Perl extension Unicode::Normalize.
+0.17 Sun Apr 28 23:13:32 2002
+ - now normalize('NFC',$1) should work.
+ - Some croak()'s are added in mkheader.
+ - synchronization with bleadperl.
+ - Change 15596: by me
+ - Change 16136: by pudge
+
0.16 Thu Mar 21 13:36:14 2002
- synchronization with bleadperl.
- Change 15318: by jhi
use warnings;
use Carp;
-our $VERSION = '0.16';
+our $VERSION = '0.17';
our $PACKAGE = __PACKAGE__;
require Exporter;
sub normalize($$)
{
my $form = shift;
+ my $str = shift;
$form =~ s/^NF//;
return
- $form eq 'D' ? NFD ($_[0]) :
- $form eq 'C' ? NFC ($_[0]) :
- $form eq 'KD' ? NFKD($_[0]) :
- $form eq 'KC' ? NFKC($_[0]) :
+ $form eq 'D' ? NFD ($str) :
+ $form eq 'C' ? NFC ($str) :
+ $form eq 'KD' ? NFKD($str) :
+ $form eq 'KC' ? NFKC($str) :
croak $PACKAGE."::normalize: invalid form name: $form";
}
sub check($$)
{
my $form = shift;
+ my $str = shift;
$form =~ s/^NF//;
return
- $form eq 'D' ? checkNFD ($_[0]) :
- $form eq 'C' ? checkNFC ($_[0]) :
- $form eq 'KD' ? checkNFKD($_[0]) :
- $form eq 'KC' ? checkNFKC($_[0]) :
+ $form eq 'D' ? checkNFD ($str) :
+ $form eq 'C' ? checkNFC ($str) :
+ $form eq 'KD' ? checkNFKD($str) :
+ $form eq 'KC' ? checkNFKC($str) :
croak $PACKAGE."::check: invalid form name: $form";
}
=head1 NAME
-Unicode::Normalize - normalized forms of Unicode text
+Unicode::Normalize - Unicode Normalization Forms
=head1 SYNOPSIS
=item C<$result = checkNFD($string)>
-returns YES (1) or NO (empty string).
+returns C<YES> (C<1>) or C<NO> (an empty string).
=item C<$result = checkNFC($string)>
-returns YES (1), NO (empty string), or MAYBE (undef).
+returns C<YES> (C<1>), C<NO> (an empty string), or C<MAYBE> (C<undef>).
=item C<$result = checkNFKD($string)>
-returns YES (1) or NO (empty string).
+returns C<YES> (C<1>) or C<NO> (an empty string).
=item C<$result = checkNFKC($string)>
-returns YES (1), NO (empty string), or MAYBE (undef).
+returns C<YES> (C<1>), C<NO> (an empty string), or C<MAYBE> (C<undef>).
=item C<$result = check($form_name, $string)>
-returns YES (1), NO (empty string), or MAYBE (undef).
+returns C<YES> (C<1>), C<NO> (an empty string), or C<MAYBE> (C<undef>).
C<$form_name> is the same as that for C<normalize()>.
the MAYBE_NFC/MAYBE_NFKC property.
Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
-Though, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
+C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.
-If it is not decomposable, returns undef.
+If it is not decomposable, returns C<undef>.
=item C<$compatibility_decomposed = getCompat($codepoint)>
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.
-If it is not decomposable, returns undef.
+If it is not decomposable, returns C<undef>.
=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.
-If they are not composable, returns undef.
+If they are not composable, returns C<undef>.
=item C<$combining_class = getCombinClass($codepoint)>
-Unicode/Normalize version 0.16
+Unicode/Normalize version 0.17
===================================
-Unicode::Normalize - normalized forms of Unicode text
+Unicode::Normalize - Unicode Normalization Forms
SYNOPSIS
CAVEAT
-(1) In bleadperl, unicore/CompExcl.txt is renamed
- unicore/CompositionExclusions.txt.
+(1) In the perl-current, unicore/CompExcl.txt
+ is renamed unicore/CompositionExclusions.txt.
(2) When these unicore/*.* files are updated;
#!perl
#
# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
-# "unfcmp.h", and "unfexc.h"
+# "unfcmp.h", and "unfexc.h"
# from CombiningClass.pl, Decomposition.pl, CompExcl.txt
# in lib/unicore or unicode directory
# for Unicode::Normalize.xs. (cf. Makefile.PL)
#
+# Usage: <perl mkheader> on the command line
+# or <do 'mkheader'> in perl
+#
use 5.006;
use strict;
use warnings;
our %Combin; # $codepoint => $number : combining class
our %Canon; # $codepoint => $hexstring : canonical decomp.
our %Compat; # $codepoint => $hexstring : compat. decomp.
-our %Compos; # $string => $codepoint : composite
+our %Compos; # $1st,$2nd => $codepoint : composite
our %Exclus; # $codepoint => 1 : composition exclusions
our %Single; # $codepoint => 1 : singletons
our %NonStD; # $codepoint => 1 : non-starter decompositions
my @tab = split /\t/, $1;
my $compat = $tab[2] =~ s/<[^>]+>//;
my $dec = [ _getHexArray($tab[2]) ]; # decomposition
- my $com = pack('U*', @$dec); # composable sequence
my $ini = hex($tab[0]); # initial decomposable character
if ($tab[1] eq '') {
$Compat{ $ini } = $dec;
if (! $compat) {
$Canon{ $ini } = $dec;
- if (@$dec > 1) {
+ if (@$dec == 2) {
if ($Combin{ $dec->[0] }) {
$NonStD{ $ini } = 1;
} else {
- $Compos{ $com } = $ini;
+ $Compos{ $dec->[0] }{ $dec->[1] } = $ini;
}
- } else {
+ } elsif (@$dec == 1) {
$Single{ $ini } = 1;
+ } else {
+ croak("Weird Canonical Decomposition of U+$tab[0]");
}
}
} else {
if (! $compat) {
$Canon{ $u } = $dec;
- if (@$dec > 1) {
+ if (@$dec == 2) {
if ($Combin{ $dec->[0] }) {
$NonStD{ $u } = 1;
} else {
- $Compos{ $com } = $u;
+ $Compos{ $dec->[0] }{ $dec->[1] } = $u;
}
- } else {
+ } elsif (@$dec == 1) {
$Single{ $u } = 1;
+ } else {
+ croak("Weird Canonical Decomposition of U+$tab[0]");
}
}
}
sprintf '"%s"', join '',
map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
}
+# Do we need to say <pack 'U*', map utf8::unicode_to_native($_),>
+# instead of <pack 'U*',> for EBCDIC?
foreach my $hash (\%Canon, \%Compat) {
foreach my $key (keys %$hash) {
our (%Comp1st, %Comp2nd, %CompList);
-foreach (sort keys %Compos) {
- my @a = unpack('U*', $_);
- my $val = $Compos{$_};
- my $name = sprintf "${structname}_%06x", $a[0];
- $Comp1st{$a[0]} = $name;
- $Comp2nd{$a[1]} = 1 if ! $Exclus{$Compos{$_}} && ! $Combin{$a[0]};
- $CompList{$name}{$a[1]} = $val;
+foreach my $c1 (keys %Compos) {
+ my $name = sprintf "${structname}_%06x", $c1;
+ $Comp1st{$c1} = $name;
+
+ foreach my $c2 (keys %{ $Compos{$c1} }) {
+ my $composite = $Compos{$c1}{$c2};
+ $Comp2nd{$c2} = 1 if ! $Exclus{$composite} && ! $Combin{$c1};
+ $CompList{$name}{$c2} = $composite;
+ }
}
# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
-foreach (0x1161..0x1175, 0x11A8..0x11C2) {
- $Comp2nd{$_} = 1;
+foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
+ $Comp2nd{$j} = 1;
}
my $compinit =
print $init if defined $init;
foreach my $uv (keys %$hash) {
+ croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
+ unless $uv <= 0x10FFFF;
my @c = unpack 'CCCC', pack 'N', $uv;
$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
}
BEGIN {
if (ord("A") == 193) {
- print "1..0 # Unicode::Normalize not ported to EBCDIC\n";
- exit 0;
+ print "1..0 # Unicode::Normalize not ported to EBCDIC\n";
+ exit 0;
}
}
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 10 };
+BEGIN { plan tests => 13 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
print getCombinClass( 0) == 0
&& getCombinClass( 768) == 230
&& getCombinClass(1809) == 36
-# && getCombinClass(119143) == 1 # U+1D167, a Unicode 3.1 character
+ && ($] < 5.007003 || getCombinClass(0x1D167) == 1) # Unicode 3.1
? "ok" : "not ok", " 2\n";
print ! defined getCanon( 0)
&& answer(checkNFKD(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "NO"
&& answer(checkNFKC(pack('U*', 0x20, 0xC1, 0xAE00, 0x2025))) eq "NO"
? "ok" : "not ok", " 10\n";
+
+"012ABC" =~ /(\d+)(\w+)/;
+print "012" eq NFC $1 && "ABC" eq NFC $2
+ ? "ok" : "not ok", " 11\n";
+
+print "012" eq normalize('C', $1) && "ABC" eq normalize('C', $2)
+ ? "ok" : "not ok", " 12\n";
+
+print "012" eq normalize('NFC', $1) && "ABC" eq normalize('NFC', $2)
+ ? "ok" : "not ok", " 13\n";
+ # s/^NF// in normalize() must not prevent using $1, $&, etc.
+