no warnings 'utf8';
-our $VERSION = '0.30';
+our $VERSION = '0.32';
our $PACKAGE = __PACKAGE__;
require Exporter;
{
my $form = shift;
my $str = shift;
- return exists $formNorm{$form}
+ return exists $formNorm{$form}
? $formNorm{$form}->($str)
: croak $PACKAGE."::normalize: invalid form name: $form";
}
{
my $form = shift;
my $str = shift;
- return exists $formCheck{$form}
+ return exists $formCheck{$form}
? $formCheck{$form}->($str)
: croak $PACKAGE."::check: invalid form name: $form";
}
C<$codepoint> should be an unsigned integer
representing a Unicode code point.
-Note: Between XS edition and pure Perl edition,
-interpretation of C<$codepoint> as a decimal number has incompatibility.
-XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
+Note: XSUB and pure Perl are incompatible in how they interpret
+C<$codepoint> as a decimal number.
+XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating point number or a negative sign in C<$codepoint>.
=head2 Normalization Forms
=item C<$result = checkNFD($string)>
-returns C<YES> (C<1>) or C<NO> (C<empty string>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkNFC($string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
=item C<$result = checkNFKD($string)>
-returns C<YES> (C<1>) or C<NO> (C<empty string>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkNFKC($string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
=item C<$result = checkFCD($string)>
-returns C<YES> (C<1>) or C<NO> (C<empty string>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkFCC($string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
If a string is not in FCD, it cannot be in FCC either.
So C<checkFCC($not_FCD_string)> should return C<NO>.
=item C<$result = check($form_name, $string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
-C<$form_name> is alike to that for C<normalize()>.
+C<$form_name> must be one of the following names.
+
+ 'C' or 'NFC' for Normalization Form C (UAX #15)
+ 'D' or 'NFD' for Normalization Form D (UAX #15)
+ 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
+ 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
+
+ 'FCD' for "Fast C or D" Form (UTN #5)
+ 'FCC' for "Fast C Contiguous" (UTN #5)
=back
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
-If you want to check exactly, compare the string with its NFC/NFKC/FCC;
-i.e.,
+If you want to check exactly, compare the string with its NFC/NFKC/FCC.
+
+ if ($string eq NFC($string)) {
+ # $string is exactly normalized in NFC;
+ } else {
+ # $string is not normalized in NFC;
+ }
- $string eq NFC($string) # thorough than checkNFC($string)
- $string eq NFKC($string) # thorough than checkNFKC($string)
- $string eq FCC($string) # thorough than checkFCC($string)
+ if ($string eq NFKC($string)) {
+ # $string is exactly normalized in NFKC;
+ } else {
+ # $string is not normalized in NFKC;
+ }
=head2 Character Data
=back
-=head2 EXPORT
+=head1 EXPORT
C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
C<normalize> and some other functions: on request.
+=head1 CAVEATS
+
+=over 4
+
+=item Perl's version vs. Unicode version
+
+Since this module refers to perl core's Unicode database in the directory
+F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
+normalization implemented by this module depends on your perl's version.
+
+ perl's version implemented Unicode version
+ 5.6.1 3.0.1
+ 5.7.2 3.1.0
+ 5.7.3 3.1.1 (same normalized form as that of 3.1.0)
+ 5.8.0 3.2.0
+ 5.8.1-5.8.3 4.0.0
+ 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0)
+
+=item Correction of decomposition mapping
+
+In older Unicode versions, a small number of characters (all of which are
+CJK compatibility ideographs as far as they have been found) may have
+an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
+However, this module neither refers to F<NormalizationCorrections.txt>
+nor provides any specific version of normalization. Therefore this module,
+when running on an older perl with an older Unicode database, may use
+the erroneous decomposition mapping, blindly conforming to that database.
+
+=item Revised definition of canonical composition
+
+In Unicode 4.1.0, the definition D2 of canonical composition (which
+affects NFC and NFKC) has been changed (see Public Review Issue #29
+and recent UAX #15). This module has used the newer definition
+since version 0.07 (Oct 31, 2001).
+This module does not support normalization according to the older
+definition, even if the Unicode version implemented by perl is
+lower than 4.1.0.
+
+=back
+
=head1 AUTHOR
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
- http://homepage1.nifty.com/nomenclator/perl/
-
- Copyright(C) 2001-2004, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.
- This module is free software; you can redistribute it
- and/or modify it under the same terms as Perl itself.
+This module is free software; you can redistribute it
+and/or modify it under the same terms as Perl itself.
=head1 SEE ALSO
Derived Normalization Properties
+=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
+
+Normalization Corrections
+
+=item http://www.unicode.org/review/pr-29.html
+
+Public Review Issue #29: Normalization Issue
+
=item http://www.unicode.org/notes/tn5/
Canonical Equivalence in Applications - UTN #5
=back
=cut
-
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 18 };
+BEGIN { plan tests => 29 };
use Unicode::Normalize qw(normalize);
ok(1); # If we made it this far, we're ok.
ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000");
ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
+ok(hexNFC("AC00 11A7"), "AC00 11A7");
+ok(hexNFC("AC00 11A8"), "AC01");
+ok(hexNFC("AC00 11A9"), "AC02");
+ok(hexNFC("AC00 11C2"), "AC1B");
+ok(hexNFC("AC00 11C3"), "AC00 11C3");
+
+# Test Cases from Public Review Issue #29: Normalization Issue
+# cf. http://www.unicode.org/review/pr-29.html
+ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E");
+ok(hexNFC("1100 0300 1161"), "1100 0300 1161");
+
+ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300");
+ok(hexNFC("1100 1161 0300"), "AC00 0300");
+
+ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327");
+ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327");
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 20 };
+BEGIN { plan tests => 31 };
use Unicode::Normalize;
ok(1); # If we made it this far, we're ok.
ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000");
ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
-# should be unary.
+ok(hexNFC("AC00 11A7"), "AC00 11A7");
+ok(hexNFC("AC00 11A8"), "AC01");
+ok(hexNFC("AC00 11A9"), "AC02");
+ok(hexNFC("AC00 11C2"), "AC1B");
+ok(hexNFC("AC00 11C3"), "AC00 11C3");
+
+# Test Cases from Public Review Issue #29: Normalization Issue
+# cf. http://www.unicode.org/review/pr-29.html
+ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E");
+ok(hexNFC("1100 0300 1161"), "1100 0300 1161");
+
+ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300");
+ok(hexNFC("1100 1161 0300"), "AC00 0300");
+
+ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327");
+ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327");
+
+# NFC() should be unary.
my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62);
my $str12 = _pack_U(0x1EA4, 0x62);
ok(NFC $str11 eq $str12);
+# NFD() should be unary.
my $str21 = _pack_U(0xE0, 0xAC00);
my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161);
ok(NFD $str21 eq $str22);