Rename ext/Unicode/Normalize to ext/Unicode-Normalize
[p5sagit/p5-mst-13.2.git] / ext / Unicode-Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
BEGIN {
    # Compile-time sanity check: the module relies on pack('U', ...) turning
    # a code point into the corresponding character.  If packing 0x41 does
    # not give "A", abort loading with a plain message (the trailing newline
    # suppresses the file/line suffix).
    if ("A" ne pack('U', 0x41)) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
8
ac5ea531 9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
e524f5b2 14no warnings 'utf8';
15
our $VERSION = '1.02';
our $PACKAGE = __PACKAGE__;    # used as the prefix of croak() messages

require Exporter;
require DynaLoader;

our @ISA = qw(Exporter DynaLoader);

# Exported by default: the four normalization forms.
our @EXPORT = qw(NFC NFD NFKC NFKD);

# Exported on request only.
our @EXPORT_OK = (
    qw(normalize decompose reorder compose),
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
    qw(FCD checkFCD FCC checkFCC composeContiguous),
    qw(splitOnLastStarter),
);

# Export tags, e.g. "use Unicode::Normalize qw(:all);".
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
    fast      => [ qw(FCD checkFCD FCC checkFCC composeContiguous) ],
);
ac5ea531 39
82e740b6 40######
41
# Load the compiled (XS) part of the module.  bootstrap() is inherited from
# DynaLoader through @ISA; it is called here as an explicit class method
# instead of with indirect-object syntax.
Unicode::Normalize->bootstrap($VERSION);
43
82e740b6 44######
45
fe067ad9 46##
## utilities for tests
48##
49
# pack_U(@code_points)
# Test helper: builds a string from a list of code points using the
# 'U*' pack template.
sub pack_U {
    my @code_points = @_;
    return pack('U*', @code_points);
}
53
# unpack_U($string)
# Test helper: returns the code points of $string as unpacked with the
# 'U*' template.
sub unpack_U {
    my $string = shift(@_);
    # An empty pack('U*') result is appended before unpacking, exactly as in
    # the one-line form of this helper.
    # NOTE(review): presumably this forces the string into Perl's internal
    # Unicode representation so that 'U*' sees characters - confirm.
    my $marked = $string . pack('U*');
    return unpack('U*', $marked);
}
57
82e740b6 58
59##
60## normalization forms
61##
62
# FCD($string)
# Returns $string unchanged when checkFCD() reports it is already in FCD;
# otherwise returns NFD($string).
sub FCD ($) {
    my ($str) = @_;
    return $str if checkFCD($str);
    return NFD($str);
}
82e740b6 67
# Dispatch table used by normalize(): form name => normalization function.
# Each NF* form is reachable under its full name and under a short alias.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,
    NFD  => \&NFD,
    D    => \&NFD,
    NFKC => \&NFKC,
    KC   => \&NFKC,
    NFKD => \&NFKD,
    KD   => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
);
75
# normalize($form_name, $string)
# Looks $form_name up in %formNorm and returns the result of applying the
# matching normalization function to $string.  Croaks if the name is not a
# key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;
    croak($PACKAGE."::normalize: invalid form name: $form")
        unless exists $formNorm{$form};
    return $formNorm{$form}->($str);
}
85
82e740b6 86
87##
88## quick check
89##
90
# Dispatch table used by check(): form name => quick-check function.
# Each NF* form is reachable under its full name and under a short alias.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);
98
# check($form_name, $string)
# Looks $form_name up in %formCheck and returns the result of applying the
# matching quick-check function to $string.  Croaks if the name is not a
# key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;
    croak($PACKAGE."::check: invalid form name: $form")
        unless exists $formCheck{$form};
    return $formCheck{$form}->($str);
}
108
ac5ea531 1091;
110__END__
2a204b45 111
112=head1 NAME
113
f027f502 114Unicode::Normalize - Unicode Normalization Forms
2a204b45 115
116=head1 SYNOPSIS
117
a092bcfd 118(1) using function names exported by default:
119
2a204b45 120 use Unicode::Normalize;
121
8f118dcd 122 $NFD_string = NFD($string); # Normalization Form D
123 $NFC_string = NFC($string); # Normalization Form C
124 $NFKD_string = NFKD($string); # Normalization Form KD
125 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 126
a092bcfd 127(2) using function names exported on request:
2a204b45 128
129 use Unicode::Normalize 'normalize';
130
8f118dcd 131 $NFD_string = normalize('D', $string); # Normalization Form D
132 $NFC_string = normalize('C', $string); # Normalization Form C
133 $NFKD_string = normalize('KD', $string); # Normalization Form KD
134 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 135
136=head1 DESCRIPTION
137
00f2676f 138Parameters:
139
fe067ad9 140C<$string> is used as a string under character semantics (see F<perlunicode>).
00f2676f 141
fe067ad9 142C<$code_point> should be an unsigned integer representing a Unicode code point.
00f2676f 143
628bbff0 144Note: Between XSUB and pure Perl, there is an incompatibility
fe067ad9 145about the interpretation of C<$code_point> as a decimal number.
146XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
Do not use a floating-point number or a negative sign in C<$code_point>.
00f2676f 148
d85850a7 149=head2 Normalization Forms
2a204b45 150
151=over 4
152
8f118dcd 153=item C<$NFD_string = NFD($string)>
2a204b45 154
fe067ad9 155It returns the Normalization Form D (formed by canonical decomposition).
2a204b45 156
8f118dcd 157=item C<$NFC_string = NFC($string)>
2a204b45 158
fe067ad9 159It returns the Normalization Form C (formed by canonical decomposition
2a204b45 160followed by canonical composition).
161
8f118dcd 162=item C<$NFKD_string = NFKD($string)>
2a204b45 163
fe067ad9 164It returns the Normalization Form KD (formed by compatibility decomposition).
2a204b45 165
8f118dcd 166=item C<$NFKC_string = NFKC($string)>
2a204b45 167
fe067ad9 168It returns the Normalization Form KC (formed by compatibility decomposition
2a204b45 169followed by B<canonical> composition).
170
82e740b6 171=item C<$FCD_string = FCD($string)>
172
173If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
fe067ad9 174it returns the string without modification; otherwise it returns an FCD string.
82e740b6 175
Note: FCD is not always unique; several different forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
178
179=item C<$FCC_string = FCC($string)>
180
fe067ad9 181It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
82e740b6 182
Note: FCC is unique, as are the four normalization forms (NF*).
82e740b6 184
8f118dcd 185=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 186
fe067ad9 187It returns the normalization form of C<$form_name>.
188
2a204b45 189As C<$form_name>, one of the following names must be given.
190
82e740b6 191 'C' or 'NFC' for Normalization Form C (UAX #15)
192 'D' or 'NFD' for Normalization Form D (UAX #15)
193 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
194 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
195
196 'FCD' for "Fast C or D" Form (UTN #5)
197 'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45 198
199=back
200
8f118dcd 201=head2 Decomposition and Composition
202
203=over 4
204
fe067ad9 205=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
8f118dcd 206
fe067ad9 207It returns the concatenation of the decomposition of each character
208in the string.
8f118dcd 209
fe067ad9 210If the second parameter (a boolean) is omitted or false,
211the decomposition is canonical decomposition;
212if the second parameter (a boolean) is true,
213the decomposition is compatibility decomposition.
8f118dcd 214
fe067ad9 215The string returned is not always in NFD/NFKD. Reordering may be required.
8f118dcd 216
217 $NFD_string = reorder(decompose($string)); # eq. to NFD()
218 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
219
fe067ad9 220=item C<$reordered_string = reorder($string)>
8f118dcd 221
fe067ad9 222It returns the result of reordering the combining characters
223according to Canonical Ordering Behavior.
8f118dcd 224
fe067ad9 225For example, when you have a list of NFD/NFKD strings,
226you can get the concatenated NFD/NFKD string from them, by saying
8f118dcd 227
228 $concat_NFD = reorder(join '', @NFD_strings);
229 $concat_NFKD = reorder(join '', @NFKD_strings);
230
fe067ad9 231=item C<$composed_string = compose($string)>
8f118dcd 232
fe067ad9 233It returns the result of canonical composition
234without applying any decomposition.
8f118dcd 235
fe067ad9 236For example, when you have a NFD/NFKD string,
237you can get its NFC/NFKC string, by saying
8f118dcd 238
239 $NFC_string = compose($NFD_string);
240 $NFKC_string = compose($NFKD_string);
241
242=back
243
244=head2 Quick Check
245
82e740b6 246(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd 247
248The following functions check whether the string is in that normalization form.
249
fe067ad9 250The result returned will be one of the following:
8f118dcd 251
252 YES The string is in that normalization form.
253 NO The string is not in that normalization form.
254 MAYBE Dubious. Maybe yes, maybe no.
255
256=over 4
257
258=item C<$result = checkNFD($string)>
259
fe067ad9 260It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd 261
262=item C<$result = checkNFC($string)>
263
fe067ad9 264It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 265C<undef> if C<MAYBE>.
8f118dcd 266
267=item C<$result = checkNFKD($string)>
268
fe067ad9 269It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd 270
271=item C<$result = checkNFKC($string)>
272
fe067ad9 273It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 274C<undef> if C<MAYBE>.
8f118dcd 275
82e740b6 276=item C<$result = checkFCD($string)>
277
fe067ad9 278It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
82e740b6 279
280=item C<$result = checkFCC($string)>
281
fe067ad9 282It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 283C<undef> if C<MAYBE>.
82e740b6 284
fe067ad9 285Note: If a string is not in FCD, it must not be in FCC.
82e740b6 286So C<checkFCC($not_FCD_string)> should return C<NO>.
287
8f118dcd 288=item C<$result = check($form_name, $string)>
289
fe067ad9 290It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 291C<undef> if C<MAYBE>.
8f118dcd 292
628bbff0 293As C<$form_name>, one of the following names must be given.
294
295 'C' or 'NFC' for Normalization Form C (UAX #15)
296 'D' or 'NFD' for Normalization Form D (UAX #15)
297 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
298 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
299
300 'FCD' for "Fast C or D" Form (UTN #5)
301 'FCC' for "Fast C Contiguous" (UTN #5)
8f118dcd 302
303=back
304
305B<Note>
306
82e740b6 307In the cases of NFD, NFKD, and FCD, the answer must be
308either C<YES> or C<NO>. The answer C<MAYBE> may be returned
309in the cases of NFC, NFKC, and FCC.
8f118dcd 310
82e740b6 311A C<MAYBE> string should contain at least one combining character
312or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd 313the MAYBE_NFC/MAYBE_NFKC property.
82e740b6 314
8f118dcd 315Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
316and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 317C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd 318(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
319while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
320
628bbff0 321If you want to check exactly, compare the string with its NFC/NFKC/FCC.
322
323 if ($string eq NFC($string)) {
324 # $string is exactly normalized in NFC;
325 } else {
326 # $string is not normalized in NFC;
327 }
8f118dcd 328
628bbff0 329 if ($string eq NFKC($string)) {
330 # $string is exactly normalized in NFKC;
331 } else {
332 # $string is not normalized in NFKC;
333 }
8f118dcd 334
2a204b45 335=head2 Character Data
336
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.
2a204b45 340
341=over 4
342
fe067ad9 343=item C<$canonical_decomposition = getCanon($code_point)>
2a204b45 344
fe067ad9 345If the character is canonically decomposable (including Hangul Syllables),
346it returns the (full) canonical decomposition as a string.
347Otherwise it returns C<undef>.
8f118dcd 348
B<Note:> According to the Unicode standard, the canonical decomposition
of a character that is not canonically decomposable is the same as
the character itself.
8f118dcd 352
fe067ad9 353=item C<$compatibility_decomposition = getCompat($code_point)>
2a204b45 354
fe067ad9 355If the character is compatibility decomposable (including Hangul Syllables),
356it returns the (full) compatibility decomposition as a string.
357Otherwise it returns C<undef>.
2a204b45 358
B<Note:> According to the Unicode standard, the compatibility decomposition
of a character that is not compatibility decomposable is the same as
the character itself.
2a204b45 362
fe067ad9 363=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
2a204b45 364
fe067ad9 365If two characters here and next (as code points) are composable
8f118dcd 366(including Hangul Jamo/Syllables and Composition Exclusions),
fe067ad9 367it returns the code point of the composite.
368
369If they are not composable, it returns C<undef>.
2a204b45 370
fe067ad9 371=item C<$combining_class = getCombinClass($code_point)>
2a204b45 372
fe067ad9 373It returns the combining class (as an integer) of the character.
2a204b45 374
fe067ad9 375=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
2a204b45 376
fe067ad9 377It returns a boolean whether the character of the specified codepoint
378may be composed with the previous one in a certain composition
379(including Hangul Compositions, but excluding
380Composition Exclusions and Non-Starter Decompositions).
2a204b45 381
fe067ad9 382=item C<$is_exclusion = isExclusion($code_point)>
8f118dcd 383
fe067ad9 384It returns a boolean whether the code point is a composition exclusion.
8f118dcd 385
fe067ad9 386=item C<$is_singleton = isSingleton($code_point)>
8f118dcd 387
It returns a boolean whether the code point is a singleton.
8f118dcd 389
fe067ad9 390=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
8f118dcd 391
fe067ad9 392It returns a boolean whether the code point has Non-Starter Decomposition.
8f118dcd 393
fe067ad9 394=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
395
396It returns a boolean of the derived property Comp_Ex
397(Full_Composition_Exclusion). This property is generated from
398Composition Exclusions + Singletons + Non-Starter Decompositions.
399
400=item C<$NFD_is_NO = isNFD_NO($code_point)>
401
402It returns a boolean of the derived property NFD_NO
403(NFD_Quick_Check=No).
404
405=item C<$NFC_is_NO = isNFC_NO($code_point)>
406
407It returns a boolean of the derived property NFC_NO
408(NFC_Quick_Check=No).
409
410=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
411
412It returns a boolean of the derived property NFC_MAYBE
413(NFC_Quick_Check=Maybe).
414
415=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
416
417It returns a boolean of the derived property NFKD_NO
418(NFKD_Quick_Check=No).
419
420=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
421
422It returns a boolean of the derived property NFKC_NO
423(NFKC_Quick_Check=No).
424
425=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
426
427It returns a boolean of the derived property NFKC_MAYBE
428(NFKC_Quick_Check=Maybe).
2a204b45 429
430=back
431
628bbff0 432=head1 EXPORT
2a204b45 433
434C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
435
C<normalize> and some other functions: on request.
437
628bbff0 438=head1 CAVEATS
439
440=over 4
441
442=item Perl's version vs. Unicode version
443
444Since this module refers to perl core's Unicode database in the directory
445F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
446normalization implemented by this module depends on your perl's version.
447
fe067ad9 448 perl's version implemented Unicode version
449 5.6.1 3.0.1
450 5.7.2 3.1.0
451 5.7.3 3.1.1 (normalization is same as 3.1.0)
452 5.8.0 3.2.0
453 5.8.1-5.8.3 4.0.0
454 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
455 5.8.7-5.8.8 4.1.0
628bbff0 456
457=item Correction of decomposition mapping
458
459In older Unicode versions, a small number of characters (all of which are
460CJK compatibility ideographs as far as they have been found) may have
461an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
462Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
463nor provide any specific version of normalization. Therefore this module
464running on an older perl with an older Unicode database may use
465the erroneous decomposition mapping blindly conforming to the Unicode database.
466
467=item Revised definition of canonical composition
468
469In Unicode 4.1.0, the definition D2 of canonical composition (which
470affects NFC and NFKC) has been changed (see Public Review Issue #29
471and recent UAX #15). This module has used the newer definition
472since the version 0.07 (Oct 31, 2001).
2b8d773d 473This module will not support the normalization according to the older
628bbff0 474definition, even if the Unicode version implemented by perl is
475lower than 4.1.0.
476
477=back
478
2a204b45 479=head1 AUTHOR
480
a092bcfd 481SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
2a204b45 482
2b8d773d 483Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 484
628bbff0 485This module is free software; you can redistribute it
486and/or modify it under the same terms as Perl itself.
2a204b45 487
488=head1 SEE ALSO
489
490=over 4
491
e524f5b2 492=item http://www.unicode.org/reports/tr15/
2a204b45 493
494Unicode Normalization Forms - UAX #15
495
fe067ad9 496=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
497
498Composition Exclusion Table
499
14e6b36c 500=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd 501
502Derived Normalization Properties
503
628bbff0 504=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
505
506Normalization Corrections
507
508=item http://www.unicode.org/review/pr-29.html
509
510Public Review Issue #29: Normalization Issue
511
82e740b6 512=item http://www.unicode.org/notes/tn5/
513
514Canonical Equivalence in Applications - UTN #5
515
2a204b45 516=back
517
518=cut