ext/Unicode-Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
     # Compile-time sanity check: pack('U', 0x41) has to yield the string "A".
     # If it does not, loading the module is aborted with the message below.
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Normalize cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
     # Switch off the 'utf8' warning category for the rest of this file.
     # NOTE(review): presumably because the module is expected to handle
     # arbitrary code points without warning -- confirm.
  14 no warnings 'utf8';
  15
     # $PACKAGE holds this package's name; normalize() and check() use it
     # as the prefix of their croak() messages.
  16 our $VERSION = '1.03';
  17 our $PACKAGE = __PACKAGE__;
  18
  19 require Exporter;
  20 require DynaLoader;
  21
     # Export interface:
     #   @EXPORT      - the four normalization functions, exported by default
     #   @EXPORT_OK   - everything else, exported only on request
     #   %EXPORT_TAGS - named groups (:all, :normalize, :check, :fast)
     # Most of these names are not defined as Perl subs in this file.
     # NOTE(review): presumably they are supplied by the code loaded by the
     # bootstrap call below -- confirm against the XS source.
  22 our @ISA = qw(Exporter DynaLoader);
  23 our @EXPORT = qw( NFC NFD NFKC NFKD );
  24 our @EXPORT_OK = qw(
  25     normalize decompose reorder compose
  26     checkNFD checkNFKD checkNFC checkNFKC check
  27     getCanon getCompat getComposite getCombinClass
  28     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  29     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  30     FCD checkFCD FCC checkFCC composeContiguous
  31     splitOnLastStarter
  32 );
  33 our %EXPORT_TAGS = (
  34     all       => [ @EXPORT, @EXPORT_OK ],
  35     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  36     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  37     fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
  38 );
  39
  40 ######
  41
     # Invoke the bootstrap method (inherited through @ISA; DynaLoader is
     # required above) for this package, passing $VERSION along.
  42 bootstrap Unicode::Normalize $VERSION;
  43
  44 ######
  45
  46 ##
  47 ## utilities for tests
  48 ##
  49
  50 sub pack_U {
  51     return pack('U*', @_);
  52 }
  53
  54 sub unpack_U {
  55     return unpack('U*', shift(@_).pack('U*'));
  56 }
  57
  58
  59 ##
  60 ## normalization forms
  61 ##
  62
  63 sub FCD ($) {
  64     my $str = shift;
  65     return checkFCD($str) ? $str : NFD($str);
  66 }
  67
  68 our %formNorm = (
  69     NFC  => \&NFC,      C  => \&NFC,
  70     NFD  => \&NFD,      D  => \&NFD,
  71     NFKC => \&NFKC,     KC => \&NFKC,
  72     NFKD => \&NFKD,     KD => \&NFKD,
  73     FCD  => \&FCD,      FCC => \&FCC,
  74 );
  75
  76 sub normalize($$)
  77 {
  78     my $form = shift;
  79     my $str = shift;
  80     if (exists $formNorm{$form}) {
  81         return $formNorm{$form}->($str);
  82     }
  83     croak($PACKAGE."::normalize: invalid form name: $form");
  84 }
  85
  86
  87 ##
  88 ## quick check
  89 ##
  90
  91 our %formCheck = (
  92     NFC  => \&checkNFC,         C  => \&checkNFC,
  93     NFD  => \&checkNFD,         D  => \&checkNFD,
  94     NFKC => \&checkNFKC,        KC => \&checkNFKC,
  95     NFKD => \&checkNFKD,        KD => \&checkNFKD,
  96     FCD  => \&checkFCD,         FCC => \&checkFCC,
  97 );
  98
  99 sub check($$)
 100 {
 101     my $form = shift;
 102     my $str = shift;
 103     if (exists $formCheck{$form}) {
 104         return $formCheck{$form}->($str);
 105     }
 106     croak($PACKAGE."::check: invalid form name: $form");
 107 }
 108
 109 1;
 110 __END__
 111
 112 =head1 NAME
 113
 114 Unicode::Normalize - Unicode Normalization Forms
 115
 116 =head1 SYNOPSIS
 117
 118 (1) using function names exported by default:
 119
 120   use Unicode::Normalize;
 121
 122   $NFD_string  = NFD($string);  # Normalization Form D
 123   $NFC_string  = NFC($string);  # Normalization Form C
 124   $NFKD_string = NFKD($string); # Normalization Form KD
 125   $NFKC_string = NFKC($string); # Normalization Form KC
 126
 127 (2) using function names exported on request:
 128
 129   use Unicode::Normalize 'normalize';
 130
 131   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 132   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 133   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 134   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 135
 136 =head1 DESCRIPTION
 137
 138 Parameters:
 139
 140 C<$string> is used as a string under character semantics (see F<perlunicode>).
 141
 142 C<$code_point> should be an unsigned integer representing a Unicode code point.
 143
 144 Note: Between XSUB and pure Perl, there is an incompatibility
 145 about the interpretation of C<$code_point> as a decimal number.
 146 XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
 147 Do not use a floating point number or a negative sign in C<$code_point>.
 148
 149 =head2 Normalization Forms
 150
 151 =over 4
 152
 153 =item C<$NFD_string = NFD($string)>
 154
 155 It returns the Normalization Form D (formed by canonical decomposition).
 156
 157 =item C<$NFC_string = NFC($string)>
 158
 159 It returns the Normalization Form C (formed by canonical decomposition
 160 followed by canonical composition).
 161
 162 =item C<$NFKD_string = NFKD($string)>
 163
 164 It returns the Normalization Form KD (formed by compatibility decomposition).
 165
 166 =item C<$NFKC_string = NFKC($string)>
 167
 168 It returns the Normalization Form KC (formed by compatibility decomposition
 169 followed by B<canonical> composition).
 170
 171 =item C<$FCD_string = FCD($string)>
 172
 173 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
 174 it returns the string without modification; otherwise it returns an FCD string.
 175
 176 Note: FCD is not always unique; several distinct forms may be equivalent
 177 to each other. C<FCD()> will return one of these equivalent forms.
 178
 179 =item C<$FCC_string = FCC($string)>
 180
 181 It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 182
 183 Note: FCC is unique, as are the four normalization forms (NF*).
 184
 185 =item C<$normalized_string = normalize($form_name, $string)>
 186
 187 It returns the normalization form of C<$form_name>.
 188
 189 As C<$form_name>, one of the following names must be given.
 190
 191   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 192   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 193   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 194   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 195
 196   'FCD'          for "Fast C or D" Form  (UTN #5)
 197   'FCC'          for "Fast C Contiguous" (UTN #5)
 198
 199 =back
 200
 201 =head2 Decomposition and Composition
 202
 203 =over 4
 204
 205 =item C<$decomposed_string = decompose($string [, $useCompatMapping])>
 206
 207 It returns the concatenation of the decomposition of each character
 208 in the string.
 209
 210 If the second parameter (a boolean) is omitted or false,
 211 the decomposition is canonical decomposition;
 212 if the second parameter (a boolean) is true,
 213 the decomposition is compatibility decomposition.
 214
 215 The string returned is not always in NFD/NFKD. Reordering may be required.
 216
 217     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 218     $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 219
 220 =item C<$reordered_string = reorder($string)>
 221
 222 It returns the result of reordering the combining characters
 223 according to Canonical Ordering Behavior.
 224
 225 For example, when you have a list of NFD/NFKD strings,
 226 you can get the concatenated NFD/NFKD string from them, by saying
 227
 228     $concat_NFD  = reorder(join '', @NFD_strings);
 229     $concat_NFKD = reorder(join '', @NFKD_strings);
 230
 231 =item C<$composed_string = compose($string)>
 232
 233 It returns the result of canonical composition
 234 without applying any decomposition.
 235
 236 For example, when you have a NFD/NFKD string,
 237 you can get its NFC/NFKC string, by saying
 238
 239     $NFC_string  = compose($NFD_string);
 240     $NFKC_string = compose($NFKD_string);
 241
 242 =back
 243
 244 =head2 Quick Check
 245
 246 (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
 247
 248 The following functions check whether the string is in that normalization form.
 249
 250 The result returned will be one of the following:
 251
 252     YES     The string is in that normalization form.
 253     NO      The string is not in that normalization form.
 254     MAYBE   Dubious. Maybe yes, maybe no.
 255
 256 =over 4
 257
 258 =item C<$result = checkNFD($string)>
 259
 260 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 261
 262 =item C<$result = checkNFC($string)>
 263
 264 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 265 C<undef> if C<MAYBE>.
 266
 267 =item C<$result = checkNFKD($string)>
 268
 269 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 270
 271 =item C<$result = checkNFKC($string)>
 272
 273 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 274 C<undef> if C<MAYBE>.
 275
 276 =item C<$result = checkFCD($string)>
 277
 278 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 279
 280 =item C<$result = checkFCC($string)>
 281
 282 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 283 C<undef> if C<MAYBE>.
 284
 285 Note: If a string is not in FCD, it cannot be in FCC.
 286 So C<checkFCC($not_FCD_string)> should return C<NO>.
 287
 288 =item C<$result = check($form_name, $string)>
 289
 290 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 291 C<undef> if C<MAYBE>.
 292
 293 As C<$form_name>, one of the following names must be given.
 294
 295   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 296   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 297   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 298   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 299
 300   'FCD'          for "Fast C or D" Form  (UTN #5)
 301   'FCC'          for "Fast C Contiguous" (UTN #5)
 302
 303 =back
 304
 305 B<Note>
 306
 307 In the cases of NFD, NFKD, and FCD, the answer must be
 308 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
 309 in the cases of NFC, NFKC, and FCC.
 310
 311 A C<MAYBE> string should contain at least one combining character
 312 or the like. For example, C<COMBINING ACUTE ACCENT> has
 313 the MAYBE_NFC/MAYBE_NFKC property.
 314
 315 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 316 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 317 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 318 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 319 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 320
 321 If you want to check exactly, compare the string with its NFC/NFKC/FCC.
 322
 323     if ($string eq NFC($string)) {
 324         # $string is exactly normalized in NFC;
 325     } else {
 326         # $string is not normalized in NFC;
 327     }
 328
 329     if ($string eq NFKC($string)) {
 330         # $string is exactly normalized in NFKC;
 331     } else {
 332         # $string is not normalized in NFKC;
 333     }
 334
 335 =head2 Character Data
 336
 337 These functions are an interface to the character data used internally.
 338 If you only want to get Unicode normalization forms, you don't need to
 339 call them yourself.
 340
 341 =over 4
 342
 343 =item C<$canonical_decomposition = getCanon($code_point)>
 344
 345 If the character is canonically decomposable (including Hangul Syllables),
 346 it returns the (full) canonical decomposition as a string.
 347 Otherwise it returns C<undef>.
 348
 349 B<Note:> According to the Unicode standard, the canonical decomposition
 350 of a character that is not canonically decomposable is the same as
 351 the character itself.
 352
 353 =item C<$compatibility_decomposition = getCompat($code_point)>
 354
 355 If the character is compatibility decomposable (including Hangul Syllables),
 356 it returns the (full) compatibility decomposition as a string.
 357 Otherwise it returns C<undef>.
 358
 359 B<Note:> According to the Unicode standard, the compatibility decomposition
 360 of a character that is not compatibility decomposable is the same as
 361 the character itself.
 362
 363 =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
 364
 365 If two characters here and next (as code points) are composable
 366 (including Hangul Jamo/Syllables and Composition Exclusions),
 367 it returns the code point of the composite.
 368
 369 If they are not composable, it returns C<undef>.
 370
 371 =item C<$combining_class = getCombinClass($code_point)>
 372
 373 It returns the combining class (as an integer) of the character.
 374
 375 =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
 376
 377 It returns a boolean indicating whether the character of the specified
 378 code point may be composed with the previous one in a certain composition
 379 (including Hangul Compositions, but excluding
 380 Composition Exclusions and Non-Starter Decompositions).
 381
 382 =item C<$is_exclusion = isExclusion($code_point)>
 383
 384 It returns a boolean indicating whether the code point is a composition exclusion.
 385
 386 =item C<$is_singleton = isSingleton($code_point)>
 387
 388 It returns a boolean indicating whether the code point is a singleton.
 389
 390 =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
 391
 392 It returns a boolean indicating whether the code point has a Non-Starter Decomposition.
 393
 394 =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
 395
 396 It returns a boolean of the derived property Comp_Ex
 397 (Full_Composition_Exclusion). This property is generated from
 398 Composition Exclusions + Singletons + Non-Starter Decompositions.
 399
 400 =item C<$NFD_is_NO = isNFD_NO($code_point)>
 401
 402 It returns a boolean of the derived property NFD_NO
 403 (NFD_Quick_Check=No).
 404
 405 =item C<$NFC_is_NO = isNFC_NO($code_point)>
 406
 407 It returns a boolean of the derived property NFC_NO
 408 (NFC_Quick_Check=No).
 409
 410 =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
 411
 412 It returns a boolean of the derived property NFC_MAYBE
 413 (NFC_Quick_Check=Maybe).
 414
 415 =item C<$NFKD_is_NO = isNFKD_NO($code_point)>
 416
 417 It returns a boolean of the derived property NFKD_NO
 418 (NFKD_Quick_Check=No).
 419
 420 =item C<$NFKC_is_NO = isNFKC_NO($code_point)>
 421
 422 It returns a boolean of the derived property NFKC_NO
 423 (NFKC_Quick_Check=No).
 424
 425 =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
 426
 427 It returns a boolean of the derived property NFKC_MAYBE
 428 (NFKC_Quick_Check=Maybe).
 429
 430 =back
 431
 432 =head1 EXPORT
 433
 434 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 435
 436 C<normalize> and some other functions: on request.
 437
 438 =head1 CAVEATS
 439
 440 =over 4
 441
 442 =item Perl's version vs. Unicode version
 443
 444 Since this module refers to perl core's Unicode database in the directory
 445 F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
 446 normalization implemented by this module depends on your perl's version.
 447
 448     perl's version     implemented Unicode version
 449        5.6.1              3.0.1
 450        5.7.2              3.1.0
 451        5.7.3              3.1.1 (normalization is same as 3.1.0)
 452        5.8.0              3.2.0
 453      5.8.1-5.8.3          4.0.0
 454      5.8.4-5.8.6          4.0.1 (normalization is same as 4.0.0)
 455      5.8.7-5.8.8          4.1.0
 456        5.10.0             5.0.0
 457        5.8.9              5.1.0
 458
 459 =item Correction of decomposition mapping
 460
 461 In older Unicode versions, a small number of characters (all of which are
 462 CJK compatibility ideographs as far as they have been found) may have
 463 an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
 464 Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
 465 nor provide any specific version of normalization. Therefore this module
 466 running on an older perl with an older Unicode database may use
 467 the erroneous decomposition mapping blindly conforming to the Unicode database.
 468
 469 =item Revised definition of canonical composition
 470
 471 In Unicode 4.1.0, the definition D2 of canonical composition (which
 472 affects NFC and NFKC) has been changed (see Public Review Issue #29
 473 and recent UAX #15). This module has used the newer definition
 474 since the version 0.07 (Oct 31, 2001).
 475 This module will not support the normalization according to the older
 476 definition, even if the Unicode version implemented by perl is
 477 lower than 4.1.0.
 478
 479 =back
 480
 481 =head1 AUTHOR
 482
 483 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
 484
 485 Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
 486
 487 This module is free software; you can redistribute it
 488 and/or modify it under the same terms as Perl itself.
 489
 490 =head1 SEE ALSO
 491
 492 =over 4
 493
 494 =item http://www.unicode.org/reports/tr15/
 495
 496 Unicode Normalization Forms - UAX #15
 497
 498 =item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
 499
 500 Composition Exclusion Table
 501
 502 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 503
 504 Derived Normalization Properties
 505
 506 =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
 507
 508 Normalization Corrections
 509
 510 =item http://www.unicode.org/review/pr-29.html
 511
 512 Public Review Issue #29: Normalization Issue
 513
 514 =item http://www.unicode.org/notes/tn5/
 515
 516 Canonical Equivalence in Applications - UTN #5
 517
 518 =back
 519
 520 =cut