ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Normalize cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 no warnings 'utf8';
  15
  16 our $VERSION = '0.28';
  17 our $PACKAGE = __PACKAGE__;
  18
  19 require Exporter;
  20 require DynaLoader;
  21
  22 our @ISA = qw(Exporter DynaLoader);
  23 our @EXPORT = qw( NFC NFD NFKC NFKD );
  24 our @EXPORT_OK = qw(
  25     normalize decompose reorder compose
  26     checkNFD checkNFKD checkNFC checkNFKC check
  27     getCanon getCompat getComposite getCombinClass
  28     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  29     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  30     FCD checkFCD FCC checkFCC composeContiguous
  31     splitOnLastStarter
  32 );
  33 our %EXPORT_TAGS = (
  34     all       => [ @EXPORT, @EXPORT_OK ],
  35     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  36     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  37     fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
  38 );
  39
  40 ######
  41
  42 bootstrap Unicode::Normalize $VERSION;
  43
  44 ######
  45
  46 sub pack_U {
  47     return pack('U*', @_);
  48 }
  49
  50 sub unpack_U {
  51     return unpack('U*', pack('U*').shift);
  52 }
  53
  54
  55 ##
  56 ## normalization forms
  57 ##
  58
  59 use constant COMPAT => 1;
  60
  61 sub NFD  ($) { reorder(decompose($_[0])) }
  62 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
  63 sub NFC  ($) { compose(reorder(decompose($_[0]))) }
  64 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
  65
  66 sub FCD ($) {
  67     my $str = shift;
  68     return checkFCD($str) ? $str : NFD($str);
  69 }
  70 sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) }
  71
  72 our %formNorm = (
  73     NFC  => \&NFC,      C  => \&NFC,
  74     NFD  => \&NFD,      D  => \&NFD,
  75     NFKC => \&NFKC,     KC => \&NFKC,
  76     NFKD => \&NFKD,     KD => \&NFKD,
  77     FCD  => \&FCD,      FCC => \&FCC,
  78 );
  79
  80 sub normalize($$)
  81 {
  82     my $form = shift;
  83     my $str = shift;
  84     return exists $formNorm{$form}
  85         ? $formNorm{$form}->($str)
  86         : croak $PACKAGE."::normalize: invalid form name: $form";
  87 }
  88
  89
  90 ##
  91 ## quick check
  92 ##
  93
  94 our %formCheck = (
  95     NFC  => \&checkNFC,         C  => \&checkNFC,
  96     NFD  => \&checkNFD,         D  => \&checkNFD,
  97     NFKC => \&checkNFKC,        KC => \&checkNFKC,
  98     NFKD => \&checkNFKD,        KD => \&checkNFKD,
  99     FCD  => \&checkFCD,         FCC => \&checkFCC,
 100 );
 101
 102 sub check($$)
 103 {
 104     my $form = shift;
 105     my $str = shift;
 106     return exists $formCheck{$form}
 107         ? $formCheck{$form}->($str)
 108         : croak $PACKAGE."::check: invalid form name: $form";
 109 }
 110
 111 1;
 112 __END__
 113
 114 =head1 NAME
 115
 116 Unicode::Normalize - Unicode Normalization Forms
 117
 118 =head1 SYNOPSIS
 119
 120   use Unicode::Normalize;
 121
 122   $NFD_string  = NFD($string);  # Normalization Form D
 123   $NFC_string  = NFC($string);  # Normalization Form C
 124   $NFKD_string = NFKD($string); # Normalization Form KD
 125   $NFKC_string = NFKC($string); # Normalization Form KC
 126
 127    or
 128
 129   use Unicode::Normalize 'normalize';
 130
 131   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 132   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 133   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 134   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 135
 136 =head1 DESCRIPTION
 137
 138 Parameters:
 139
 140 C<$string> is used as a string under character semantics
 141 (see F<perlunicode>).
 142
 143 C<$codepoint> should be an unsigned integer
 144 representing a Unicode code point.
 145
 146 Note: The XS edition and the pure Perl edition interpret a decimal
 147 C<$codepoint> differently: XS converts C<$codepoint> to an unsigned
 148 integer, but pure Perl does not.
 149 Do not use a floating-point number or a negative sign in C<$codepoint>.
 150
 151 =head2 Normalization Forms
 152
 153 =over 4
 154
 155 =item C<$NFD_string = NFD($string)>
 156
 157 returns the Normalization Form D (formed by canonical decomposition).
 158
 159 =item C<$NFC_string = NFC($string)>
 160
 161 returns the Normalization Form C (formed by canonical decomposition
 162 followed by canonical composition).
 163
 164 =item C<$NFKD_string = NFKD($string)>
 165
 166 returns the Normalization Form KD (formed by compatibility decomposition).
 167
 168 =item C<$NFKC_string = NFKC($string)>
 169
 170 returns the Normalization Form KC (formed by compatibility decomposition
 171 followed by B<canonical> composition).
 172
 173 =item C<$FCD_string = FCD($string)>
 174
 175 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
 176 returns it without modification; otherwise returns an FCD string.
 177
 178 Note: FCD is not always unique, so several different FCD forms may be
 179 equivalent to each other. C<FCD()> will return one of these equivalent forms.
 180
 181 =item C<$FCC_string = FCC($string)>
 182
 183 returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 184
 185 Note: FCC is unique, as are the four normalization forms (NF*).
 186
 187 =item C<$normalized_string = normalize($form_name, $string)>
 188
 189 As C<$form_name>, one of the following names must be given.
 190
 191   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 192   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 193   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 194   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 195
 196   'FCD'          for "Fast C or D" Form  (UTN #5)
 197   'FCC'          for "Fast C Contiguous" (UTN #5)
 198
 199 =back
 200
 201 =head2 Decomposition and Composition
 202
 203 =over 4
 204
 205 =item C<$decomposed_string = decompose($string)>
 206
 207 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
 208
 209 Decomposes the specified string and returns the result.
 210
 211 If the second parameter (a boolean) is omitted or false, decomposes it
 212 using the Canonical Decomposition Mapping.
 213 If true, decomposes it using the Compatibility Decomposition Mapping.
 214
 215 The string returned is not always in NFD/NFKD.
 216 Reordering may be required.
 217
 218     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 219     $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 220
 221 =item C<$reordered_string  = reorder($string)>
 222
 223 Reorders the combining characters and the like in the canonical ordering
 224 and returns the result.
 225
 226 E.g., when you have a list of NFD/NFKD strings,
 227 you can get the concatenated NFD/NFKD string from them, saying
 228
 229     $concat_NFD  = reorder(join '', @NFD_strings);
 230     $concat_NFKD = reorder(join '', @NFKD_strings);
 231
 232 =item C<$composed_string   = compose($string)>
 233
 234 Returns the string where composable pairs are composed.
 235
 236 E.g., when you have a NFD/NFKD string,
 237 you can get its NFC/NFKC string, saying
 238
 239     $NFC_string  = compose($NFD_string);
 240     $NFKC_string = compose($NFKD_string);
 241
 242 =back
 243
 244 =head2 Quick Check
 245
 246 (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
 247
 248 The following functions check whether the string is in that normalization form.
 249
 250 The result returned will be:
 251
 252     YES     The string is in that normalization form.
 253     NO      The string is not in that normalization form.
 254     MAYBE   Dubious. Maybe yes, maybe no.
 255
 256 =over 4
 257
 258 =item C<$result = checkNFD($string)>
 259
 260 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 261
 262 =item C<$result = checkNFC($string)>
 263
 264 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 265
 266 =item C<$result = checkNFKD($string)>
 267
 268 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 269
 270 =item C<$result = checkNFKC($string)>
 271
 272 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 273
 274 =item C<$result = checkFCD($string)>
 275
 276 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 277
 278 =item C<$result = checkFCC($string)>
 279
 280 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 281
 282 If a string is not in FCD, it cannot be in FCC either.
 283 So C<checkFCC($not_FCD_string)> should return C<NO>.
 284
 285 =item C<$result = check($form_name, $string)>
 286
 287 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 288
 289 C<$form_name> takes the same values as for C<normalize()>.
 290
 291 =back
 292
 293 B<Note>
 294
 295 In the cases of NFD, NFKD, and FCD, the answer must be
 296 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
 297 in the cases of NFC, NFKC, and FCC.
 298
 299 A C<MAYBE> string should contain at least one combining character
 300 or the like. For example, C<COMBINING ACUTE ACCENT> has
 301 the MAYBE_NFC/MAYBE_NFKC property.
 302
 303 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 304 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 305 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 306 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 307 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 308
 309 If you want to check exactly, compare the string with its NFC/NFKC/FCC;
 310 i.e.,
 311
 312     $string eq NFC($string)    # more thorough than checkNFC($string)
 313     $string eq NFKC($string)   # more thorough than checkNFKC($string)
 314     $string eq FCC($string)    # more thorough than checkFCC($string)
 315
 316 =head2 Character Data
 317
 318 These functions are an interface to the character data used internally.
 319 If you only want to get Unicode normalization forms, you don't need
 320 to call them yourself.
 321
 322 =over 4
 323
 324 =item C<$canonical_decomposed = getCanon($codepoint)>
 325
 326 If the character of the specified codepoint is canonically
 327 decomposable (including Hangul Syllables),
 328 returns the B<completely decomposed> string canonically equivalent to it.
 329
 330 If it is not decomposable, returns C<undef>.
 331
 332 =item C<$compatibility_decomposed = getCompat($codepoint)>
 333
 334 If the character of the specified codepoint is compatibility
 335 decomposable (including Hangul Syllables),
 336 returns the B<completely decomposed> string compatibility equivalent to it.
 337
 338 If it is not decomposable, returns C<undef>.
 339
 340 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
 341
 342 If two characters here and next (as codepoints) are composable
 343 (including Hangul Jamo/Syllables and Composition Exclusions),
 344 returns the codepoint of the composite.
 345
 346 If they are not composable, returns C<undef>.
 347
 348 =item C<$combining_class = getCombinClass($codepoint)>
 349
 350 Returns the combining class of the character as an integer.
 351
 352 =item C<$is_exclusion = isExclusion($codepoint)>
 353
 354 Returns a boolean whether the character of the specified codepoint
 355 is a composition exclusion.
 356
 357 =item C<$is_singleton = isSingleton($codepoint)>
 358
 359 Returns a boolean whether the character of the specified codepoint is
 360 a singleton.
 361
 362 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
 363
 364 Returns a boolean whether the canonical decomposition
 365 of the character of the specified codepoint
 366 is a Non-Starter Decomposition.
 367
 368 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
 369
 370 Returns a boolean whether the character of the specified codepoint
 371 may be composed with the previous one in a certain composition
 372 (including Hangul Compositions, but excluding
 373 Composition Exclusions and Non-Starter Decompositions).
 374
 375 =back
 376
 377 =head2 EXPORT
 378
 379 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 380
 381 C<normalize> and some other functions: on request.
 382
 383 =head1 AUTHOR
 384
 385 SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
 386
 387   http://homepage1.nifty.com/nomenclator/perl/
 388
 389   Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
 390
 391   This module is free software; you can redistribute it
 392   and/or modify it under the same terms as Perl itself.
 393
 394 =head1 SEE ALSO
 395
 396 =over 4
 397
 398 =item http://www.unicode.org/reports/tr15/
 399
 400 Unicode Normalization Forms - UAX #15
 401
 402 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 403
 404 Derived Normalization Properties
 405
 406 =item http://www.unicode.org/notes/tn5/
 407
 408 Canonical Equivalence in Applications - UTN #5
 409
 410 =back
 411
 412 =cut
 413