ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Normalize cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 our $VERSION = '0.23';
  15 our $PACKAGE = __PACKAGE__;
  16
  17 require Exporter;
  18 require DynaLoader;
  19 require AutoLoader;
  20
  21 our @ISA = qw(Exporter DynaLoader);
  22 our @EXPORT = qw( NFC NFD NFKC NFKD );
  23 our @EXPORT_OK = qw(
  24     normalize decompose reorder compose
  25     checkNFD checkNFKD checkNFC checkNFKC check
  26     getCanon getCompat getComposite getCombinClass
  27     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  28     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  29 );
  30 our %EXPORT_TAGS = (
  31     all       => [ @EXPORT, @EXPORT_OK ],
  32     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  33     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  34 );
  35
  36 bootstrap Unicode::Normalize $VERSION;
  37
  38 sub pack_U {
  39     return pack('U*', @_);
  40 }
  41
  42 sub unpack_U {
  43     return unpack('U*', pack('U*').shift);
  44 }
  45
  46 use constant COMPAT => 1;
  47
  48 sub NFD  ($) { reorder(decompose($_[0])) }
  49 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
  50 sub NFC  ($) { compose(reorder(decompose($_[0]))) }
  51 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
  52
  53 sub normalize($$)
  54 {
  55     my $form = shift;
  56     my $str = shift;
  57     $form =~ s/^NF//;
  58     return
  59         $form eq 'D'  ? NFD ($str) :
  60         $form eq 'C'  ? NFC ($str) :
  61         $form eq 'KD' ? NFKD($str) :
  62         $form eq 'KC' ? NFKC($str) :
  63       croak $PACKAGE."::normalize: invalid form name: $form";
  64 }
  65
  66 sub check($$)
  67 {
  68     my $form = shift;
  69     my $str = shift;
  70     $form =~ s/^NF//;
  71     return
  72         $form eq 'D'  ? checkNFD ($str) :
  73         $form eq 'C'  ? checkNFC ($str) :
  74         $form eq 'KD' ? checkNFKD($str) :
  75         $form eq 'KC' ? checkNFKC($str) :
  76       croak $PACKAGE."::check: invalid form name: $form";
  77 }
  78
  79 1;
  80 __END__
  81
  82 =head1 NAME
  83
  84 Unicode::Normalize - Unicode Normalization Forms
  85
  86 =head1 SYNOPSIS
  87
  88   use Unicode::Normalize;
  89
  90   $NFD_string  = NFD($string);  # Normalization Form D
  91   $NFC_string  = NFC($string);  # Normalization Form C
  92   $NFKD_string = NFKD($string); # Normalization Form KD
  93   $NFKC_string = NFKC($string); # Normalization Form KC
  94
  95    or
  96
  97   use Unicode::Normalize 'normalize';
  98
  99   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 100   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 101   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 102   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 103
 104 =head1 DESCRIPTION
 105
 106 Parameters:
 107
 108 C<$string> is used as a string under character semantics
 109 (see F<perlunicode>).
 110
 111 C<$codepoint> should be an unsigned integer
 112 representing a Unicode code point.
 113
 114 Note: The XS edition and the pure Perl edition are incompatible in how
 115 they interpret a C<$codepoint> given as a decimal number.
 116 XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
 117 Do not use a floating-point value or a negative sign in C<$codepoint>.
 118
 119 =head2 Normalization Forms
 120
 121 =over 4
 122
 123 =item C<$NFD_string = NFD($string)>
 124
 125 returns the Normalization Form D (formed by canonical decomposition).
 126
 127 =item C<$NFC_string = NFC($string)>
 128
 129 returns the Normalization Form C (formed by canonical decomposition
 130 followed by canonical composition).
 131
 132 =item C<$NFKD_string = NFKD($string)>
 133
 134 returns the Normalization Form KD (formed by compatibility decomposition).
 135
 136 =item C<$NFKC_string = NFKC($string)>
 137
 138 returns the Normalization Form KC (formed by compatibility decomposition
 139 followed by B<canonical> composition).
 140
 141 =item C<$normalized_string = normalize($form_name, $string)>
 142
 143 As C<$form_name>, one of the following names must be given.
 144
 145   'C'  or 'NFC'  for Normalization Form C
 146   'D'  or 'NFD'  for Normalization Form D
 147   'KC' or 'NFKC' for Normalization Form KC
 148   'KD' or 'NFKD' for Normalization Form KD
 149
 150 =back
 151
 152 =head2 Decomposition and Composition
 153
 154 =over 4
 155
 156 =item C<$decomposed_string = decompose($string)>
 157
 158 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
 159
 160 Decomposes the specified string and returns the result.
 161
 162 If the second parameter (a boolean) is omitted or false, decomposes it
 163 using the Canonical Decomposition Mapping.
 164 If true, decomposes it using the Compatibility Decomposition Mapping.
 165
 166 The string returned is not always in NFD/NFKD.
 167 Reordering may be required.
 168
 169     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 170     $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 171
 172 =item C<$reordered_string  = reorder($string)>
 173
 174 Reorders the combining characters and the like in the canonical ordering
 175 and returns the result.
 176
 177 E.g., when you have a list of NFD/NFKD strings,
 178 you can get the concatenated NFD/NFKD string from them, saying
 179
 180     $concat_NFD  = reorder(join '', @NFD_strings);
 181     $concat_NFKD = reorder(join '', @NFKD_strings);
 182
 183 =item C<$composed_string   = compose($string)>
 184
 185 Returns the string where composable pairs are composed.
 186
 187 E.g., when you have a NFD/NFKD string,
 188 you can get its NFC/NFKC string, saying
 189
 190     $NFC_string  = compose($NFD_string);
 191     $NFKC_string = compose($NFKD_string);
 192
 193 =back
 194
 195 =head2 Quick Check
 196
 197 (see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
 198
 199 The following functions check whether the string is in that normalization form.
 200
 201 The result returned will be:
 202
 203     YES     The string is in that normalization form.
 204     NO      The string is not in that normalization form.
 205     MAYBE   Dubious. Maybe yes, maybe no.
 206
 207 =over 4
 208
 209 =item C<$result = checkNFD($string)>
 210
 211 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 212
 213 =item C<$result = checkNFC($string)>
 214
 215 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 216
 217 =item C<$result = checkNFKD($string)>
 218
 219 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 220
 221 =item C<$result = checkNFKC($string)>
 222
 223 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 224
 225 =item C<$result = check($form_name, $string)>
 226
 227 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 228
 229 C<$form_name> takes the same values as it does for C<normalize()>.
 230
 231 =back
 232
 233 B<Note>
 234
 235 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
 236 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
 237
 238 A MAYBE-NFC/NFKC string should contain at least
 239 one combining character or the like.
 240 For example, C<COMBINING ACUTE ACCENT> has
 241 the MAYBE_NFC/MAYBE_NFKC property.
 242 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 243 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 244 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 245 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 246 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 247
 248 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
 249
 250     $string eq NFC($string)    # more thorough than checkNFC($string)
 251     $string eq NFKC($string)   # more thorough than checkNFKC($string)
 252
 253 =head2 Character Data
 254
 255 These functions are an interface to the character data used internally.
 256 If you only want to get Unicode normalization forms, you do not need to
 257 call them yourself.
 258
 259 =over 4
 260
 261 =item C<$canonical_decomposed = getCanon($codepoint)>
 262
 263 If the character of the specified codepoint is canonically
 264 decomposable (including Hangul Syllables),
 265 returns the B<completely decomposed> string canonically equivalent to it.
 266
 267 If it is not decomposable, returns C<undef>.
 268
 269 =item C<$compatibility_decomposed = getCompat($codepoint)>
 270
 271 If the character of the specified codepoint is compatibility
 272 decomposable (including Hangul Syllables),
 273 returns the B<completely decomposed> string compatibility equivalent to it.
 274
 275 If it is not decomposable, returns C<undef>.
 276
 277 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
 278
 279 If two characters here and next (as codepoints) are composable
 280 (including Hangul Jamo/Syllables and Composition Exclusions),
 281 returns the codepoint of the composite.
 282
 283 If they are not composable, returns C<undef>.
 284
 285 =item C<$combining_class = getCombinClass($codepoint)>
 286
 287 Returns the combining class of the character as an integer.
 288
 289 =item C<$is_exclusion = isExclusion($codepoint)>
 290
 291 Returns a boolean whether the character of the specified codepoint
 292 is a composition exclusion.
 293
 294 =item C<$is_singleton = isSingleton($codepoint)>
 295
 296 Returns a boolean whether the character of the specified codepoint is
 297 a singleton.
 298
 299 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
 300
 301 Returns a boolean whether the canonical decomposition
 302 of the character of the specified codepoint
 303 is a Non-Starter Decomposition.
 304
 305 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
 306
 307 Returns a boolean whether the character of the specified codepoint
 308 may be composed with the previous one in a certain composition
 309 (including Hangul Compositions, but excluding
 310 Composition Exclusions and Non-Starter Decompositions).
 311
 312 =back
 313
 314 =head2 EXPORT
 315
 316 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 317
 318 C<normalize> and some other functions: on request.
 319
 320 =head1 AUTHOR
 321
 322 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 323
 324   http://homepage1.nifty.com/nomenclator/perl/
 325
 326   Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
 327
 328   This module is free software; you can redistribute it
 329   and/or modify it under the same terms as Perl itself.
 330
 331 =head1 SEE ALSO
 332
 333 =over 4
 334
 335 =item http://www.unicode.org/unicode/reports/tr15/
 336
 337 Unicode Normalization Forms - UAX #15
 338
 339 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 340
 341 Derived Normalization Properties
 342
 343 =back
 344
 345 =cut
 346