ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   # Compile-time sanity check: refuse to load unless pack('U', ...) turns the
   # code point for "A" (given either as 0x41 or as ord("A")) into the string "A".
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41) || "A" eq pack('U', ord("A"))) {
             # The trailing "\n" keeps Perl from appending file/line details.
   5         die "Unicode::Normalize cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  # Module version; also handed to bootstrap at the end of this section.
  14 our $VERSION = '0.21';
  # Package name, interpolated into the croak messages of normalize() and check().
  15 our $PACKAGE = __PACKAGE__;
  16
  17 require Exporter;
  18 require DynaLoader;
  # NOTE(review): AutoLoader is loaded here but is not listed in @ISA and is not
  # referenced by any other visible code -- confirm it is still needed.
  19 require AutoLoader;
  20
  # Inherit import() from Exporter and bootstrap() from DynaLoader.
  21 our @ISA = qw(Exporter DynaLoader);
  # Exported by default: the four normalization-form functions.
  22 our @EXPORT = qw( NFC NFD NFKC NFKD );
  # Exported only on request.
  23 our @EXPORT_OK = qw(
  24     normalize decompose reorder compose
  25     checkNFD checkNFKD checkNFC checkNFKC check
  26     getCanon getCompat getComposite getCombinClass
  27     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  28     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  29 );
  # Export tags:
  #   :all        everything in @EXPORT and @EXPORT_OK
  #   :normalize  the default exports plus normalize/decompose/reorder/compose
  #   :check      the quick-check functions
  30 our %EXPORT_TAGS = (
  31     all       => [ @EXPORT, @EXPORT_OK ],
  32     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  33     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  34 );
  35
  # Calls the bootstrap method (inherited from DynaLoader via @ISA) with $VERSION.
  # Subs that are used or exported here but have no Perl definition in this file
  # (decompose, reorder, compose, checkNFD, ...) are presumably supplied by the
  # extension loaded at this point -- confirm against the XS source.
  36 bootstrap Unicode::Normalize $VERSION;
  37
  # pack_U(@code_points): returns the string produced by pack('U*', ...) for
  # the argument list, i.e. one character per code point passed in.
  38 sub pack_U {
  39     return pack('U*', @_);
  40 }
  41
  # unpack_U($string): returns the result of unpack('U*', ...) applied to the
  # first argument (in list context, the code points of its characters).
  # Only the first argument is used; any further arguments are ignored.
  42 sub unpack_U {
         # pack('U*') with no values yields an empty string, which is prepended
         # to the argument before unpacking -- presumably so that the string is
         # handled as Unicode characters (TODO confirm against perlfunc pack).
  43     return unpack('U*', pack('U*').shift);
  44 }
  45
  # True value passed as the second argument of decompose() to request the
  # compatibility decomposition mapping instead of the canonical one.
  46 use constant COMPAT => 1;
  47
  # The four normalization forms. Each takes exactly one scalar (prototype "$")
  # and returns the value of its final expression:
  #   NFD  : decompose (canonical), then reorder
  #   NFKD : decompose (compatibility), then reorder
  #   NFC  : the NFD steps followed by compose
  #   NFKC : the NFKD steps followed by compose
  48 sub NFD  ($) { reorder(decompose($_[0])) }
  49 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
  50 sub NFC  ($) { compose(reorder(decompose($_[0]))) }
  51 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
  52
  # normalize($form, $str): returns $str normalized to the named form.
  #
  # $form is one of 'C', 'D', 'KC', 'KD', optionally prefixed with "NF"
  # ('NFC', 'NFD', 'NFKC', 'NFKD'). The work is delegated to NFC(), NFD(),
  # NFKC() or NFKD(). Any other name makes the sub croak.
  53 sub normalize($$)
  54 {
  55     my $form = shift;
  56     my $str = shift;
         # Drop a leading "NF" so that e.g. 'NFC' and 'C' are treated alike.
  57     $form =~ s/^NF//;
  58     return
  59         $form eq 'D'  ? NFD ($str) :
  60         $form eq 'C'  ? NFC ($str) :
  61         $form eq 'KD' ? NFKD($str) :
  62         $form eq 'KC' ? NFKC($str) :
           # No match: report the error from the caller's perspective. The name
           # shown is $form after the "NF" prefix has been removed.
  63       croak $PACKAGE."::normalize: invalid form name: $form";
  64 }
  65
  # check($form, $str): runs the quick check for the named normalization form
  # on $str and returns that check's result.
  #
  # $form is one of 'C', 'D', 'KC', 'KD', optionally prefixed with "NF".
  # The work is delegated to checkNFC(), checkNFD(), checkNFKC() or
  # checkNFKD(); none of these is defined in Perl in this file. Any other
  # name makes the sub croak.
  66 sub check($$)
  67 {
  68     my $form = shift;
  69     my $str = shift;
         # Drop a leading "NF" so that e.g. 'NFC' and 'C' are treated alike.
  70     $form =~ s/^NF//;
  71     return
  72         $form eq 'D'  ? checkNFD ($str) :
  73         $form eq 'C'  ? checkNFC ($str) :
  74         $form eq 'KD' ? checkNFKD($str) :
  75         $form eq 'KC' ? checkNFKC($str) :
           # No match: report the error from the caller's perspective. The name
           # shown is $form after the "NF" prefix has been removed.
  76       croak $PACKAGE."::check: invalid form name: $form";
  77 }
  78
  79 1;
  80 __END__
  81
  82 =head1 NAME
  83
  84 Unicode::Normalize - Unicode Normalization Forms
  85
  86 =head1 SYNOPSIS
  87
  88   use Unicode::Normalize;
  89
  90   $NFD_string  = NFD($string);  # Normalization Form D
  91   $NFC_string  = NFC($string);  # Normalization Form C
  92   $NFKD_string = NFKD($string); # Normalization Form KD
  93   $NFKC_string = NFKC($string); # Normalization Form KC
  94
  95    or
  96
  97   use Unicode::Normalize 'normalize';
  98
  99   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 100   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 101   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 102   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 103
 104 =head1 DESCRIPTION
 105
 106 =head2 Normalization Forms
 107
 108 =over 4
 109
 110 =item C<$NFD_string = NFD($string)>
 111
 112 returns the Normalization Form D (formed by canonical decomposition).
 113
 114 =item C<$NFC_string = NFC($string)>
 115
 116 returns the Normalization Form C (formed by canonical decomposition
 117 followed by canonical composition).
 118
 119 =item C<$NFKD_string = NFKD($string)>
 120
 121 returns the Normalization Form KD (formed by compatibility decomposition).
 122
 123 =item C<$NFKC_string = NFKC($string)>
 124
 125 returns the Normalization Form KC (formed by compatibility decomposition
 126 followed by B<canonical> composition).
 127
 128 =item C<$normalized_string = normalize($form_name, $string)>
 129
 130 As C<$form_name>, one of the following names must be given.
 131
 132   'C'  or 'NFC'  for Normalization Form C
 133   'D'  or 'NFD'  for Normalization Form D
 134   'KC' or 'NFKC' for Normalization Form KC
 135   'KD' or 'NFKD' for Normalization Form KD
 136
 137 =back
 138
 139 =head2 Decomposition and Composition
 140
 141 =over 4
 142
 143 =item C<$decomposed_string = decompose($string)>
 144
 145 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
 146
 147 Decomposes the specified string and returns the result.
 148
 149 If the second parameter (a boolean) is omitted or false, decomposes it
 150 using the Canonical Decomposition Mapping.
 151 If true, decomposes it using the Compatibility Decomposition Mapping.
 152
 153 The string returned is not always in NFD/NFKD.
 154 Reordering may be required.
 155
 156     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 157     $NFKD_string = reorder(decompose($string, 1));    # eq. to NFKD()
 158
 159 =item C<$reordered_string  = reorder($string)>
 160
 161 Reorders the combining characters and the like in the canonical ordering
 162 and returns the result.
 163
 164 E.g., when you have a list of NFD/NFKD strings,
 165 you can get the concatenated NFD/NFKD string from them, saying
 166
 167     $concat_NFD  = reorder(join '', @NFD_strings);
 168     $concat_NFKD = reorder(join '', @NFKD_strings);
 169
 170 =item C<$composed_string   = compose($string)>
 171
 172 Returns the string where composable pairs are composed.
 173
 174 E.g., when you have a NFD/NFKD string,
 175 you can get its NFC/NFKC string, saying
 176
 177     $NFC_string  = compose($NFD_string);
 178     $NFKC_string = compose($NFKD_string);
 179
 180 =back
 181
 182 =head2 Quick Check
 183
 184 (see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
 185
 186 The following functions check whether the string is in that normalization form.
 187
 188 The result returned will be:
 189
 190     YES     The string is in that normalization form.
 191     NO      The string is not in that normalization form.
 192     MAYBE   Dubious. Maybe yes, maybe no.
 193
 194 =over 4
 195
 196 =item C<$result = checkNFD($string)>
 197
 198 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 199
 200 =item C<$result = checkNFC($string)>
 201
 202 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 203
 204 =item C<$result = checkNFKD($string)>
 205
 206 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 207
 208 =item C<$result = checkNFKC($string)>
 209
 210 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 211
 212 =item C<$result = check($form_name, $string)>
 213
 214 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 215
 216 C<$form_name> takes the same values as for C<normalize()>.
 217
 218 =back
 219
 220 B<Note>
 221
 222 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
 223 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
 224
 225 A MAYBE-NFC/NFKC string should contain at least
 226 one combining character or the like.
 227 For example, C<COMBINING ACUTE ACCENT> has
 228 the MAYBE_NFC/MAYBE_NFKC property.
 229 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 230 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 231 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 232 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 233 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 234
 235 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
 236
 237     $string eq NFC($string)    # more thorough than checkNFC($string)
 238     $string eq NFKC($string)   # more thorough than checkNFKC($string)
 239
 240 =head2 Character Data
 241
 242 These functions are an interface to the character data used internally.
 243 If you only want to get Unicode normalization forms, you don't need to
 244 call them yourself.
 245
 246 =over 4
 247
 248 =item C<$canonical_decomposed = getCanon($codepoint)>
 249
 250 If the character of the specified codepoint is canonically
 251 decomposable (including Hangul Syllables),
 252 returns the B<completely decomposed> string canonically equivalent to it.
 253
 254 If it is not decomposable, returns C<undef>.
 255
 256 =item C<$compatibility_decomposed = getCompat($codepoint)>
 257
 258 If the character of the specified codepoint is compatibility
 259 decomposable (including Hangul Syllables),
 260 returns the B<completely decomposed> string compatibility equivalent to it.
 261
 262 If it is not decomposable, returns C<undef>.
 263
 264 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
 265
 266 If two characters here and next (as codepoints) are composable
 267 (including Hangul Jamo/Syllables and Composition Exclusions),
 268 returns the codepoint of the composite.
 269
 270 If they are not composable, returns C<undef>.
 271
 272 =item C<$combining_class = getCombinClass($codepoint)>
 273
 274 Returns the combining class of the character as an integer.
 275
 276 =item C<$is_exclusion = isExclusion($codepoint)>
 277
 278 Returns a boolean whether the character of the specified codepoint
 279 is a composition exclusion.
 280
 281 =item C<$is_singleton = isSingleton($codepoint)>
 282
 283 Returns a boolean whether the character of the specified codepoint is
 284 a singleton.
 285
 286 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
 287
 288 Returns a boolean whether the canonical decomposition
 289 of the character of the specified codepoint
 290 is a Non-Starter Decomposition.
 291
 292 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
 293
 294 Returns a boolean whether the character of the specified codepoint
 295 may be composed with the previous one in a certain composition
 296 (including Hangul Compositions, but excluding
 297 Composition Exclusions and Non-Starter Decompositions).
 298
 299 =back
 300
 301 =head2 EXPORT
 302
 303 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 304
 305 C<normalize> and some other functions: on request.
 306
 307 =head1 AUTHOR
 308
 309 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 310
 311   http://homepage1.nifty.com/nomenclator/perl/
 312
 313   Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
 314
 315   This module is free software; you can redistribute it
 316   and/or modify it under the same terms as Perl itself.
 317
 318 =head1 SEE ALSO
 319
 320 =over 4
 321
 322 =item http://www.unicode.org/unicode/reports/tr15/
 323
 324 Unicode Normalization Forms - UAX #15
 325
 326 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 327
 328 Derived Normalization Properties
 329
 330 =back
 331
 332 =cut
 333