ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41) || "A" eq pack('U', ord("A"))) {
   5         die "Unicode::Normalize cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 our $VERSION = '0.21';
  15 our $PACKAGE = __PACKAGE__;
  16
  17 require Exporter;
  18 require DynaLoader;
  19 require AutoLoader;
  20
  21 our @ISA = qw(Exporter DynaLoader);
  22 our @EXPORT = qw( NFC NFD NFKC NFKD );
  23 our @EXPORT_OK = qw(
  24     normalize decompose reorder compose
  25     checkNFD checkNFKD checkNFC checkNFKC check
  26     getCanon getCompat getComposite getCombinClass
  27     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  28     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  29 );
  30 our %EXPORT_TAGS = (
  31     all       => [ @EXPORT, @EXPORT_OK ],
  32     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  33     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  34 );
  35
  36 bootstrap Unicode::Normalize $VERSION;
  37
  38 use constant UNICODE_FOR_PACK => "A" eq pack('U', 0x41);
  39 use constant NATIVE_FOR_PACK  => "A" eq pack('U', ord("A"));
  40
  41 use constant UNICODE_FOR_UNPACK => 0x41 == unpack('U', "A");
  42 use constant NATIVE_FOR_UNPACK  => ord("A") == unpack('U', "A");
  43
  44 sub pack_U {
  45     return UNICODE_FOR_PACK
  46         ? pack('U*', @_)
  47         : NATIVE_FOR_PACK
  48             ? pack('U*', map utf8::unicode_to_native($_), @_)
  49             : die "$PACKAGE, a Unicode code point cannot be stringified.\n";
  50 }
  51
  52 sub unpack_U {
  53     return UNICODE_FOR_UNPACK
  54         ? unpack('U*', shift)
  55         : NATIVE_FOR_UNPACK
  56             ? map(utf8::native_to_unicode($_), unpack 'U*', shift)
  57             : die "$PACKAGE, a code point returned from unpack U " .
  58                 "cannot be converted into Unicode.\n";
  59 }
  60
  61 use constant COMPAT => 1;
  62
  63 sub NFD  ($) { reorder(decompose($_[0])) }
  64 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
  65 sub NFC  ($) { compose(reorder(decompose($_[0]))) }
  66 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
  67
  68 sub normalize($$)
  69 {
  70     my $form = shift;
  71     my $str = shift;
  72     $form =~ s/^NF//;
  73     return
  74         $form eq 'D'  ? NFD ($str) :
  75         $form eq 'C'  ? NFC ($str) :
  76         $form eq 'KD' ? NFKD($str) :
  77         $form eq 'KC' ? NFKC($str) :
  78       croak $PACKAGE."::normalize: invalid form name: $form";
  79 }
  80
  81 sub check($$)
  82 {
  83     my $form = shift;
  84     my $str = shift;
  85     $form =~ s/^NF//;
  86     return
  87         $form eq 'D'  ? checkNFD ($str) :
  88         $form eq 'C'  ? checkNFC ($str) :
  89         $form eq 'KD' ? checkNFKD($str) :
  90         $form eq 'KC' ? checkNFKC($str) :
  91       croak $PACKAGE."::check: invalid form name: $form";
  92 }
  93
  94 1;
  95 __END__
  96
  97 =head1 NAME
  98
  99 Unicode::Normalize - Unicode Normalization Forms
 100
 101 =head1 SYNOPSIS
 102
 103   use Unicode::Normalize;
 104
 105   $NFD_string  = NFD($string);  # Normalization Form D
 106   $NFC_string  = NFC($string);  # Normalization Form C
 107   $NFKD_string = NFKD($string); # Normalization Form KD
 108   $NFKC_string = NFKC($string); # Normalization Form KC
 109
 110    or
 111
 112   use Unicode::Normalize 'normalize';
 113
 114   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 115   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 116   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 117   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 118
 119 =head1 DESCRIPTION
 120
 121 =head2 Normalization Forms
 122
 123 =over 4
 124
 125 =item C<$NFD_string = NFD($string)>
 126
 127 returns the Normalization Form D (formed by canonical decomposition).
 128
 129 =item C<$NFC_string = NFC($string)>
 130
 131 returns the Normalization Form C (formed by canonical decomposition
 132 followed by canonical composition).
 133
 134 =item C<$NFKD_string = NFKD($string)>
 135
 136 returns the Normalization Form KD (formed by compatibility decomposition).
 137
 138 =item C<$NFKC_string = NFKC($string)>
 139
 140 returns the Normalization Form KC (formed by compatibility decomposition
 141 followed by B<canonical> composition).
 142
 143 =item C<$normalized_string = normalize($form_name, $string)>
 144
 145 As C<$form_name>, one of the following names must be given.
 146
 147   'C'  or 'NFC'  for Normalization Form C
 148   'D'  or 'NFD'  for Normalization Form D
 149   'KC' or 'NFKC' for Normalization Form KC
 150   'KD' or 'NFKD' for Normalization Form KD
 151
 152 =back
 153
 154 =head2 Decomposition and Composition
 155
 156 =over 4
 157
 158 =item C<$decomposed_string = decompose($string)>
 159
 160 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
 161
 162 Decomposes the specified string and returns the result.
 163
 164 If the second parameter (a boolean) is omitted or false, decomposes it
 165 using the Canonical Decomposition Mapping.
 166 If true, decomposes it using the Compatibility Decomposition Mapping.
 167
 168 The string returned is not always in NFD/NFKD.
 169 Reordering may be required.
 170
 171     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 172     $NFKD_string = reorder(decompose($string, 1));    # eq. to NFKD()
 173
 174 =item C<$reordered_string  = reorder($string)>
 175
 176 Reorders the combining characters and the like in the canonical ordering
 177 and returns the result.
 178
 179 E.g., when you have a list of NFD/NFKD strings,
 180 you can get the concatenated NFD/NFKD string from them, saying
 181
 182     $concat_NFD  = reorder(join '', @NFD_strings);
 183     $concat_NFKD = reorder(join '', @NFKD_strings);
 184
 185 =item C<$composed_string   = compose($string)>
 186
 187 Returns the string where composable pairs are composed.
 188
 189 E.g., when you have a NFD/NFKD string,
 190 you can get its NFC/NFKC string, saying
 191
 192     $NFC_string  = compose($NFD_string);
 193     $NFKC_string = compose($NFKD_string);
 194
 195 =back
 196
 197 =head2 Quick Check
 198
 199 (see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
 200
 201 The following functions check whether the string is in that normalization form.
 202
 203 The result returned will be:
 204
 205     YES     The string is in that normalization form.
 206     NO      The string is not in that normalization form.
 207     MAYBE   Dubious. Maybe yes, maybe no.
 208
 209 =over 4
 210
 211 =item C<$result = checkNFD($string)>
 212
 213 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 214
 215 =item C<$result = checkNFC($string)>
 216
 217 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 218
 219 =item C<$result = checkNFKD($string)>
 220
 221 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 222
 223 =item C<$result = checkNFKC($string)>
 224
 225 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 226
 227 =item C<$result = check($form_name, $string)>
 228
 229 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 230
 231 C<$form_name> takes the same names as for C<normalize()>.
 232
 233 =back
 234
 235 B<Note>
 236
 237 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
 238 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
 239
 240 A MAYBE-NFC/NFKC string should contain at least
 241 one combining character or the like.
 242 For example, C<COMBINING ACUTE ACCENT> has
 243 the MAYBE_NFC/MAYBE_NFKC property.
 244 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 245 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 246 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 247 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 248 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 249
 250 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
 251
 252     $string eq NFC($string)    # more thorough than checkNFC($string)
 253     $string eq NFKC($string)   # more thorough than checkNFKC($string)
 254
 255 =head2 Character Data
 256
 257 These functions are interfaces to the character data used internally.
 258 If you only want to get Unicode normalization forms, you don't need to
 259 call them yourself.
 260
 261 =over 4
 262
 263 =item C<$canonical_decomposed = getCanon($codepoint)>
 264
 265 If the character of the specified codepoint is canonically
 266 decomposable (including Hangul Syllables),
 267 returns the B<completely decomposed> string canonically equivalent to it.
 268
 269 If it is not decomposable, returns C<undef>.
 270
 271 =item C<$compatibility_decomposed = getCompat($codepoint)>
 272
 273 If the character of the specified codepoint is compatibility
 274 decomposable (including Hangul Syllables),
 275 returns the B<completely decomposed> string compatibility equivalent to it.
 276
 277 If it is not decomposable, returns C<undef>.
 278
 279 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
 280
 281 If two characters here and next (as codepoints) are composable
 282 (including Hangul Jamo/Syllables and Composition Exclusions),
 283 returns the codepoint of the composite.
 284
 285 If they are not composable, returns C<undef>.
 286
 287 =item C<$combining_class = getCombinClass($codepoint)>
 288
 289 Returns the combining class of the character as an integer.
 290
 291 =item C<$is_exclusion = isExclusion($codepoint)>
 292
 293 Returns a boolean whether the character of the specified codepoint
 294 is a composition exclusion.
 295
 296 =item C<$is_singleton = isSingleton($codepoint)>
 297
 298 Returns a boolean whether the character of the specified codepoint is
 299 a singleton.
 300
 301 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
 302
 303 Returns a boolean whether the canonical decomposition
 304 of the character of the specified codepoint
 305 is a Non-Starter Decomposition.
 306
 307 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
 308
 309 Returns a boolean whether the character of the specified codepoint
 310 may be composed with the previous one in a certain composition
 311 (including Hangul Compositions, but excluding
 312 Composition Exclusions and Non-Starter Decompositions).
 313
 314 =back
 315
 316 =head2 EXPORT
 317
 318 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 319
 320 C<normalize> and some other functions: on request.
 321
 322 =head1 AUTHOR
 323
 324 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 325
 326   http://homepage1.nifty.com/nomenclator/perl/
 327
 328   Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
 329
 330   This module is free software; you can redistribute it
 331   and/or modify it under the same terms as Perl itself.
 332
 333 =head1 SEE ALSO
 334
 335 =over 4
 336
 337 =item http://www.unicode.org/unicode/reports/tr15/
 338
 339 Unicode Normalization Forms - UAX #15
 340
 341 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 342
 343 Derived Normalization Properties
 344
 345 =back
 346
 347 =cut
 348