ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {
   4     # ord("A") == 193 is taken as the sign of an EBCDIC platform: refuse to load.
   5     die "Unicode::Normalize not ported to EBCDIC\n"
   6         if ord("A") == 193;
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 # $PACKAGE is interpolated into the croak() messages of normalize() and check().
  14 our $VERSION = '0.16';
  15 our $PACKAGE = __PACKAGE__;
  16 # Support modules are loaded at run time with require rather than use.
  17 require Exporter;
  18 require DynaLoader;
  19 require AutoLoader;     # NOTE(review): loaded, but not listed in @ISA below -- confirm it is needed
  20 # The four normalization forms are exported by default; the rest only on request.
  21 our @ISA = qw(Exporter DynaLoader);
  22 our @EXPORT = qw( NFC NFD NFKC NFKD );
  23 our @EXPORT_OK = qw(
  24     normalize decompose reorder compose
  25     checkNFD checkNFKD checkNFC checkNFKC check
  26     getCanon getCompat getComposite getCombinClass
  27     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  28     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  29 );
  30 our %EXPORT_TAGS = (
  31     all       => [ @EXPORT, @EXPORT_OK ],
  32     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  33     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  34 );
  35 # Load the compiled part of the module through DynaLoader, passing $VERSION.
  36 bootstrap Unicode::Normalize $VERSION;
  37 # Second argument for decompose(): a true value selects the compatibility mapping.
  38 use constant COMPAT => 1;
  39
  40 sub NFD  ($) { reorder(decompose($_[0])) }                  # canonical decomposition, then canonical reordering
  41 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }          # compatibility decomposition, then canonical reordering
  42 sub NFC  ($) { compose(reorder(decompose($_[0]))) }         # same steps as NFD, followed by composition
  43 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } # same steps as NFKD, followed by composition
  44
  45 sub normalize($$)   # normalize(FORM, STRING): returns STRING in normalization form FORM
  46 {                   # FORM is 'C', 'D', 'KC' or 'KD', optionally prefixed with 'NF'
  47     my $form = shift;
  48     (my $key = $form) =~ s/^NF//;   # match on a copy so $form stays as the caller wrote it
  49     return
  50         $key eq 'D'  ? NFD ($_[0]) :
  51         $key eq 'C'  ? NFC ($_[0]) :
  52         $key eq 'KD' ? NFKD($_[0]) :
  53         $key eq 'KC' ? NFKC($_[0]) :
  54       croak $PACKAGE."::normalize: invalid form name: $form";   # croaks with the name as given
  55 }
  56
  57 sub check($$)       # check(FORM, STRING): quick check of STRING against normalization form FORM
  58 {                   # FORM is 'C', 'D', 'KC' or 'KD', optionally prefixed with 'NF'
  59     my $form = shift;
  60     (my $key = $form) =~ s/^NF//;   # match on a copy so $form stays as the caller wrote it
  61     return
  62         $key eq 'D'  ? checkNFD ($_[0]) :
  63         $key eq 'C'  ? checkNFC ($_[0]) :
  64         $key eq 'KD' ? checkNFKD($_[0]) :
  65         $key eq 'KC' ? checkNFKC($_[0]) :
  66       croak $PACKAGE."::check: invalid form name: $form";   # croaks with the name as given
  67 }
  68
  69 1;
  70 __END__
  71
  72 =head1 NAME
  73
  74 Unicode::Normalize - normalized forms of Unicode text
  75
  76 =head1 SYNOPSIS
  77
  78   use Unicode::Normalize;
  79
  80   $NFD_string  = NFD($string);  # Normalization Form D
  81   $NFC_string  = NFC($string);  # Normalization Form C
  82   $NFKD_string = NFKD($string); # Normalization Form KD
  83   $NFKC_string = NFKC($string); # Normalization Form KC
  84
  85    or
  86
  87   use Unicode::Normalize 'normalize';
  88
  89   $NFD_string  = normalize('D',  $string);  # Normalization Form D
  90   $NFC_string  = normalize('C',  $string);  # Normalization Form C
  91   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  92   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
  93
  94 =head1 DESCRIPTION
  95
  96 =head2 Normalization Forms
  97
  98 =over 4
  99
 100 =item C<$NFD_string = NFD($string)>
 101
 102 returns the Normalization Form D (formed by canonical decomposition).
 103
 104 =item C<$NFC_string = NFC($string)>
 105
 106 returns the Normalization Form C (formed by canonical decomposition
 107 followed by canonical composition).
 108
 109 =item C<$NFKD_string = NFKD($string)>
 110
 111 returns the Normalization Form KD (formed by compatibility decomposition).
 112
 113 =item C<$NFKC_string = NFKC($string)>
 114
 115 returns the Normalization Form KC (formed by compatibility decomposition
 116 followed by B<canonical> composition).
 117
 118 =item C<$normalized_string = normalize($form_name, $string)>
 119
 120 As C<$form_name>, one of the following names must be given.
 121
 122   'C'  or 'NFC'  for Normalization Form C
 123   'D'  or 'NFD'  for Normalization Form D
 124   'KC' or 'NFKC' for Normalization Form KC
 125   'KD' or 'NFKD' for Normalization Form KD
 126
 127 =back
 128
 129 =head2 Decomposition and Composition
 130
 131 =over 4
 132
 133 =item C<$decomposed_string = decompose($string)>
 134
 135 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
 136
 137 Decomposes the specified string and returns the result.
 138
 139 If the second parameter (a boolean) is omitted or false, decomposes it
 140 using the Canonical Decomposition Mapping.
 141 If true, decomposes it using the Compatibility Decomposition Mapping.
 142
 143 The string returned is not always in NFD/NFKD.
 144 Reordering may be required.
 145
 146     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 147     $NFKD_string = reorder(decompose($string, 1));    # eq. to NFKD()
 148
 149 =item C<$reordered_string  = reorder($string)>
 150
 151 Reorders the combining characters and the like into canonical order
 152 and returns the result.
 153
 154 E.g., when you have a list of NFD/NFKD strings,
 155 you can get the concatenated NFD/NFKD string from them, saying
 156
 157     $concat_NFD  = reorder(join '', @NFD_strings);
 158     $concat_NFKD = reorder(join '', @NFKD_strings);
 159
 160 =item C<$composed_string   = compose($string)>
 161
 162 Returns the string where composable pairs are composed.
 163
 164 E.g., when you have a NFD/NFKD string,
 165 you can get its NFC/NFKC string, saying
 166
 167     $NFC_string  = compose($NFD_string);
 168     $NFKC_string = compose($NFKD_string);
 169
 170 =back
 171
 172 =head2 Quick Check
 173
 174 (see Annex 8, UAX #15; F<DerivedNormalizationProperties.txt>)
 175
 176 The following functions check whether the string is in that normalization form.
 177
 178 The result returned will be:
 179
 180     YES     The string is in that normalization form.
 181     NO      The string is not in that normalization form.
 182     MAYBE   Dubious. Maybe yes, maybe no.
 183
 184 =over 4
 185
 186 =item C<$result = checkNFD($string)>
 187
 188 returns YES (1) or NO (empty string).
 189
 190 =item C<$result = checkNFC($string)>
 191
 192 returns YES (1), NO (empty string), or MAYBE (undef).
 193
 194 =item C<$result = checkNFKD($string)>
 195
 196 returns YES (1) or NO (empty string).
 197
 198 =item C<$result = checkNFKC($string)>
 199
 200 returns YES (1), NO (empty string), or MAYBE (undef).
 201
 202 =item C<$result = check($form_name, $string)>
 203
 204 returns YES (1), NO (empty string), or MAYBE (undef).
 205
 206 C<$form_name> takes the same values as for C<normalize()>.
 207
 208 =back
 209
 210 B<Note>
 211
 212 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
 213 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
 214
 215 A MAYBE-NFC/NFKC string should contain at least
 216 one combining character or the like.
 217 For example, C<COMBINING ACUTE ACCENT> has
 218 the MAYBE_NFC/MAYBE_NFKC property.
 219 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 220 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 221 However, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 222 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 223 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 224
 225 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
 226
 227     $string eq NFC($string)    # more thorough than checkNFC($string)
 228     $string eq NFKC($string)   # more thorough than checkNFKC($string)
 229
 230 =head2 Character Data
 231
 232 These functions are an interface to the character data used internally.
 233 If you only want to get Unicode normalization forms, you don't need
 234 to call them yourself.
 235
 236 =over 4
 237
 238 =item C<$canonical_decomposed = getCanon($codepoint)>
 239
 240 If the character of the specified codepoint is canonically
 241 decomposable (including Hangul Syllables),
 242 returns the B<completely decomposed> string canonically equivalent to it.
 243
 244 If it is not decomposable, returns undef.
 245
 246 =item C<$compatibility_decomposed = getCompat($codepoint)>
 247
 248 If the character of the specified codepoint is compatibility
 249 decomposable (including Hangul Syllables),
 250 returns the B<completely decomposed> string compatibility equivalent to it.
 251
 252 If it is not decomposable, returns undef.
 253
 254 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
 255
 256 If two characters here and next (as codepoints) are composable
 257 (including Hangul Jamo/Syllables and Composition Exclusions),
 258 returns the codepoint of the composite.
 259
 260 If they are not composable, returns undef.
 261
 262 =item C<$combining_class = getCombinClass($codepoint)>
 263
 264 Returns the combining class of the character as an integer.
 265
 266 =item C<$is_exclusion = isExclusion($codepoint)>
 267
 268 Returns a boolean whether the character of the specified codepoint
 269 is a composition exclusion.
 270
 271 =item C<$is_singleton = isSingleton($codepoint)>
 272
 273 Returns a boolean whether the character of the specified codepoint is
 274 a singleton.
 275
 276 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
 277
 278 Returns a boolean whether the canonical decomposition
 279 of the character of the specified codepoint
 280 is a Non-Starter Decomposition.
 281
 282 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
 283
 284 Returns a boolean whether the character of the specified codepoint
 285 may be composed with the previous one in a certain composition
 286 (including Hangul Compositions, but excluding
 287 Composition Exclusions and Non-Starter Decompositions).
 288
 289 =back
 290
 291 =head2 EXPORT
 292
 293 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 294
 295 C<normalize> and some other functions: on request.
 296
 297 =head1 AUTHOR
 298
 299 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 300
 301   http://homepage1.nifty.com/nomenclator/perl/
 302
 303   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
 304
 305   This program is free software; you can redistribute it and/or
 306   modify it under the same terms as Perl itself.
 307
 308 =head1 SEE ALSO
 309
 310 =over 4
 311
 312 =item http://www.unicode.org/unicode/reports/tr15/
 313
 314 Unicode Normalization Forms - UAX #15
 315
 316 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProperties.txt
 317
 318 Derived Normalization Properties
 319
 320 =back
 321
 322 =cut
 323