ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   # Compile-time platform guard: ord("A") is 193 only under EBCDIC, and
   # this module has not been ported there, so refuse to load at all.
   BEGIN {
       die "Unicode::Normalize not ported to EBCDIC\n"
           if ord("A") == 193;
   }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 our $VERSION = '0.17';
  15 our $PACKAGE = __PACKAGE__;
  16
  17 require Exporter;
  18 require DynaLoader;
  19 require AutoLoader;
  20
  21 our @ISA = qw(Exporter DynaLoader);
  # The four normalization-form functions are exported by default.
  our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

  # Everything else is exported only on request.
  our @EXPORT_OK = (
      qw( normalize decompose reorder compose ),
      qw( checkNFD checkNFKD checkNFC checkNFKC check ),
      qw( getCanon getCompat getComposite getCombinClass ),
      qw( isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex ),
      qw( isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE ),
  );

  # Named groups of exportable functions.
  our %EXPORT_TAGS = (
      'all'       => [ @EXPORT, @EXPORT_OK ],
      'normalize' => [ @EXPORT, 'normalize', 'decompose', 'reorder', 'compose' ],
      'check'     => [ 'checkNFD', 'checkNFKD', 'checkNFC', 'checkNFKC', 'check' ],
  );
  35
  36 bootstrap Unicode::Normalize $VERSION;
  37
  # Flag passed as decompose()'s second argument to request the
  # compatibility mapping instead of the canonical one.
  use constant COMPAT => 1;

  # NFD($string): canonical decomposition followed by reordering.
  sub NFD ($) {
      my $decomposed = decompose($_[0]);
      return reorder($decomposed);
  }

  # NFKD($string): compatibility decomposition followed by reordering.
  sub NFKD ($) {
      my $decomposed = decompose($_[0], COMPAT);
      return reorder($decomposed);
  }

  # NFC($string): the NFD steps, then composition of the result.
  sub NFC ($) {
      my $reordered = reorder(decompose($_[0]));
      return compose($reordered);
  }

  # NFKC($string): the NFKD steps, then composition of the result.
  sub NFKC ($) {
      my $reordered = reorder(decompose($_[0], COMPAT));
      return compose($reordered);
  }
  44
  # normalize($form_name, $string)
  #
  # Returns $string converted to the normalization form selected by
  # $form_name: 'C', 'D', 'KC' or 'KD', each optionally prefixed with 'NF'
  # (so 'NFC', 'NFD', 'NFKC' and 'NFKD' are accepted as well).
  #
  # Croaks if $form_name is none of those.  The diagnostic quotes the name
  # exactly as the caller passed it, so 'NFX' is reported as 'NFX' rather
  # than as the prefix-stripped 'X'.
  sub normalize($$)
  {
      my ($form, $str) = @_;

      # Strip the optional "NF" prefix from a copy only; $form itself is
      # kept intact for the error message.
      (my $short = $form) =~ s/^NF//;

      return
          $short eq 'D'  ? NFD ($str) :
          $short eq 'C'  ? NFC ($str) :
          $short eq 'KD' ? NFKD($str) :
          $short eq 'KC' ? NFKC($str) :
        croak(__PACKAGE__ . "::normalize: invalid form name: $form");
  }
  57
  # check($form_name, $string)
  #
  # Runs the quick check for the normalization form selected by $form_name
  # ('C', 'D', 'KC' or 'KD', each optionally prefixed with 'NF') on $string
  # and returns whatever the matching checkNFC / checkNFD / checkNFKC /
  # checkNFKD function returns.
  #
  # Croaks if $form_name is none of those.  The diagnostic quotes the name
  # exactly as the caller passed it, so 'NFX' is reported as 'NFX' rather
  # than as the prefix-stripped 'X'.
  sub check($$)
  {
      my ($form, $str) = @_;

      # Strip the optional "NF" prefix from a copy only; $form itself is
      # kept intact for the error message.
      (my $short = $form) =~ s/^NF//;

      return
          $short eq 'D'  ? checkNFD ($str) :
          $short eq 'C'  ? checkNFC ($str) :
          $short eq 'KD' ? checkNFKD($str) :
          $short eq 'KC' ? checkNFKC($str) :
        croak(__PACKAGE__ . "::check: invalid form name: $form");
  }
  70
  71 1;
  72 __END__
  73
  74 =head1 NAME
  75
  76 Unicode::Normalize - Unicode Normalization Forms
  77
  78 =head1 SYNOPSIS
  79
  80   use Unicode::Normalize;
  81
  82   $NFD_string  = NFD($string);  # Normalization Form D
  83   $NFC_string  = NFC($string);  # Normalization Form C
  84   $NFKD_string = NFKD($string); # Normalization Form KD
  85   $NFKC_string = NFKC($string); # Normalization Form KC
  86
  87    or
  88
  89   use Unicode::Normalize 'normalize';
  90
  91   $NFD_string  = normalize('D',  $string);  # Normalization Form D
  92   $NFC_string  = normalize('C',  $string);  # Normalization Form C
  93   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  94   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
  95
  96 =head1 DESCRIPTION
  97
  98 =head2 Normalization Forms
  99
 100 =over 4
 101
 102 =item C<$NFD_string = NFD($string)>
 103
 104 returns the Normalization Form D (formed by canonical decomposition).
 105
 106 =item C<$NFC_string = NFC($string)>
 107
 108 returns the Normalization Form C (formed by canonical decomposition
 109 followed by canonical composition).
 110
 111 =item C<$NFKD_string = NFKD($string)>
 112
 113 returns the Normalization Form KD (formed by compatibility decomposition).
 114
 115 =item C<$NFKC_string = NFKC($string)>
 116
 117 returns the Normalization Form KC (formed by compatibility decomposition
 118 followed by B<canonical> composition).
 119
 120 =item C<$normalized_string = normalize($form_name, $string)>
 121
 122 As C<$form_name>, one of the following names must be given.
 123
 124   'C'  or 'NFC'  for Normalization Form C
 125   'D'  or 'NFD'  for Normalization Form D
 126   'KC' or 'NFKC' for Normalization Form KC
 127   'KD' or 'NFKD' for Normalization Form KD
 128
 129 =back
 130
 131 =head2 Decomposition and Composition
 132
 133 =over 4
 134
 135 =item C<$decomposed_string = decompose($string)>
 136
 137 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
 138
 139 Decomposes the specified string and returns the result.
 140
 141 If the second parameter (a boolean) is omitted or false, decomposes it
 142 using the Canonical Decomposition Mapping.
 143 If true, decomposes it using the Compatibility Decomposition Mapping.
 144
 145 The string returned is not always in NFD/NFKD.
 146 Reordering may be required.
 147
 148     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 149     $NFKD_string = reorder(decompose($string, 1));    # eq. to NFKD()
 150
 151 =item C<$reordered_string  = reorder($string)>
 152
 153 Reorders the combining characters and the like in the canonical ordering
 154 and returns the result.
 155
 156 E.g., when you have a list of NFD/NFKD strings,
 157 you can get the concatenated NFD/NFKD string from them, saying
 158
 159     $concat_NFD  = reorder(join '', @NFD_strings);
 160     $concat_NFKD = reorder(join '', @NFKD_strings);
 161
 162 =item C<$composed_string   = compose($string)>
 163
 164 Returns the string where composable pairs are composed.
 165
 166 E.g., when you have a NFD/NFKD string,
 167 you can get its NFC/NFKC string, saying
 168
 169     $NFC_string  = compose($NFD_string);
 170     $NFKC_string = compose($NFKD_string);
 171
 172 =back
 173
 174 =head2 Quick Check
 175
 176 (see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)
 177
 178 The following functions check whether the string is in that normalization form.
 179
 180 The result returned will be:
 181
 182     YES     The string is in that normalization form.
 183     NO      The string is not in that normalization form.
 184     MAYBE   Dubious. Maybe yes, maybe no.
 185
 186 =over 4
 187
 188 =item C<$result = checkNFD($string)>
 189
 190 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 191
 192 =item C<$result = checkNFC($string)>
 193
 194 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 195
 196 =item C<$result = checkNFKD($string)>
 197
 198 returns C<YES> (C<1>) or C<NO> (C<empty string>).
 199
 200 =item C<$result = checkNFKC($string)>
 201
 202 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 203
 204 =item C<$result = check($form_name, $string)>
 205
 206 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 207
 208 C<$form_name> takes the same values as for C<normalize()>.
 209
 210 =back
 211
 212 B<Note>
 213
 214 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
 215 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
 216
 217 A MAYBE-NFC/NFKC string should contain at least
 218 one combining character or the like.
 219 For example, C<COMBINING ACUTE ACCENT> has
 220 the MAYBE_NFC/MAYBE_NFKC property.
 221 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 222 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 223 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 224 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 225 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 226
 227 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
 228
 229     $string eq NFC($string)    # more thorough than checkNFC($string)
 230     $string eq NFKC($string)   # more thorough than checkNFKC($string)
 231
 232 =head2 Character Data
 233
 234 These functions are an interface to the character data used internally.
 235 If you only want to get Unicode normalization forms, you don't need
 236 to call them yourself.
 237
 238 =over 4
 239
 240 =item C<$canonical_decomposed = getCanon($codepoint)>
 241
 242 If the character of the specified codepoint is canonically
 243 decomposable (including Hangul Syllables),
 244 returns the B<completely decomposed> string canonically equivalent to it.
 245
 246 If it is not decomposable, returns C<undef>.
 247
 248 =item C<$compatibility_decomposed = getCompat($codepoint)>
 249
 250 If the character of the specified codepoint is compatibility
 251 decomposable (including Hangul Syllables),
 252 returns the B<completely decomposed> string compatibility equivalent to it.
 253
 254 If it is not decomposable, returns C<undef>.
 255
 256 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
 257
 258 If two characters here and next (as codepoints) are composable
 259 (including Hangul Jamo/Syllables and Composition Exclusions),
 260 returns the codepoint of the composite.
 261
 262 If they are not composable, returns C<undef>.
 263
 264 =item C<$combining_class = getCombinClass($codepoint)>
 265
 266 Returns the combining class of the character as an integer.
 267
 268 =item C<$is_exclusion = isExclusion($codepoint)>
 269
 270 Returns a boolean whether the character of the specified codepoint
 271 is a composition exclusion.
 272
 273 =item C<$is_singleton = isSingleton($codepoint)>
 274
 275 Returns a boolean whether the character of the specified codepoint is
 276 a singleton.
 277
 278 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
 279
 280 Returns a boolean whether the canonical decomposition
 281 of the character of the specified codepoint
 282 is a Non-Starter Decomposition.
 283
 284 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
 285
 286 Returns a boolean whether the character of the specified codepoint
 287 may be composed with the previous one in a certain composition
 288 (including Hangul Compositions, but excluding
 289 Composition Exclusions and Non-Starter Decompositions).
 290
 291 =back
 292
 293 =head2 EXPORT
 294
 295 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 296
 297 C<normalize> and some other functions: on request.
 298
 299 =head1 AUTHOR
 300
 301 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 302
 303   http://homepage1.nifty.com/nomenclator/perl/
 304
 305   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
 306
 307   This program is free software; you can redistribute it and/or
 308   modify it under the same terms as Perl itself.
 309
 310 =head1 SEE ALSO
 311
 312 =over 4
 313
 314 =item http://www.unicode.org/unicode/reports/tr15/
 315
 316 Unicode Normalization Forms - UAX #15
 317
 318 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 319
 320 Derived Normalization Properties
 321
 322 =back
 323
 324 =cut
 325