1 package Unicode::Normalize;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Normalize cannot stringify a Unicode code point\n";
16 our $VERSION = '0.28';
17 our $PACKAGE = __PACKAGE__;
22 our @ISA = qw(Exporter DynaLoader);
23 our @EXPORT = qw( NFC NFD NFKC NFKD );
25 normalize decompose reorder compose
26 checkNFD checkNFKD checkNFC checkNFKC check
27 getCanon getCompat getComposite getCombinClass
28 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
29 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
30 FCD checkFCD FCC checkFCC composeContiguous
34 all => [ @EXPORT, @EXPORT_OK ],
35 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
36 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
37 fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
42 bootstrap Unicode::Normalize $VERSION;
47 return pack('U*', @_);
51 return unpack('U*', pack('U*').shift);
56 ## normalization forms
59 use constant COMPAT => 1;
# NFD($string): Normalization Form D.
# Decomposes the argument with the canonical mapping (no second argument
# to decompose()), then puts the result into canonical order.
# The ($) prototype is part of the public interface and is kept as is.
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}
# NFKD($string): Normalization Form KD.
# Decomposes the argument with the compatibility mapping (COMPAT is passed
# as the true second argument to decompose()), then puts the result into
# canonical order.
# The ($) prototype is part of the public interface and is kept as is.
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}
# NFC($string): Normalization Form C.
# Three steps, applied in order: canonical decomposition, canonical
# reordering, and finally composition of the reordered string.
# The ($) prototype is part of the public interface and is kept as is.
sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
# NFKC($string): Normalization Form KC.
# Three steps, applied in order: compatibility decomposition (COMPAT is
# passed as the true second argument to decompose()), canonical
# reordering, and finally composition of the reordered string.
# The ($) prototype is part of the public interface and is kept as is.
sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
68 return checkFCD($str) ? $str : NFD($str);
# FCC($string): the FCC form ("Fast C Contiguous"; cf. UTN #5).
# Same pipeline shape as NFC, except that the final step uses
# composeContiguous() instead of compose().
# The ($) prototype is part of the public interface and is kept as is.
sub FCC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return composeContiguous($reordered);
}
73 NFC => \&NFC, C => \&NFC,
74 NFD => \&NFD, D => \&NFD,
75 NFKC => \&NFKC, KC => \&NFKC,
76 NFKD => \&NFKD, KD => \&NFKD,
77 FCD => \&FCD, FCC => \&FCC,
84 return exists $formNorm{$form}
85 ? $formNorm{$form}->($str)
86 : croak $PACKAGE."::normalize: invalid form name: $form";
95 NFC => \&checkNFC, C => \&checkNFC,
96 NFD => \&checkNFD, D => \&checkNFD,
97 NFKC => \&checkNFKC, KC => \&checkNFKC,
98 NFKD => \&checkNFKD, KD => \&checkNFKD,
99 FCD => \&checkFCD, FCC => \&checkFCC,
106 return exists $formCheck{$form}
107 ? $formCheck{$form}->($str)
108 : croak $PACKAGE."::check: invalid form name: $form";
116 Unicode::Normalize - Unicode Normalization Forms
120 use Unicode::Normalize;
122 $NFD_string = NFD($string); # Normalization Form D
123 $NFC_string = NFC($string); # Normalization Form C
124 $NFKD_string = NFKD($string); # Normalization Form KD
125 $NFKC_string = NFKC($string); # Normalization Form KC
129 use Unicode::Normalize 'normalize';
131 $NFD_string = normalize('D', $string); # Normalization Form D
132 $NFC_string = normalize('C', $string); # Normalization Form C
133 $NFKD_string = normalize('KD', $string); # Normalization Form KD
134 $NFKC_string = normalize('KC', $string); # Normalization Form KC
140 C<$string> is used as a string under character semantics
141 (see F<perlunicode>).
143 C<$codepoint> should be an unsigned integer
144 representing a Unicode code point.
146 Note: The XS edition and the pure Perl edition differ in how they
147 interpret a C<$codepoint> given as a decimal number:
148 XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
149 Do not use a floating-point number or a negative sign in C<$codepoint>.
151 =head2 Normalization Forms
155 =item C<$NFD_string = NFD($string)>
157 returns the Normalization Form D (formed by canonical decomposition).
159 =item C<$NFC_string = NFC($string)>
161 returns the Normalization Form C (formed by canonical decomposition
162 followed by canonical composition).
164 =item C<$NFKD_string = NFKD($string)>
166 returns the Normalization Form KD (formed by compatibility decomposition).
168 =item C<$NFKC_string = NFKC($string)>
170 returns the Normalization Form KC (formed by compatibility decomposition
171 followed by B<canonical> composition).
173 =item C<$FCD_string = FCD($string)>
175 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
176 returns it without modification; otherwise returns an FCD string.
178 Note: FCD is not always unique, so several different forms may be
179 equivalent to each other. C<FCD()> will return one of these equivalent forms.
181 =item C<$FCC_string = FCC($string)>
183 returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
185 Note: FCC is unique, as are the four normalization forms (NF*).
187 =item C<$normalized_string = normalize($form_name, $string)>
189 As C<$form_name>, one of the following names must be given.
191 'C' or 'NFC' for Normalization Form C (UAX #15)
192 'D' or 'NFD' for Normalization Form D (UAX #15)
193 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
194 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
196 'FCD' for "Fast C or D" Form (UTN #5)
197 'FCC' for "Fast C Contiguous" (UTN #5)
201 =head2 Decomposition and Composition
205 =item C<$decomposed_string = decompose($string)>
207 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
209 Decomposes the specified string and returns the result.
211 If the second parameter (a boolean) is omitted or false, decomposes it
212 using the Canonical Decomposition Mapping.
213 If true, decomposes it using the Compatibility Decomposition Mapping.
215 The string returned is not always in NFD/NFKD.
216 Reordering may be required.
218 $NFD_string = reorder(decompose($string)); # eq. to NFD()
219 $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
221 =item C<$reordered_string = reorder($string)>
223 Reorders the combining characters and the like in the canonical ordering
224 and returns the result.
226 E.g., when you have a list of NFD/NFKD strings,
227 you can get the concatenated NFD/NFKD string from them by saying
229 $concat_NFD = reorder(join '', @NFD_strings);
230 $concat_NFKD = reorder(join '', @NFKD_strings);
232 =item C<$composed_string = compose($string)>
234 Returns the string where composable pairs are composed.
236 E.g., when you have an NFD/NFKD string,
237 you can get its NFC/NFKC string by saying
239 $NFC_string = compose($NFD_string);
240 $NFKC_string = compose($NFKD_string);
246 (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
248 The following functions check whether the string is in that normalization form.
250 The result returned will be:
252 YES The string is in that normalization form.
253 NO The string is not in that normalization form.
254 MAYBE Dubious. Maybe yes, maybe no.
258 =item C<$result = checkNFD($string)>
260 returns C<YES> (C<1>) or C<NO> (C<empty string>).
262 =item C<$result = checkNFC($string)>
264 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
266 =item C<$result = checkNFKD($string)>
268 returns C<YES> (C<1>) or C<NO> (C<empty string>).
270 =item C<$result = checkNFKC($string)>
272 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
274 =item C<$result = checkFCD($string)>
276 returns C<YES> (C<1>) or C<NO> (C<empty string>).
278 =item C<$result = checkFCC($string)>
280 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
282 If a string is not in FCD, it cannot be in FCC.
283 So C<checkFCC($not_FCD_string)> should return C<NO>.
285 =item C<$result = check($form_name, $string)>
287 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
289 C<$form_name> is the same as that for C<normalize()>.
295 In the cases of NFD, NFKD, and FCD, the answer must be
296 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
297 in the cases of NFC, NFKC, and FCC.
299 A C<MAYBE> string should contain at least one combining character
300 or the like. For example, C<COMBINING ACUTE ACCENT> has
301 the MAYBE_NFC/MAYBE_NFKC property.
303 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
304 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
305 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
306 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
307 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
309 If you want to check exactly, compare the string with its NFC/NFKC/FCC;
312 $string eq NFC($string) # more thorough than checkNFC($string)
313 $string eq NFKC($string) # more thorough than checkNFKC($string)
314 $string eq FCC($string) # more thorough than checkFCC($string)
316 =head2 Character Data
318 These functions are an interface to the character data used internally.
319 If you want only to get Unicode normalization forms, you don't need
324 =item C<$canonical_decomposed = getCanon($codepoint)>
326 If the character of the specified codepoint is canonically
327 decomposable (including Hangul Syllables),
328 returns the B<completely decomposed> string canonically equivalent to it.
330 If it is not decomposable, returns C<undef>.
332 =item C<$compatibility_decomposed = getCompat($codepoint)>
334 If the character of the specified codepoint is compatibility
335 decomposable (including Hangul Syllables),
336 returns the B<completely decomposed> string compatibility equivalent to it.
338 If it is not decomposable, returns C<undef>.
340 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
342 If two characters here and next (as codepoints) are composable
343 (including Hangul Jamo/Syllables and Composition Exclusions),
344 returns the codepoint of the composite.
346 If they are not composable, returns C<undef>.
348 =item C<$combining_class = getCombinClass($codepoint)>
350 Returns the combining class of the character as an integer.
352 =item C<$is_exclusion = isExclusion($codepoint)>
354 Returns a boolean whether the character of the specified codepoint
355 is a composition exclusion.
357 =item C<$is_singleton = isSingleton($codepoint)>
359 Returns a boolean whether the character of the specified codepoint is
362 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
364 Returns a boolean whether the canonical decomposition
365 of the character of the specified codepoint
366 is a Non-Starter Decomposition.
368 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
370 Returns a boolean whether the character of the specified codepoint
371 may be composed with the previous one in a certain composition
372 (including Hangul Compositions, but excluding
373 Composition Exclusions and Non-Starter Decompositions).
379 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
381 C<normalize> and some other functions: on request.
385 SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
387 http://homepage1.nifty.com/nomenclator/perl/
389 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
391 This module is free software; you can redistribute it
392 and/or modify it under the same terms as Perl itself.
398 =item http://www.unicode.org/reports/tr15/
400 Unicode Normalization Forms - UAX #15
402 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
404 Derived Normalization Properties
406 =item http://www.unicode.org/notes/tn5/
408 Canonical Equivalence in Applications - UTN #5