1 package Unicode::Normalize;
4 unless ("A" eq pack('U', 0x41) || "A" eq pack('U', ord("A"))) {
5 die "Unicode::Normalize cannot stringify a Unicode code point\n";
14 our $VERSION = '0.21'; # module version; also handed to bootstrap below
15 our $PACKAGE = __PACKAGE__; # package name, used as the prefix of die/croak messages
21 our @ISA = qw(Exporter DynaLoader); # inherit from Exporter and DynaLoader
22 our @EXPORT = qw( NFC NFD NFKC NFKD ); # the four normalization functions exported by default
24 normalize decompose reorder compose
25 checkNFD checkNFKD checkNFC checkNFKC check
26 getCanon getCompat getComposite getCombinClass
27 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
28 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
31 all => [ @EXPORT, @EXPORT_OK ],
32 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
33 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
36 bootstrap Unicode::Normalize $VERSION; # DynaLoader bootstrap for this package, passing $VERSION
38 use constant UNICODE_FOR_PACK => "A" eq pack('U', 0x41); # true if pack U turns code point 0x41 into the letter A
39 use constant NATIVE_FOR_PACK => "A" eq pack('U', ord("A")); # true if pack U turns the native code of A into the letter A
41 use constant UNICODE_FOR_UNPACK => 0x41 == unpack('U', "A"); # true if unpack U of the letter A yields 0x41
42 use constant NATIVE_FOR_UNPACK => ord("A") == unpack('U', "A"); # true if unpack U of the letter A yields its native code
45 return UNICODE_FOR_PACK
48 ? pack('U*', map utf8::unicode_to_native($_), @_)
49 : die "$PACKAGE, a Unicode code point cannot be stringified.\n";
53 return UNICODE_FOR_UNPACK
56 ? map(utf8::native_to_unicode($_), unpack 'U*', shift)
57 : die "$PACKAGE, a code point returned from unpack U " .
58 "cannot be converted into Unicode.\n";
61 use constant COMPAT => 1; # true flag passed as the second argument of decompose() to select compatibility decomposition
63 sub NFD ($) { reorder(decompose($_[0])) } # Form D: decompose (canonical mapping), then reorder
64 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } # Form KD: decompose (compatibility mapping), then reorder
65 sub NFC ($) { compose(reorder(decompose($_[0]))) } # Form C: decompose (canonical mapping), reorder, then compose
66 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } # Form KC: decompose (compatibility mapping), reorder, then compose
74 $form eq 'D' ? NFD ($str) :
75 $form eq 'C' ? NFC ($str) :
76 $form eq 'KD' ? NFKD($str) :
77 $form eq 'KC' ? NFKC($str) :
78 croak $PACKAGE."::normalize: invalid form name: $form";
87 $form eq 'D' ? checkNFD ($str) :
88 $form eq 'C' ? checkNFC ($str) :
89 $form eq 'KD' ? checkNFKD($str) :
90 $form eq 'KC' ? checkNFKC($str) :
91 croak $PACKAGE."::check: invalid form name: $form";
99 Unicode::Normalize - Unicode Normalization Forms
103 use Unicode::Normalize;
105 $NFD_string = NFD($string); # Normalization Form D
106 $NFC_string = NFC($string); # Normalization Form C
107 $NFKD_string = NFKD($string); # Normalization Form KD
108 $NFKC_string = NFKC($string); # Normalization Form KC
112 use Unicode::Normalize 'normalize';
114 $NFD_string = normalize('D', $string); # Normalization Form D
115 $NFC_string = normalize('C', $string); # Normalization Form C
116 $NFKD_string = normalize('KD', $string); # Normalization Form KD
117 $NFKC_string = normalize('KC', $string); # Normalization Form KC
121 =head2 Normalization Forms
125 =item C<$NFD_string = NFD($string)>
127 returns the Normalization Form D (formed by canonical decomposition).
129 =item C<$NFC_string = NFC($string)>
131 returns the Normalization Form C (formed by canonical decomposition
132 followed by canonical composition).
134 =item C<$NFKD_string = NFKD($string)>
136 returns the Normalization Form KD (formed by compatibility decomposition).
138 =item C<$NFKC_string = NFKC($string)>
140 returns the Normalization Form KC (formed by compatibility decomposition
141 followed by B<canonical> composition).
143 =item C<$normalized_string = normalize($form_name, $string)>
145 As C<$form_name>, one of the following names must be given.
147 'C' or 'NFC' for Normalization Form C
148 'D' or 'NFD' for Normalization Form D
149 'KC' or 'NFKC' for Normalization Form KC
150 'KD' or 'NFKD' for Normalization Form KD
154 =head2 Decomposition and Composition
158 =item C<$decomposed_string = decompose($string)>
160 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
162 Decomposes the specified string and returns the result.
164 If the second parameter (a boolean) is omitted or false, decomposes it
165 using the Canonical Decomposition Mapping.
166 If true, decomposes it using the Compatibility Decomposition Mapping.
168 The string returned is not always in NFD/NFKD.
169 Reordering may be required.
171 $NFD_string = reorder(decompose($string)); # eq. to NFD()
172 $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
174 =item C<$reordered_string = reorder($string)>
176 Reorders the combining characters and the like in the canonical ordering
177 and returns the result.
179 E.g., when you have a list of NFD/NFKD strings,
180 you can get the concatenated NFD/NFKD string from them, saying
182 $concat_NFD = reorder(join '', @NFD_strings);
183 $concat_NFKD = reorder(join '', @NFKD_strings);
185 =item C<$composed_string = compose($string)>
187 Returns the string where composable pairs are composed.
189 E.g., when you have a NFD/NFKD string,
190 you can get its NFC/NFKC string, saying
192 $NFC_string = compose($NFD_string);
193 $NFKC_string = compose($NFKD_string);
199 (see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
201 The following functions check whether the string is in that normalization form.
203 The result returned will be:
205 YES The string is in that normalization form.
206 NO The string is not in that normalization form.
207 MAYBE Dubious. Maybe yes, maybe no.
211 =item C<$result = checkNFD($string)>
213 returns C<YES> (C<1>) or C<NO> (C<empty string>).
215 =item C<$result = checkNFC($string)>
217 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
219 =item C<$result = checkNFKD($string)>
221 returns C<YES> (C<1>) or C<NO> (C<empty string>).
223 =item C<$result = checkNFKC($string)>
225 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
227 =item C<$result = check($form_name, $string)>
229 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
231 C<$form_name> is the same as that for C<normalize()>.
237 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
238 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
240 A MAYBE-NFC/NFKC string should contain at least
241 one combining character or the like.
242 For example, C<COMBINING ACUTE ACCENT> has
243 the MAYBE_NFC/MAYBE_NFKC property.
244 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
245 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
246 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
247 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
248 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
250 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
252 $string eq NFC($string) # more thorough than checkNFC($string)
253 $string eq NFKC($string) # more thorough than checkNFKC($string)
255 =head2 Character Data
257 These functions are an interface to the character data used internally.
258 If you want only to get Unicode normalization forms, you don't need
263 =item C<$canonical_decomposed = getCanon($codepoint)>
265 If the character of the specified codepoint is canonically
266 decomposable (including Hangul Syllables),
267 returns the B<completely decomposed> string canonically equivalent to it.
269 If it is not decomposable, returns C<undef>.
271 =item C<$compatibility_decomposed = getCompat($codepoint)>
273 If the character of the specified codepoint is compatibility
274 decomposable (including Hangul Syllables),
275 returns the B<completely decomposed> string compatibility equivalent to it.
277 If it is not decomposable, returns C<undef>.
279 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
281 If two characters here and next (as codepoints) are composable
282 (including Hangul Jamo/Syllables and Composition Exclusions),
283 returns the codepoint of the composite.
285 If they are not composable, returns C<undef>.
287 =item C<$combining_class = getCombinClass($codepoint)>
289 Returns the combining class of the character as an integer.
291 =item C<$is_exclusion = isExclusion($codepoint)>
293 Returns a boolean indicating whether the character of the specified codepoint
294 is a composition exclusion.
296 =item C<$is_singleton = isSingleton($codepoint)>
298 Returns a boolean indicating whether the character of the specified codepoint is
301 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
303 Returns a boolean indicating whether the canonical decomposition
304 of the character of the specified codepoint
305 is a Non-Starter Decomposition.
307 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
309 Returns a boolean indicating whether the character of the specified codepoint
310 may be composed with the previous one in a certain composition
311 (including Hangul Compositions, but excluding
312 Composition Exclusions and Non-Starter Decompositions).
318 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
320 C<normalize> and some other functions: on request.
324 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
326 http://homepage1.nifty.com/nomenclator/perl/
328 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
330 This module is free software; you can redistribute it
331 and/or modify it under the same terms as Perl itself.
337 =item http://www.unicode.org/unicode/reports/tr15/
339 Unicode Normalization Forms - UAX #15
341 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
343 Derived Normalization Properties