1 package Unicode::Normalize;
5 die "Unicode::Normalize not ported to EBCDIC\n";
# Module version; the same value is passed to the bootstrap call further down.
our $VERSION = '0.16';

# Package name kept in a variable so that error messages can be prefixed with it.
our $PACKAGE = __PACKAGE__;

# Inherit from Exporter and DynaLoader.
our @ISA = ('Exporter', 'DynaLoader');

# Names exported by default: the four normalization-form functions.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');
24 normalize decompose reorder compose
25 checkNFD checkNFKD checkNFC checkNFKC check
26 getCanon getCompat getComposite getCombinClass
27 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
28 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
31 all => [ @EXPORT, @EXPORT_OK ],
32 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
33 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
# Invoke the bootstrap method (available through @ISA) on this package,
# passing the module version along.
# NOTE(review): decompose/reorder/compose are not defined in the visible Perl
# code, so they presumably come from the compiled part loaded here -- confirm.
Unicode::Normalize->bootstrap($VERSION);

# Flag passed as the second argument of decompose() to request the
# compatibility decomposition mapping instead of the canonical one.
use constant COMPAT => 1;
# NFD($string)
# Returns Normalization Form D of the argument: the string is decomposed
# with the canonical mapping and the result is then reordered.
# The ($) prototype is part of the public interface and is kept as is.
sub NFD ($) {
    return reorder( decompose( $_[0] ) );
}
# NFKD($string)
# Returns Normalization Form KD of the argument: the string is decomposed
# with the compatibility mapping (COMPAT flag) and the result is reordered.
# The ($) prototype is part of the public interface and is kept as is.
sub NFKD ($) {
    return reorder( decompose( $_[0], COMPAT ) );
}
# NFC($string)
# Returns Normalization Form C of the argument: canonical decomposition,
# then reordering, then composition of the reordered string.
# The ($) prototype is part of the public interface and is kept as is.
sub NFC ($) {
    return compose(
        reorder( decompose( $_[0] ) )
    );
}
# NFKC($string)
# Returns Normalization Form KC of the argument: compatibility decomposition
# (COMPAT flag), then reordering, then composition of the reordered string.
# The ($) prototype is part of the public interface and is kept as is.
sub NFKC ($) {
    return compose(
        reorder( decompose( $_[0], COMPAT ) )
    );
}
50 $form eq 'D' ? NFD ($_[0]) :
51 $form eq 'C' ? NFC ($_[0]) :
52 $form eq 'KD' ? NFKD($_[0]) :
53 $form eq 'KC' ? NFKC($_[0]) :
54 croak $PACKAGE."::normalize: invalid form name: $form";
62 $form eq 'D' ? checkNFD ($_[0]) :
63 $form eq 'C' ? checkNFC ($_[0]) :
64 $form eq 'KD' ? checkNFKD($_[0]) :
65 $form eq 'KC' ? checkNFKC($_[0]) :
66 croak $PACKAGE."::check: invalid form name: $form";
74 Unicode::Normalize - normalized forms of Unicode text
78 use Unicode::Normalize;
80 $NFD_string = NFD($string); # Normalization Form D
81 $NFC_string = NFC($string); # Normalization Form C
82 $NFKD_string = NFKD($string); # Normalization Form KD
83 $NFKC_string = NFKC($string); # Normalization Form KC
87 use Unicode::Normalize 'normalize';
89 $NFD_string = normalize('D', $string); # Normalization Form D
90 $NFC_string = normalize('C', $string); # Normalization Form C
91 $NFKD_string = normalize('KD', $string); # Normalization Form KD
92 $NFKC_string = normalize('KC', $string); # Normalization Form KC
96 =head2 Normalization Forms
100 =item C<$NFD_string = NFD($string)>
102 returns the Normalization Form D (formed by canonical decomposition).
104 =item C<$NFC_string = NFC($string)>
106 returns the Normalization Form C (formed by canonical decomposition
107 followed by canonical composition).
109 =item C<$NFKD_string = NFKD($string)>
111 returns the Normalization Form KD (formed by compatibility decomposition).
113 =item C<$NFKC_string = NFKC($string)>
115 returns the Normalization Form KC (formed by compatibility decomposition
116 followed by B<canonical> composition).
118 =item C<$normalized_string = normalize($form_name, $string)>
C<$form_name> must be one of the following names:
122 'C' or 'NFC' for Normalization Form C
123 'D' or 'NFD' for Normalization Form D
124 'KC' or 'NFKC' for Normalization Form KC
125 'KD' or 'NFKD' for Normalization Form KD
129 =head2 Decomposition and Composition
133 =item C<$decomposed_string = decompose($string)>
135 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
Decomposes the specified string and returns the result.
139 If the second parameter (a boolean) is omitted or false, decomposes it
140 using the Canonical Decomposition Mapping.
141 If true, decomposes it using the Compatibility Decomposition Mapping.
143 The string returned is not always in NFD/NFKD.
144 Reordering may be required.
146 $NFD_string = reorder(decompose($string)); # eq. to NFD()
147 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
149 =item C<$reordered_string = reorder($string)>
Reorders the combining characters and the like into the canonical ordering
and returns the result.
154 E.g., when you have a list of NFD/NFKD strings,
155 you can get the concatenated NFD/NFKD string from them, saying
157 $concat_NFD = reorder(join '', @NFD_strings);
158 $concat_NFKD = reorder(join '', @NFKD_strings);
160 =item C<$composed_string = compose($string)>
162 Returns the string where composable pairs are composed.
164 E.g., when you have a NFD/NFKD string,
165 you can get its NFC/NFKC string, saying
167 $NFC_string = compose($NFD_string);
168 $NFKC_string = compose($NFKD_string);
174 (see Annex 8, UAX #15; F<DerivedNormalizationProperties.txt>)
The following functions check whether the string is in the corresponding normalization form.
178 The result returned will be:
180 YES The string is in that normalization form.
181 NO The string is not in that normalization form.
182 MAYBE Dubious. Maybe yes, maybe no.
186 =item C<$result = checkNFD($string)>
188 returns YES (1) or NO (empty string).
190 =item C<$result = checkNFC($string)>
192 returns YES (1), NO (empty string), or MAYBE (undef).
194 =item C<$result = checkNFKD($string)>
196 returns YES (1) or NO (empty string).
198 =item C<$result = checkNFKC($string)>
200 returns YES (1), NO (empty string), or MAYBE (undef).
202 =item C<$result = check($form_name, $string)>
204 returns YES (1), NO (empty string), or MAYBE (undef).
C<$form_name> takes the same values as for C<normalize()>.
212 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
213 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
215 A MAYBE-NFC/NFKC string should contain at least
216 one combining character or the like.
217 For example, C<COMBINING ACUTE ACCENT> has
218 the MAYBE_NFC/MAYBE_NFKC property.
219 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
220 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
However, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
222 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
223 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
225 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
227 $string eq NFC($string) # more thorough than checkNFC($string)
228 $string eq NFKC($string) # more thorough than checkNFKC($string)
230 =head2 Character Data
These functions provide an interface to the character data used internally.
233 If you want only to get Unicode normalization forms, you don't need
238 =item C<$canonical_decomposed = getCanon($codepoint)>
240 If the character of the specified codepoint is canonically
241 decomposable (including Hangul Syllables),
242 returns the B<completely decomposed> string canonically equivalent to it.
244 If it is not decomposable, returns undef.
246 =item C<$compatibility_decomposed = getCompat($codepoint)>
248 If the character of the specified codepoint is compatibility
249 decomposable (including Hangul Syllables),
250 returns the B<completely decomposed> string compatibility equivalent to it.
252 If it is not decomposable, returns undef.
254 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
256 If two characters here and next (as codepoints) are composable
257 (including Hangul Jamo/Syllables and Composition Exclusions),
258 returns the codepoint of the composite.
260 If they are not composable, returns undef.
262 =item C<$combining_class = getCombinClass($codepoint)>
264 Returns the combining class of the character as an integer.
266 =item C<$is_exclusion = isExclusion($codepoint)>
Returns a boolean indicating whether the character of the specified codepoint
269 is a composition exclusion.
271 =item C<$is_singleton = isSingleton($codepoint)>
Returns a boolean indicating whether the character of the specified codepoint is
=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
Returns a boolean indicating whether the canonical decomposition
279 of the character of the specified codepoint
280 is a Non-Starter Decomposition.
282 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
Returns a boolean indicating whether the character of the specified codepoint
285 may be composed with the previous one in a certain composition
286 (including Hangul Compositions, but excluding
287 Composition Exclusions and Non-Starter Decompositions).
293 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
C<normalize> and some other functions: on request.
299 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
301 http://homepage1.nifty.com/nomenclator/perl/
303 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
305 This program is free software; you can redistribute it and/or
306 modify it under the same terms as Perl itself.
312 =item http://www.unicode.org/unicode/reports/tr15/
314 Unicode Normalization Forms - UAX #15
316 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProperties.txt
318 Derived Normalization Properties