1 package Unicode::Normalize;
5 die "Unicode::Normalize not ported to EBCDIC\n";
14 our $VERSION = '0.17';
15 our $PACKAGE = __PACKAGE__;
21 our @ISA = qw(Exporter DynaLoader);
22 our @EXPORT = qw( NFC NFD NFKC NFKD );
24 normalize decompose reorder compose
25 checkNFD checkNFKD checkNFC checkNFKC check
26 getCanon getCompat getComposite getCombinClass
27 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
28 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
31 all => [ @EXPORT, @EXPORT_OK ],
32 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
33 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
36 bootstrap Unicode::Normalize $VERSION;
38 use constant COMPAT => 1;
40 sub NFD ($) { reorder(decompose($_[0])) } # Form D: canonical decomposition, then canonical reordering
41 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } # Form KD: compatibility decomposition (COMPAT is a true flag), then canonical reordering
42 sub NFC ($) { compose(reorder(decompose($_[0]))) } # Form C: canonical decomposition, canonical reordering, then composition
43 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } # Form KC: compatibility decomposition (COMPAT is a true flag), canonical reordering, then composition
51 $form eq 'D' ? NFD ($str) :
52 $form eq 'C' ? NFC ($str) :
53 $form eq 'KD' ? NFKD($str) :
54 $form eq 'KC' ? NFKC($str) :
55 croak $PACKAGE."::normalize: invalid form name: $form";
64 $form eq 'D' ? checkNFD ($str) :
65 $form eq 'C' ? checkNFC ($str) :
66 $form eq 'KD' ? checkNFKD($str) :
67 $form eq 'KC' ? checkNFKC($str) :
68 croak $PACKAGE."::check: invalid form name: $form";
76 Unicode::Normalize - Unicode Normalization Forms
80 use Unicode::Normalize;
82 $NFD_string = NFD($string); # Normalization Form D
83 $NFC_string = NFC($string); # Normalization Form C
84 $NFKD_string = NFKD($string); # Normalization Form KD
85 $NFKC_string = NFKC($string); # Normalization Form KC
89 use Unicode::Normalize 'normalize';
91 $NFD_string = normalize('D', $string); # Normalization Form D
92 $NFC_string = normalize('C', $string); # Normalization Form C
93 $NFKD_string = normalize('KD', $string); # Normalization Form KD
94 $NFKC_string = normalize('KC', $string); # Normalization Form KC
98 =head2 Normalization Forms
102 =item C<$NFD_string = NFD($string)>
104 returns the Normalization Form D (formed by canonical decomposition).
106 =item C<$NFC_string = NFC($string)>
108 returns the Normalization Form C (formed by canonical decomposition
109 followed by canonical composition).
111 =item C<$NFKD_string = NFKD($string)>
113 returns the Normalization Form KD (formed by compatibility decomposition).
115 =item C<$NFKC_string = NFKC($string)>
117 returns the Normalization Form KC (formed by compatibility decomposition
118 followed by B<canonical> composition).
120 =item C<$normalized_string = normalize($form_name, $string)>
122 As C<$form_name>, one of the following names must be given.
124 'C' or 'NFC' for Normalization Form C
125 'D' or 'NFD' for Normalization Form D
126 'KC' or 'NFKC' for Normalization Form KC
127 'KD' or 'NFKD' for Normalization Form KD
131 =head2 Decomposition and Composition
135 =item C<$decomposed_string = decompose($string)>
137 =item C<$decomposed_string = decompose($string, $useCompatMapping)>
139 Decomposes the specified string and returns the result.
141 If the second parameter (a boolean) is omitted or false, decomposes it
142 using the Canonical Decomposition Mapping.
143 If true, decomposes it using the Compatibility Decomposition Mapping.
145 The string returned is not always in NFD/NFKD.
146 Reordering may be required.
148 $NFD_string = reorder(decompose($string)); # eq. to NFD()
149 $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
151 =item C<$reordered_string = reorder($string)>
153 Reorders the combining characters and the like in the canonical ordering
154 and returns the result.
156 E.g., when you have a list of NFD/NFKD strings,
157 you can get the concatenated NFD/NFKD string from them, saying
159 $concat_NFD = reorder(join '', @NFD_strings);
160 $concat_NFKD = reorder(join '', @NFKD_strings);
162 =item C<$composed_string = compose($string)>
164 Returns the string where composable pairs are composed.
166 E.g., when you have a NFD/NFKD string,
167 you can get its NFC/NFKC string, saying
169 $NFC_string = compose($NFD_string);
170 $NFKC_string = compose($NFKD_string);
176 (see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)
178 The following functions check whether the string is in that normalization form.
180 The result returned will be:
182 YES The string is in that normalization form.
183 NO The string is not in that normalization form.
184 MAYBE Dubious. Maybe yes, maybe no.
188 =item C<$result = checkNFD($string)>
190 returns C<YES> (C<1>) or C<NO> (C<empty string>).
192 =item C<$result = checkNFC($string)>
194 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
196 =item C<$result = checkNFKD($string)>
198 returns C<YES> (C<1>) or C<NO> (C<empty string>).
200 =item C<$result = checkNFKC($string)>
202 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
204 =item C<$result = check($form_name, $string)>
206 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
208 C<$form_name> takes the same values as for C<normalize()>.
214 In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
215 The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
217 A MAYBE-NFC/NFKC string should contain at least
218 one combining character or the like.
219 For example, C<COMBINING ACUTE ACCENT> has
220 the MAYBE_NFC/MAYBE_NFKC property.
221 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
222 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
223 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
224 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
225 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
227 If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
229 $string eq NFC($string) # more thorough than checkNFC($string)
230 $string eq NFKC($string) # more thorough than checkNFKC($string)
232 =head2 Character Data
234 These functions are an interface to the character data used internally.
235 If you want only to get Unicode normalization forms, you don't need
240 =item C<$canonical_decomposed = getCanon($codepoint)>
242 If the character of the specified codepoint is canonically
243 decomposable (including Hangul Syllables),
244 returns the B<completely decomposed> string canonically equivalent to it.
246 If it is not decomposable, returns C<undef>.
248 =item C<$compatibility_decomposed = getCompat($codepoint)>
250 If the character of the specified codepoint is compatibility
251 decomposable (including Hangul Syllables),
252 returns the B<completely decomposed> string compatibility equivalent to it.
254 If it is not decomposable, returns C<undef>.
256 =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
258 If two characters here and next (as codepoints) are composable
259 (including Hangul Jamo/Syllables and Composition Exclusions),
260 returns the codepoint of the composite.
262 If they are not composable, returns C<undef>.
264 =item C<$combining_class = getCombinClass($codepoint)>
266 Returns the combining class of the character as an integer.
268 =item C<$is_exclusion = isExclusion($codepoint)>
270 Returns a boolean whether the character of the specified codepoint
271 is a composition exclusion.
273 =item C<$is_singleton = isSingleton($codepoint)>
275 Returns a boolean whether the character of the specified codepoint is
278 =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
280 Returns a boolean whether the canonical decomposition
281 of the character of the specified codepoint
282 is a Non-Starter Decomposition.
284 =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
286 Returns a boolean whether the character of the specified codepoint
287 may be composed with the previous one in a certain composition
288 (including Hangul Compositions, but excluding
289 Composition Exclusions and Non-Starter Decompositions).
295 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
297 C<normalize> and some other functions: on request.
301 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
303 http://homepage1.nifty.com/nomenclator/perl/
305 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
307 This program is free software; you can redistribute it and/or
308 modify it under the same terms as Perl itself.
314 =item http://www.unicode.org/unicode/reports/tr15/
316 Unicode Normalization Forms - UAX #15
318 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
320 Derived Normalization Properties