Commit | Line | Data |
ac5ea531 |
1 | package Unicode::Normalize; |
2 | |
4a2e806c |
3 | BEGIN { |
9f1f04a1 |
4 | unless ("A" eq pack('U', 0x41) || "A" eq pack('U', ord("A"))) { |
5 | die "Unicode::Normalize cannot stringify a Unicode code point\n"; |
4a2e806c |
6 | } |
7 | } |
8 | |
ac5ea531 |
9 | use 5.006; |
10 | use strict; |
11 | use warnings; |
12 | use Carp; |
13 | |
9f1f04a1 |
14 | our $VERSION = '0.21'; |
ac5ea531 |
15 | our $PACKAGE = __PACKAGE__; |
16 | |
17 | require Exporter; |
18 | require DynaLoader; |
19 | require AutoLoader; |
20 | |
21 | our @ISA = qw(Exporter DynaLoader); |
22 | our @EXPORT = qw( NFC NFD NFKC NFKD ); |
2a204b45 |
23 | our @EXPORT_OK = qw( |
24 | normalize decompose reorder compose |
8f118dcd |
25 | checkNFD checkNFKD checkNFC checkNFKC check |
26 | getCanon getCompat getComposite getCombinClass |
27 | isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex |
28 | isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE |
29 | ); |
30 | our %EXPORT_TAGS = ( |
31 | all => [ @EXPORT, @EXPORT_OK ], |
32 | normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ], |
33 | check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ], |
2a204b45 |
34 | ); |
ac5ea531 |
35 | |
36 | bootstrap Unicode::Normalize $VERSION; |
37 | |
9f1f04a1 |
38 | use constant UNICODE_FOR_PACK => "A" eq pack('U', 0x41); |
39 | use constant NATIVE_FOR_PACK => "A" eq pack('U', ord("A")); |
40 | |
41 | use constant UNICODE_FOR_UNPACK => 0x41 == unpack('U', "A"); |
42 | use constant NATIVE_FOR_UNPACK => ord("A") == unpack('U', "A"); |
43 | |
44 | sub pack_U { |
45 | return UNICODE_FOR_PACK |
46 | ? pack('U*', @_) |
47 | : NATIVE_FOR_PACK |
48 | ? pack('U*', map utf8::unicode_to_native($_), @_) |
49 | : die "$PACKAGE, a Unicode code point cannot be stringified.\n"; |
50 | } |
51 | |
52 | sub unpack_U { |
53 | return UNICODE_FOR_UNPACK |
54 | ? unpack('U*', shift) |
55 | : NATIVE_FOR_UNPACK |
56 | ? map(utf8::native_to_unicode($_), unpack 'U*', shift) |
57 | : die "$PACKAGE, a code point returned from unpack U " . |
58 | "cannot be converted into Unicode.\n"; |
59 | } |
60 | |
ac5ea531 |
61 | use constant COMPAT => 1; |
62 | |
d85850a7 |
63 | sub NFD ($) { reorder(decompose($_[0])) } |
ac5ea531 |
64 | sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } |
d85850a7 |
65 | sub NFC ($) { compose(reorder(decompose($_[0]))) } |
ac5ea531 |
66 | sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } |
67 | |
68 | sub normalize($$) |
69 | { |
d85850a7 |
70 | my $form = shift; |
f027f502 |
71 | my $str = shift; |
d85850a7 |
72 | $form =~ s/^NF//; |
73 | return |
f027f502 |
74 | $form eq 'D' ? NFD ($str) : |
75 | $form eq 'C' ? NFC ($str) : |
76 | $form eq 'KD' ? NFKD($str) : |
77 | $form eq 'KC' ? NFKC($str) : |
d85850a7 |
78 | croak $PACKAGE."::normalize: invalid form name: $form"; |
ac5ea531 |
79 | } |
80 | |
8f118dcd |
81 | sub check($$) |
82 | { |
83 | my $form = shift; |
f027f502 |
84 | my $str = shift; |
8f118dcd |
85 | $form =~ s/^NF//; |
86 | return |
f027f502 |
87 | $form eq 'D' ? checkNFD ($str) : |
88 | $form eq 'C' ? checkNFC ($str) : |
89 | $form eq 'KD' ? checkNFKD($str) : |
90 | $form eq 'KC' ? checkNFKC($str) : |
8f118dcd |
91 | croak $PACKAGE."::check: invalid form name: $form"; |
92 | } |
93 | |
ac5ea531 |
94 | 1; |
95 | __END__ |
2a204b45 |
96 | |
97 | =head1 NAME |
98 | |
f027f502 |
99 | Unicode::Normalize - Unicode Normalization Forms |
2a204b45 |
100 | |
101 | =head1 SYNOPSIS |
102 | |
103 | use Unicode::Normalize; |
104 | |
8f118dcd |
105 | $NFD_string = NFD($string); # Normalization Form D |
106 | $NFC_string = NFC($string); # Normalization Form C |
107 | $NFKD_string = NFKD($string); # Normalization Form KD |
108 | $NFKC_string = NFKC($string); # Normalization Form KC |
2a204b45 |
109 | |
110 | or |
111 | |
112 | use Unicode::Normalize 'normalize'; |
113 | |
8f118dcd |
114 | $NFD_string = normalize('D', $string); # Normalization Form D |
115 | $NFC_string = normalize('C', $string); # Normalization Form C |
116 | $NFKD_string = normalize('KD', $string); # Normalization Form KD |
117 | $NFKC_string = normalize('KC', $string); # Normalization Form KC |
2a204b45 |
118 | |
119 | =head1 DESCRIPTION |
120 | |
d85850a7 |
121 | =head2 Normalization Forms |
2a204b45 |
122 | |
123 | =over 4 |
124 | |
8f118dcd |
125 | =item C<$NFD_string = NFD($string)> |
2a204b45 |
126 | |
127 | returns the Normalization Form D (formed by canonical decomposition). |
128 | |
8f118dcd |
129 | =item C<$NFC_string = NFC($string)> |
2a204b45 |
130 | |
131 | returns the Normalization Form C (formed by canonical decomposition |
132 | followed by canonical composition). |
133 | |
8f118dcd |
134 | =item C<$NFKD_string = NFKD($string)> |
2a204b45 |
135 | |
136 | returns the Normalization Form KD (formed by compatibility decomposition). |
137 | |
8f118dcd |
138 | =item C<$NFKC_string = NFKC($string)> |
2a204b45 |
139 | |
140 | returns the Normalization Form KC (formed by compatibility decomposition |
141 | followed by B<canonical> composition). |
142 | |
8f118dcd |
143 | =item C<$normalized_string = normalize($form_name, $string)> |
2a204b45 |
144 | |
145 | As C<$form_name>, one of the following names must be given. |
146 | |
147 | 'C' or 'NFC' for Normalization Form C |
148 | 'D' or 'NFD' for Normalization Form D |
149 | 'KC' or 'NFKC' for Normalization Form KC |
150 | 'KD' or 'NFKD' for Normalization Form KD |
151 | |
152 | =back |
153 | |
8f118dcd |
154 | =head2 Decomposition and Composition |
155 | |
156 | =over 4 |
157 | |
158 | =item C<$decomposed_string = decompose($string)> |
159 | |
160 | =item C<$decomposed_string = decompose($string, $useCompatMapping)> |
161 | |
9f1f04a1 |
162 | Decomposes the specified string and returns the result. |
8f118dcd |
163 | |
164 | If the second parameter (a boolean) is omitted or false, decomposes it |
165 | using the Canonical Decomposition Mapping. |
166 | If true, decomposes it using the Compatibility Decomposition Mapping. |
167 | |
168 | The string returned is not always in NFD/NFKD. |
169 | Reordering may be required. |
170 | |
171 | $NFD_string = reorder(decompose($string)); # eq. to NFD() |
172 | $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD() |
173 | |
174 | =item C<$reordered_string = reorder($string)> |
175 | |
9f1f04a1 |
176 | Reorders the combining characters and the like in the canonical ordering |
8f118dcd |
177 | and returns the result. |
178 | |
179 | E.g., when you have a list of NFD/NFKD strings, |
180 | you can get the concatenated NFD/NFKD string from them, saying |
181 | |
182 | $concat_NFD = reorder(join '', @NFD_strings); |
183 | $concat_NFKD = reorder(join '', @NFKD_strings); |
184 | |
185 | =item C<$composed_string = compose($string)> |
186 | |
187 | Returns the string where composable pairs are composed. |
188 | |
189 | E.g., when you have an NFD/NFKD string, |
190 | you can get its NFC/NFKC string, saying |
191 | |
192 | $NFC_string = compose($NFD_string); |
193 | $NFKC_string = compose($NFKD_string); |
194 | |
195 | =back |
196 | |
197 | =head2 Quick Check |
198 | |
6c941e0c |
199 | (see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>) |
8f118dcd |
200 | |
201 | The following functions check whether the string is in that normalization form. |
202 | |
203 | The result returned will be: |
204 | |
205 | YES The string is in that normalization form. |
206 | NO The string is not in that normalization form. |
207 | MAYBE Dubious. Maybe yes, maybe no. |
208 | |
209 | =over 4 |
210 | |
211 | =item C<$result = checkNFD($string)> |
212 | |
f027f502 |
213 | returns C<YES> (C<1>) or C<NO> (C<empty string>). |
8f118dcd |
214 | |
215 | =item C<$result = checkNFC($string)> |
216 | |
f027f502 |
217 | returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). |
8f118dcd |
218 | |
219 | =item C<$result = checkNFKD($string)> |
220 | |
f027f502 |
221 | returns C<YES> (C<1>) or C<NO> (C<empty string>). |
8f118dcd |
222 | |
223 | =item C<$result = checkNFKC($string)> |
224 | |
f027f502 |
225 | returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). |
8f118dcd |
226 | |
227 | =item C<$result = check($form_name, $string)> |
228 | |
f027f502 |
229 | returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). |
8f118dcd |
230 | |
231 | C<$form_name> takes the same values as for C<normalize()>. |
232 | |
233 | =back |
234 | |
235 | B<Note> |
236 | |
237 | In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>. |
238 | The answer C<MAYBE> may be returned in the cases of NFC and NFKC. |
239 | |
240 | A MAYBE-NFC/NFKC string should contain at least |
241 | one combining character or the like. |
242 | For example, C<COMBINING ACUTE ACCENT> has |
243 | the MAYBE_NFC/MAYBE_NFKC property. |
244 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> |
245 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. |
f027f502 |
246 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC |
8f118dcd |
247 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), |
248 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. |
249 | |
250 | If you want to check exactly, compare the string with its NFC/NFKC; i.e., |
251 | |
252 | $string eq NFC($string) # more thorough than checkNFC($string) |
253 | $string eq NFKC($string) # more thorough than checkNFKC($string) |
254 | |
2a204b45 |
255 | =head2 Character Data |
256 | |
257 | These functions are an interface to the character data used internally. |
d0ed0342 |
258 | If you only want to get Unicode normalization forms, you don't need to |
259 | call them yourself. |
2a204b45 |
260 | |
261 | =over 4 |
262 | |
263 | =item C<$canonical_decomposed = getCanon($codepoint)> |
264 | |
8f118dcd |
265 | If the character of the specified codepoint is canonically |
266 | decomposable (including Hangul Syllables), |
267 | returns the B<completely decomposed> string canonically equivalent to it. |
268 | |
f027f502 |
269 | If it is not decomposable, returns C<undef>. |
8f118dcd |
270 | |
2a204b45 |
271 | =item C<$compatibility_decomposed = getCompat($codepoint)> |
272 | |
8f118dcd |
273 | If the character of the specified codepoint is compatibility |
274 | decomposable (including Hangul Syllables), |
275 | returns the B<completely decomposed> string compatibility equivalent to it. |
2a204b45 |
276 | |
f027f502 |
277 | If it is not decomposable, returns C<undef>. |
2a204b45 |
278 | |
8f118dcd |
279 | =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)> |
2a204b45 |
280 | |
d85850a7 |
281 | If two characters here and next (as codepoints) are composable |
8f118dcd |
282 | (including Hangul Jamo/Syllables and Composition Exclusions), |
2a204b45 |
283 | returns the codepoint of the composite. |
284 | |
f027f502 |
285 | If they are not composable, returns C<undef>. |
2a204b45 |
286 | |
287 | =item C<$combining_class = getCombinClass($codepoint)> |
288 | |
8f118dcd |
289 | Returns the combining class of the character as an integer. |
2a204b45 |
290 | |
291 | =item C<$is_exclusion = isExclusion($codepoint)> |
292 | |
8f118dcd |
293 | Returns a boolean whether the character of the specified codepoint |
294 | is a composition exclusion. |
295 | |
296 | =item C<$is_singleton = isSingleton($codepoint)> |
297 | |
2a204b45 |
298 | Returns a boolean whether the character of the specified codepoint is |
8f118dcd |
299 | a singleton. |
300 | |
6c941e0c |
301 | =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)> |
8f118dcd |
302 | |
303 | Returns a boolean whether the canonical decomposition |
304 | of the character of the specified codepoint |
305 | is a Non-Starter Decomposition. |
306 | |
307 | =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)> |
308 | |
309 | Returns a boolean whether the character of the specified codepoint |
310 | may be composed with the previous one in a certain composition |
311 | (including Hangul Compositions, but excluding |
312 | Composition Exclusions and Non-Starter Decompositions). |
2a204b45 |
313 | |
314 | =back |
315 | |
316 | =head2 EXPORT |
317 | |
318 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. |
319 | |
320 | C<normalize> and some other functions: on request. |
321 | |
322 | =head1 AUTHOR |
323 | |
324 | SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> |
325 | |
326 | http://homepage1.nifty.com/nomenclator/perl/ |
327 | |
6c941e0c |
328 | Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved. |
2a204b45 |
329 | |
6c941e0c |
330 | This module is free software; you can redistribute it |
331 | and/or modify it under the same terms as Perl itself. |
2a204b45 |
332 | |
333 | =head1 SEE ALSO |
334 | |
335 | =over 4 |
336 | |
337 | =item http://www.unicode.org/unicode/reports/tr15/ |
338 | |
339 | Unicode Normalization Forms - UAX #15 |
340 | |
14e6b36c |
341 | =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt |
8f118dcd |
342 | |
343 | Derived Normalization Properties |
344 | |
2a204b45 |
345 | =back |
346 | |
347 | =cut |
348 | |