Commit | Line | Data |
ac5ea531 |
1 | package Unicode::Normalize; |
2 | |
4a2e806c |
3 | BEGIN { |
1efaba7f |
4 | unless ("A" eq pack('U', 0x41)) { |
9f1f04a1 |
5 | die "Unicode::Normalize cannot stringify a Unicode code point\n"; |
4a2e806c |
6 | } |
7 | } |
8 | |
ac5ea531 |
9 | use 5.006; |
10 | use strict; |
11 | use warnings; |
12 | use Carp; |
13 | |
e524f5b2 |
14 | no warnings 'utf8'; |
15 | |
628bbff0 |
16 | our $VERSION = '0.32'; |
ac5ea531 |
17 | our $PACKAGE = __PACKAGE__; |
18 | |
19 | require Exporter; |
20 | require DynaLoader; |
ac5ea531 |
21 | |
22 | our @ISA = qw(Exporter DynaLoader); |
23 | our @EXPORT = qw( NFC NFD NFKC NFKD ); |
2a204b45 |
24 | our @EXPORT_OK = qw( |
25 | normalize decompose reorder compose |
8f118dcd |
26 | checkNFD checkNFKD checkNFC checkNFKC check |
27 | getCanon getCompat getComposite getCombinClass |
28 | isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex |
29 | isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE |
82e740b6 |
30 | FCD checkFCD FCC checkFCC composeContiguous |
31 | splitOnLastStarter |
8f118dcd |
32 | ); |
33 | our %EXPORT_TAGS = ( |
34 | all => [ @EXPORT, @EXPORT_OK ], |
35 | normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ], |
36 | check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ], |
82e740b6 |
37 | fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ], |
2a204b45 |
38 | ); |
ac5ea531 |
39 | |
82e740b6 |
40 | ###### |
41 | |
ac5ea531 |
42 | bootstrap Unicode::Normalize $VERSION; |
43 | |
82e740b6 |
44 | ###### |
45 | |
9f1f04a1 |
46 | sub pack_U { |
b8d10bc1 |
47 | return pack('U*', @_); |
9f1f04a1 |
48 | } |
49 | |
50 | sub unpack_U { |
b8d10bc1 |
51 | return unpack('U*', pack('U*').shift); |
9f1f04a1 |
52 | } |
53 | |
82e740b6 |
54 | |
55 | ## |
56 | ## normalization forms |
57 | ## |
58 | |
ac5ea531 |
59 | use constant COMPAT => 1; |
60 | |
d85850a7 |
61 | sub NFD ($) { reorder(decompose($_[0])) } |
ac5ea531 |
62 | sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } |
d85850a7 |
63 | sub NFC ($) { compose(reorder(decompose($_[0]))) } |
ac5ea531 |
64 | sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } |
65 | |
82e740b6 |
66 | sub FCD ($) { |
67 | my $str = shift; |
68 | return checkFCD($str) ? $str : NFD($str); |
69 | } |
70 | sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) } |
71 | |
72 | our %formNorm = ( |
73 | NFC => \&NFC, C => \&NFC, |
74 | NFD => \&NFD, D => \&NFD, |
75 | NFKC => \&NFKC, KC => \&NFKC, |
76 | NFKD => \&NFKD, KD => \&NFKD, |
77 | FCD => \&FCD, FCC => \&FCC, |
78 | ); |
79 | |
ac5ea531 |
80 | sub normalize($$) |
81 | { |
d85850a7 |
82 | my $form = shift; |
f027f502 |
83 | my $str = shift; |
628bbff0 |
84 | return exists $formNorm{$form} |
82e740b6 |
85 | ? $formNorm{$form}->($str) |
86 | : croak $PACKAGE."::normalize: invalid form name: $form"; |
ac5ea531 |
87 | } |
88 | |
82e740b6 |
89 | |
90 | ## |
91 | ## quick check |
92 | ## |
93 | |
94 | our %formCheck = ( |
95 | NFC => \&checkNFC, C => \&checkNFC, |
96 | NFD => \&checkNFD, D => \&checkNFD, |
97 | NFKC => \&checkNFKC, KC => \&checkNFKC, |
98 | NFKD => \&checkNFKD, KD => \&checkNFKD, |
99 | FCD => \&checkFCD, FCC => \&checkFCC, |
100 | ); |
101 | |
8f118dcd |
102 | sub check($$) |
103 | { |
104 | my $form = shift; |
f027f502 |
105 | my $str = shift; |
628bbff0 |
106 | return exists $formCheck{$form} |
82e740b6 |
107 | ? $formCheck{$form}->($str) |
108 | : croak $PACKAGE."::check: invalid form name: $form"; |
8f118dcd |
109 | } |
110 | |
ac5ea531 |
111 | 1; |
112 | __END__ |
2a204b45 |
113 | |
114 | =head1 NAME |
115 | |
f027f502 |
116 | Unicode::Normalize - Unicode Normalization Forms |
2a204b45 |
117 | |
118 | =head1 SYNOPSIS |
119 | |
a092bcfd |
120 | (1) using function names exported by default: |
121 | |
2a204b45 |
122 | use Unicode::Normalize; |
123 | |
8f118dcd |
124 | $NFD_string = NFD($string); # Normalization Form D |
125 | $NFC_string = NFC($string); # Normalization Form C |
126 | $NFKD_string = NFKD($string); # Normalization Form KD |
127 | $NFKC_string = NFKC($string); # Normalization Form KC |
2a204b45 |
128 | |
a092bcfd |
129 | (2) using function names exported on request: |
2a204b45 |
130 | |
131 | use Unicode::Normalize 'normalize'; |
132 | |
8f118dcd |
133 | $NFD_string = normalize('D', $string); # Normalization Form D |
134 | $NFC_string = normalize('C', $string); # Normalization Form C |
135 | $NFKD_string = normalize('KD', $string); # Normalization Form KD |
136 | $NFKC_string = normalize('KC', $string); # Normalization Form KC |
2a204b45 |
137 | |
138 | =head1 DESCRIPTION |
139 | |
00f2676f |
140 | Parameters: |
141 | |
142 | C<$string> is used as a string under character semantics |
143 | (see F<perlunicode>). |
144 | |
145 | C<$codepoint> should be an unsigned integer |
146 | representing a Unicode code point. |
147 | |
628bbff0 |
148 | Note: Between XSUB and pure Perl, there is an incompatibility |
149 | about the interpretation of C<$codepoint> as a decimal number. |
150 | XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not. |
00f2676f |
151 | Do not use a floating-point value or a negative sign in C<$codepoint>. |
152 | |
d85850a7 |
153 | =head2 Normalization Forms |
2a204b45 |
154 | |
155 | =over 4 |
156 | |
8f118dcd |
157 | =item C<$NFD_string = NFD($string)> |
2a204b45 |
158 | |
159 | returns the Normalization Form D (formed by canonical decomposition). |
160 | |
8f118dcd |
161 | =item C<$NFC_string = NFC($string)> |
2a204b45 |
162 | |
163 | returns the Normalization Form C (formed by canonical decomposition |
164 | followed by canonical composition). |
165 | |
8f118dcd |
166 | =item C<$NFKD_string = NFKD($string)> |
2a204b45 |
167 | |
168 | returns the Normalization Form KD (formed by compatibility decomposition). |
169 | |
8f118dcd |
170 | =item C<$NFKC_string = NFKC($string)> |
2a204b45 |
171 | |
172 | returns the Normalization Form KC (formed by compatibility decomposition |
173 | followed by B<canonical> composition). |
174 | |
82e740b6 |
175 | =item C<$FCD_string = FCD($string)> |
176 | |
177 | If the given string is in FCD ("Fast C or D" form; cf. UTN #5), |
178 | returns it without modification; otherwise returns an FCD string. |
179 | |
180 | Note: FCD is not always unique; several different forms may be equivalent |
181 | to each other. C<FCD()> will return one of these equivalent forms. |
182 | |
183 | =item C<$FCC_string = FCC($string)> |
184 | |
185 | returns the FCC form ("Fast C Contiguous"; cf. UTN #5). |
186 | |
e524f5b2 |
187 | Note: FCC is unique, as are the four normalization forms (NF*). |
82e740b6 |
188 | |
8f118dcd |
189 | =item C<$normalized_string = normalize($form_name, $string)> |
2a204b45 |
190 | |
191 | As C<$form_name>, one of the following names must be given. |
192 | |
82e740b6 |
193 | 'C' or 'NFC' for Normalization Form C (UAX #15) |
194 | 'D' or 'NFD' for Normalization Form D (UAX #15) |
195 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) |
196 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) |
197 | |
198 | 'FCD' for "Fast C or D" Form (UTN #5) |
199 | 'FCC' for "Fast C Contiguous" (UTN #5) |
2a204b45 |
200 | |
201 | =back |
202 | |
8f118dcd |
203 | =head2 Decomposition and Composition |
204 | |
205 | =over 4 |
206 | |
207 | =item C<$decomposed_string = decompose($string)> |
208 | |
209 | =item C<$decomposed_string = decompose($string, $useCompatMapping)> |
210 | |
9f1f04a1 |
211 | Decomposes the specified string and returns the result. |
8f118dcd |
212 | |
213 | If the second parameter (a boolean) is omitted or false, decomposes it |
214 | using the Canonical Decomposition Mapping. |
215 | If true, decomposes it using the Compatibility Decomposition Mapping. |
216 | |
217 | The string returned is not always in NFD/NFKD. |
218 | Reordering may be required. |
219 | |
220 | $NFD_string = reorder(decompose($string)); # eq. to NFD() |
221 | $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() |
222 | |
223 | =item C<$reordered_string = reorder($string)> |
224 | |
9f1f04a1 |
225 | Reorders the combining characters and the like in the canonical ordering |
8f118dcd |
226 | and returns the result. |
227 | |
228 | E.g., when you have a list of NFD/NFKD strings, |
229 | you can get the concatenated NFD/NFKD string from them, saying |
230 | |
231 | $concat_NFD = reorder(join '', @NFD_strings); |
232 | $concat_NFKD = reorder(join '', @NFKD_strings); |
233 | |
234 | =item C<$composed_string = compose($string)> |
235 | |
236 | Returns the string where composable pairs are composed. |
237 | |
238 | E.g., when you have a NFD/NFKD string, |
239 | you can get its NFC/NFKC string, saying |
240 | |
241 | $NFC_string = compose($NFD_string); |
242 | $NFKC_string = compose($NFKD_string); |
243 | |
244 | =back |
245 | |
246 | =head2 Quick Check |
247 | |
82e740b6 |
248 | (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>) |
8f118dcd |
249 | |
250 | The following functions check whether the string is in that normalization form. |
251 | |
252 | The result returned will be: |
253 | |
254 | YES The string is in that normalization form. |
255 | NO The string is not in that normalization form. |
256 | MAYBE Dubious. Maybe yes, maybe no. |
257 | |
258 | =over 4 |
259 | |
260 | =item C<$result = checkNFD($string)> |
261 | |
628bbff0 |
262 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
8f118dcd |
263 | |
264 | =item C<$result = checkNFC($string)> |
265 | |
628bbff0 |
266 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
267 | C<undef> if C<MAYBE>. |
8f118dcd |
268 | |
269 | =item C<$result = checkNFKD($string)> |
270 | |
628bbff0 |
271 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
8f118dcd |
272 | |
273 | =item C<$result = checkNFKC($string)> |
274 | |
628bbff0 |
275 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
276 | C<undef> if C<MAYBE>. |
8f118dcd |
277 | |
82e740b6 |
278 | =item C<$result = checkFCD($string)> |
279 | |
628bbff0 |
280 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
82e740b6 |
281 | |
282 | =item C<$result = checkFCC($string)> |
283 | |
628bbff0 |
284 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
285 | C<undef> if C<MAYBE>. |
82e740b6 |
286 | |
e524f5b2 |
287 | If a string is not in FCD, it cannot be in FCC. |
82e740b6 |
288 | So C<checkFCC($not_FCD_string)> should return C<NO>. |
289 | |
8f118dcd |
290 | =item C<$result = check($form_name, $string)> |
291 | |
628bbff0 |
292 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
293 | C<undef> if C<MAYBE>. |
8f118dcd |
294 | |
628bbff0 |
295 | As C<$form_name>, one of the following names must be given. |
296 | |
297 | 'C' or 'NFC' for Normalization Form C (UAX #15) |
298 | 'D' or 'NFD' for Normalization Form D (UAX #15) |
299 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) |
300 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) |
301 | |
302 | 'FCD' for "Fast C or D" Form (UTN #5) |
303 | 'FCC' for "Fast C Contiguous" (UTN #5) |
8f118dcd |
304 | |
305 | =back |
306 | |
307 | B<Note> |
308 | |
82e740b6 |
309 | In the cases of NFD, NFKD, and FCD, the answer must be |
310 | either C<YES> or C<NO>. The answer C<MAYBE> may be returned |
311 | in the cases of NFC, NFKC, and FCC. |
8f118dcd |
312 | |
82e740b6 |
313 | A C<MAYBE> string should contain at least one combining character |
314 | or the like. For example, C<COMBINING ACUTE ACCENT> has |
8f118dcd |
315 | the MAYBE_NFC/MAYBE_NFKC property. |
82e740b6 |
316 | |
8f118dcd |
317 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> |
318 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. |
f027f502 |
319 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC |
8f118dcd |
320 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), |
321 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. |
322 | |
628bbff0 |
323 | If you want to check exactly, compare the string with its NFC/NFKC/FCC. |
324 | |
325 | if ($string eq NFC($string)) { |
326 | # $string is exactly normalized in NFC; |
327 | } else { |
328 | # $string is not normalized in NFC; |
329 | } |
8f118dcd |
330 | |
628bbff0 |
331 | if ($string eq NFKC($string)) { |
332 | # $string is exactly normalized in NFKC; |
333 | } else { |
334 | # $string is not normalized in NFKC; |
335 | } |
8f118dcd |
336 | |
2a204b45 |
337 | =head2 Character Data |
338 | |
339 | These functions are the interface to the character data used internally. |
d0ed0342 |
340 | If you only want to get Unicode normalization forms, you don't need |
341 | to call them yourself. |
2a204b45 |
342 | |
343 | =over 4 |
344 | |
345 | =item C<$canonical_decomposed = getCanon($codepoint)> |
346 | |
8f118dcd |
347 | If the character of the specified codepoint is canonically |
348 | decomposable (including Hangul Syllables), |
349 | returns the B<completely decomposed> string canonically equivalent to it. |
350 | |
f027f502 |
351 | If it is not decomposable, returns C<undef>. |
8f118dcd |
352 | |
2a204b45 |
353 | =item C<$compatibility_decomposed = getCompat($codepoint)> |
354 | |
8f118dcd |
355 | If the character of the specified codepoint is compatibility |
356 | decomposable (including Hangul Syllables), |
357 | returns the B<completely decomposed> string compatibility equivalent to it. |
2a204b45 |
358 | |
f027f502 |
359 | If it is not decomposable, returns C<undef>. |
2a204b45 |
360 | |
8f118dcd |
361 | =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)> |
2a204b45 |
362 | |
d85850a7 |
363 | If two characters here and next (as codepoints) are composable |
8f118dcd |
364 | (including Hangul Jamo/Syllables and Composition Exclusions), |
2a204b45 |
365 | returns the codepoint of the composite. |
366 | |
f027f502 |
367 | If they are not composable, returns C<undef>. |
2a204b45 |
368 | |
369 | =item C<$combining_class = getCombinClass($codepoint)> |
370 | |
8f118dcd |
371 | Returns the combining class of the character as an integer. |
2a204b45 |
372 | |
373 | =item C<$is_exclusion = isExclusion($codepoint)> |
374 | |
8f118dcd |
375 | Returns a boolean indicating whether the character of the specified codepoint |
376 | is a composition exclusion. |
377 | |
378 | =item C<$is_singleton = isSingleton($codepoint)> |
379 | |
2a204b45 |
380 | Returns a boolean indicating whether the character of the specified codepoint is |
8f118dcd |
381 | a singleton. |
382 | |
6c941e0c |
383 | =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)> |
8f118dcd |
384 | |
385 | Returns a boolean indicating whether the canonical decomposition |
386 | of the character of the specified codepoint |
387 | is a Non-Starter Decomposition. |
388 | |
389 | =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)> |
390 | |
391 | Returns a boolean indicating whether the character of the specified codepoint |
392 | may be composed with the previous one in a certain composition |
393 | (including Hangul Compositions, but excluding |
394 | Composition Exclusions and Non-Starter Decompositions). |
2a204b45 |
395 | |
396 | =back |
397 | |
628bbff0 |
398 | =head1 EXPORT |
2a204b45 |
399 | |
400 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. |
401 | |
402 | C<normalize> and some other functions: on request. |
403 | |
628bbff0 |
404 | =head1 CAVEATS |
405 | |
406 | =over 4 |
407 | |
408 | =item Perl's version vs. Unicode version |
409 | |
410 | Since this module refers to perl core's Unicode database in the directory |
411 | F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of |
412 | normalization implemented by this module depends on your perl's version. |
413 | |
414 | perl's version implemented Unicode version |
415 | 5.6.1 3.0.1 |
416 | 5.7.2 3.1.0 |
417 | 5.7.3 3.1.1 (same normalized form as that of 3.1.0) |
418 | 5.8.0 3.2.0 |
419 | 5.8.1-5.8.3 4.0.0 |
420 | 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0) |
421 | |
422 | =item Correction of decomposition mapping |
423 | |
424 | In older Unicode versions, a small number of characters (all of which are |
425 | CJK compatibility ideographs as far as they have been found) may have |
426 | an erroneous decomposition mapping (see F<NormalizationCorrections.txt>). |
427 | Anyhow, this module will neither refer to F<NormalizationCorrections.txt> |
428 | nor provide any specific version of normalization. Therefore this module |
429 | running on an older perl with an older Unicode database may use |
430 | the erroneous decomposition mapping blindly conforming to the Unicode database. |
431 | |
432 | =item Revised definition of canonical composition |
433 | |
434 | In Unicode 4.1.0, the definition D2 of canonical composition (which |
435 | affects NFC and NFKC) has been changed (see Public Review Issue #29 |
436 | and recent UAX #15). This module has used the newer definition |
437 | since the version 0.07 (Oct 31, 2001). |
438 | This module does not support normalization according to the older |
439 | definition, even if the Unicode version implemented by perl is |
440 | lower than 4.1.0. |
441 | |
442 | =back |
443 | |
2a204b45 |
444 | =head1 AUTHOR |
445 | |
a092bcfd |
446 | SADAHIRO Tomoyuki <SADAHIRO@cpan.org> |
2a204b45 |
447 | |
628bbff0 |
448 | Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved. |
2a204b45 |
449 | |
628bbff0 |
450 | This module is free software; you can redistribute it |
451 | and/or modify it under the same terms as Perl itself. |
2a204b45 |
452 | |
453 | =head1 SEE ALSO |
454 | |
455 | =over 4 |
456 | |
e524f5b2 |
457 | =item http://www.unicode.org/reports/tr15/ |
2a204b45 |
458 | |
459 | Unicode Normalization Forms - UAX #15 |
460 | |
14e6b36c |
461 | =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt |
8f118dcd |
462 | |
463 | Derived Normalization Properties |
464 | |
628bbff0 |
465 | =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt |
466 | |
467 | Normalization Corrections |
468 | |
469 | =item http://www.unicode.org/review/pr-29.html |
470 | |
471 | Public Review Issue #29: Normalization Issue |
472 | |
82e740b6 |
473 | =item http://www.unicode.org/notes/tn5/ |
474 | |
475 | Canonical Equivalence in Applications - UTN #5 |
476 | |
2a204b45 |
477 | =back |
478 | |
479 | =cut |