Update Unicode::Normalize to 0.28
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
package Unicode::Normalize;

# Refuse to load when pack('U', ...) does not produce the character
# for the given code point, since the module depends on that conversion.
BEGIN {
    unless ("A" eq pack('U', 0x41)) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;

# Warnings of the 'utf8' category are switched off for the rest of the file.
no warnings 'utf8';

our $VERSION = '0.28';
our $PACKAGE = __PACKAGE__;    # used as the prefix of croak() messages

require Exporter;
require DynaLoader;

our @ISA = qw(Exporter DynaLoader);

# Exported by default: the four normalization forms.
our @EXPORT = qw( NFC NFD NFKC NFKD );

# Exported on request.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous
    splitOnLastStarter
);
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

######

# Load the compiled part of the module through DynaLoader.
# NOTE(review): the functions called but not defined in this file
# (decompose, reorder, compose, check*, get*, is*, ...) presumably come
# from that compiled part -- confirm against the XS source.
bootstrap Unicode::Normalize $VERSION;

######

# pack_U(@codepoints)
# Returns the string made of the characters with the given code points,
# in the order given.
sub pack_U {
    return pack('U*', @_);
}

# unpack_U($string)
# Returns the list of code points of the characters of $string.
sub unpack_U {
    # pack('U*') with no values yields an empty string; it is concatenated
    # in front of the argument before unpacking.
    # NOTE(review): presumably this forces the argument to be handled as
    # a character (UTF-8 flagged) string -- confirm before simplifying.
    return unpack('U*', pack('U*').shift);
}


##
## normalization forms
##

# Passed as the (true) second argument of decompose() to select the
# compatibility decomposition instead of the canonical one.
use constant COMPAT => 1;

# Each form takes one string and returns the normalized string:
#   NFD  - canonical decomposition, then reordering
#   NFKD - compatibility decomposition, then reordering
#   NFC  - NFD followed by composition
#   NFKC - NFKD followed by composition
sub NFD  ($) { reorder(decompose($_[0])) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
sub NFC  ($) { compose(reorder(decompose($_[0]))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }

# FCD($string)
# Returns $string itself when checkFCD() reports it is already in FCD;
# otherwise returns its NFD.
sub FCD ($) {
    my $str = shift;
    return checkFCD($str) ? $str : NFD($str);
}

# FCC($string)
# Canonical decomposition and reordering followed by contiguous composition.
sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) }

# Dispatch table from a form name (long or short spelling) to the
# function producing that form.
our %formNorm = (
    NFC  => \&NFC,      C  => \&NFC,
    NFD  => \&NFD,      D  => \&NFD,
    NFKC => \&NFKC,     KC => \&NFKC,
    NFKD => \&NFKD,     KD => \&NFKD,
    FCD  => \&FCD,      FCC => \&FCC,
);

# normalize($form_name, $string)
# Returns $string converted to the form named by $form_name, which must
# be a key of %formNorm; croaks for any other name.
sub normalize($$)
{
    my $form = shift;
    my $str  = shift;
    return exists $formNorm{$form}
        ? $formNorm{$form}->($str)
        : croak $PACKAGE."::normalize: invalid form name: $form";
}


##
## quick check
##

# Dispatch table from a form name (long or short spelling) to the
# quick-check function for that form.
our %formCheck = (
    NFC  => \&checkNFC,     C  => \&checkNFC,
    NFD  => \&checkNFD,     D  => \&checkNFD,
    NFKC => \&checkNFKC,    KC => \&checkNFKC,
    NFKD => \&checkNFKD,    KD => \&checkNFKD,
    FCD  => \&checkFCD,     FCC => \&checkFCC,
);

# check($form_name, $string)
# Returns the result of the quick-check function registered for
# $form_name in %formCheck; croaks when the name is not registered.
sub check($$)
{
    my $form = shift;
    my $str  = shift;
    return exists $formCheck{$form}
        ? $formCheck{$form}->($str)
        : croak $PACKAGE."::check: invalid form name: $form";
}

# A module file must end by returning a true value.
1;
__END__
2a204b45 113
114=head1 NAME
115
f027f502 116Unicode::Normalize - Unicode Normalization Forms
2a204b45 117
118=head1 SYNOPSIS
119
120 use Unicode::Normalize;
121
8f118dcd 122 $NFD_string = NFD($string); # Normalization Form D
123 $NFC_string = NFC($string); # Normalization Form C
124 $NFKD_string = NFKD($string); # Normalization Form KD
125 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 126
127 or
128
129 use Unicode::Normalize 'normalize';
130
8f118dcd 131 $NFD_string = normalize('D', $string); # Normalization Form D
132 $NFC_string = normalize('C', $string); # Normalization Form C
133 $NFKD_string = normalize('KD', $string); # Normalization Form KD
134 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 135
136=head1 DESCRIPTION
137
00f2676f 138Parameters:
139
140C<$string> is used as a string under character semantics
141(see F<perlunicode>).
142
143C<$codepoint> should be an unsigned integer
144representing a Unicode code point.
145
Note: the XS edition and the pure Perl edition interpret a C<$codepoint>
given as a decimal number differently.
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating point number or a negative sign in C<$codepoint>.
150
d85850a7 151=head2 Normalization Forms
2a204b45 152
153=over 4
154
8f118dcd 155=item C<$NFD_string = NFD($string)>
2a204b45 156
157returns the Normalization Form D (formed by canonical decomposition).
158
8f118dcd 159=item C<$NFC_string = NFC($string)>
2a204b45 160
161returns the Normalization Form C (formed by canonical decomposition
162followed by canonical composition).
163
8f118dcd 164=item C<$NFKD_string = NFKD($string)>
2a204b45 165
166returns the Normalization Form KD (formed by compatibility decomposition).
167
8f118dcd 168=item C<$NFKC_string = NFKC($string)>
2a204b45 169
170returns the Normalization Form KC (formed by compatibility decomposition
171followed by B<canonical> composition).
172
82e740b6 173=item C<$FCD_string = FCD($string)>
174
175If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
176returns it without modification; otherwise returns an FCD string.
177
Note: the FCD form of a string is not always unique, so several different
forms may be equivalent to each other. C<FCD()> will return one of these
equivalent forms.
180
181=item C<$FCC_string = FCC($string)>
182
183returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
184
Note: FCC is unique, as are the four normalization forms (NF*).
82e740b6 186
8f118dcd 187=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 188
189As C<$form_name>, one of the following names must be given.
190
82e740b6 191 'C' or 'NFC' for Normalization Form C (UAX #15)
192 'D' or 'NFD' for Normalization Form D (UAX #15)
193 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
194 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
195
196 'FCD' for "Fast C or D" Form (UTN #5)
197 'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45 198
199=back
200
8f118dcd 201=head2 Decomposition and Composition
202
203=over 4
204
205=item C<$decomposed_string = decompose($string)>
206
207=item C<$decomposed_string = decompose($string, $useCompatMapping)>
208
9f1f04a1 209Decomposes the specified string and returns the result.
8f118dcd 210
211If the second parameter (a boolean) is omitted or false, decomposes it
212using the Canonical Decomposition Mapping.
213If true, decomposes it using the Compatibility Decomposition Mapping.
214
215The string returned is not always in NFD/NFKD.
216Reordering may be required.
217
218 $NFD_string = reorder(decompose($string)); # eq. to NFD()
219 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
220
221=item C<$reordered_string = reorder($string)>
222
9f1f04a1 223Reorders the combining characters and the like in the canonical ordering
8f118dcd 224and returns the result.
225
226E.g., when you have a list of NFD/NFKD strings,
227you can get the concatenated NFD/NFKD string from them, saying
228
229 $concat_NFD = reorder(join '', @NFD_strings);
230 $concat_NFKD = reorder(join '', @NFKD_strings);
231
232=item C<$composed_string = compose($string)>
233
234Returns the string where composable pairs are composed.
235
236E.g., when you have a NFD/NFKD string,
237you can get its NFC/NFKC string, saying
238
239 $NFC_string = compose($NFD_string);
240 $NFKC_string = compose($NFKD_string);
241
242=back
243
244=head2 Quick Check
245
82e740b6 246(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd 247
248The following functions check whether the string is in that normalization form.
249
250The result returned will be:
251
252 YES The string is in that normalization form.
253 NO The string is not in that normalization form.
254 MAYBE Dubious. Maybe yes, maybe no.
255
256=over 4
257
258=item C<$result = checkNFD($string)>
259
f027f502 260returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 261
262=item C<$result = checkNFC($string)>
263
f027f502 264returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 265
266=item C<$result = checkNFKD($string)>
267
f027f502 268returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 269
270=item C<$result = checkNFKC($string)>
271
f027f502 272returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 273
82e740b6 274=item C<$result = checkFCD($string)>
275
276returns C<YES> (C<1>) or C<NO> (C<empty string>).
277
278=item C<$result = checkFCC($string)>
279
280returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
281
If a string is not in FCD, it cannot be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.
284
8f118dcd 285=item C<$result = check($form_name, $string)>
286
f027f502 287returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 288
C<$form_name> takes the same names as for C<normalize()>.
290
291=back
292
293B<Note>
294
82e740b6 295In the cases of NFD, NFKD, and FCD, the answer must be
296either C<YES> or C<NO>. The answer C<MAYBE> may be returned
297in the cases of NFC, NFKC, and FCC.
8f118dcd 298
82e740b6 299A C<MAYBE> string should contain at least one combining character
300or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd 301the MAYBE_NFC/MAYBE_NFKC property.
82e740b6 302
8f118dcd 303Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
304and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 305C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd 306(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
307while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
308
82e740b6 309If you want to check exactly, compare the string with its NFC/NFKC/FCC;
310i.e.,
8f118dcd 311
    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)
    $string eq FCC($string)    # more thorough than checkFCC($string)
8f118dcd 315
2a204b45 316=head2 Character Data
317
These functions are the interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.
2a204b45 321
322=over 4
323
324=item C<$canonical_decomposed = getCanon($codepoint)>
325
8f118dcd 326If the character of the specified codepoint is canonically
327decomposable (including Hangul Syllables),
328returns the B<completely decomposed> string canonically equivalent to it.
329
f027f502 330If it is not decomposable, returns C<undef>.
8f118dcd 331
2a204b45 332=item C<$compatibility_decomposed = getCompat($codepoint)>
333
8f118dcd 334If the character of the specified codepoint is compatibility
335decomposable (including Hangul Syllables),
336returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45 337
f027f502 338If it is not decomposable, returns C<undef>.
2a204b45 339
8f118dcd 340=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45 341
d85850a7 342If two characters here and next (as codepoints) are composable
8f118dcd 343(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45 344returns the codepoint of the composite.
345
f027f502 346If they are not composable, returns C<undef>.
2a204b45 347
348=item C<$combining_class = getCombinClass($codepoint)>
349
8f118dcd 350Returns the combining class of the character as an integer.
2a204b45 351
352=item C<$is_exclusion = isExclusion($codepoint)>
353
8f118dcd 354Returns a boolean whether the character of the specified codepoint
355is a composition exclusion.
356
357=item C<$is_singleton = isSingleton($codepoint)>
358
2a204b45 359Returns a boolean whether the character of the specified codepoint is
8f118dcd 360a singleton.
361
6c941e0c 362=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd 363
364Returns a boolean whether the canonical decomposition
365of the character of the specified codepoint
366is a Non-Starter Decomposition.
367
368=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
369
370Returns a boolean whether the character of the specified codepoint
371may be composed with the previous one in a certain composition
372(including Hangul Compositions, but excluding
373Composition Exclusions and Non-Starter Decompositions).
2a204b45 374
375=back
376
377=head2 EXPORT
378
379C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
380
C<normalize> and some other functions: on request.
382
383=head1 AUTHOR
384
82e740b6 385SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
2a204b45 386
387 http://homepage1.nifty.com/nomenclator/perl/
388
6c941e0c 389 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 390
6c941e0c 391 This module is free software; you can redistribute it
392 and/or modify it under the same terms as Perl itself.
2a204b45 393
394=head1 SEE ALSO
395
396=over 4
397
e524f5b2 398=item http://www.unicode.org/reports/tr15/
2a204b45 399
400Unicode Normalization Forms - UAX #15
401
14e6b36c 402=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd 403
404Derived Normalization Properties
405
82e740b6 406=item http://www.unicode.org/notes/tn5/
407
408Canonical Equivalence in Applications - UTN #5
409
2a204b45 410=back
411
412=cut
413