Upgrade to Unicode::Normalize 0.21 and Unicode::Collate 0.24,
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
# Compile-time guard: refuse to load unless pack('U', ...) turns either the
# code point 0x41 or the native ordinal of "A" into the string "A".
BEGIN {
    my $unicode_pack_ok = "A" eq pack('U', 0x41);
    my $native_pack_ok  = "A" eq pack('U', ord("A"));

    if (!$unicode_pack_ok && !$native_pack_ok) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
8
ac5ea531 9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
9f1f04a1 14our $VERSION = '0.21';
ac5ea531 15our $PACKAGE = __PACKAGE__;
16
17require Exporter;
18require DynaLoader;
19require AutoLoader;
20
21our @ISA = qw(Exporter DynaLoader);
# Exported by default: the four normalization-form functions.
our @EXPORT = qw(NFC NFD NFKC NFKD);

# Exported only on request, grouped by purpose.
our @EXPORT_OK = (
    # generic entry point and the individual normalization steps
    qw(normalize decompose reorder compose),
    # quick-check functions
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    # character data accessors
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
);

# Import tags: ':all', ':normalize' and ':check'.
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
);
ac5ea531 35
# Load the compiled (XS) part of the module; bootstrap() is inherited from
# DynaLoader through @ISA and is passed the module version.
Unicode::Normalize->bootstrap($VERSION);
37
# Capability flags, computed once when the module is compiled.
#
# *_FOR_PACK:   does pack('U', N) yield "A" when N is the Unicode code point
#               0x41, or when N is the native ordinal of "A"?
use constant UNICODE_FOR_PACK => ("A" eq pack('U', 0x41));
use constant NATIVE_FOR_PACK  => ("A" eq pack('U', ord("A")));

# *_FOR_UNPACK: does unpack('U', "A") yield the Unicode code point 0x41, or
#               the native ordinal of "A"?
use constant UNICODE_FOR_UNPACK => (0x41 == unpack('U', "A"));
use constant NATIVE_FOR_UNPACK  => (ord("A") == unpack('U', "A"));
43
# pack_U(@codepoints)
#
# Packs a list of Unicode code points with the 'U*' template and returns
# the result.  When pack('U') works on native ordinals rather than Unicode
# code points, each value is first mapped with utf8::unicode_to_native().
# Dies if neither mode is available.
sub pack_U {
    if (UNICODE_FOR_PACK) {
        return pack('U*', @_);
    }
    if (NATIVE_FOR_PACK) {
        return pack('U*', map utf8::unicode_to_native($_), @_);
    }
    die "$PACKAGE, a Unicode code point cannot be stringified.\n";
}
51
# unpack_U($string)
#
# Unpacks $string with the 'U*' template and returns the values as Unicode
# code points.  When unpack('U') yields native ordinals, each value is
# mapped with utf8::native_to_unicode().  Dies if neither mode is available.
sub unpack_U {
    if (UNICODE_FOR_UNPACK) {
        return unpack('U*', shift);
    }
    if (NATIVE_FOR_UNPACK) {
        return map(utf8::native_to_unicode($_), unpack 'U*', shift);
    }
    die "$PACKAGE, a code point returned from unpack U " .
        "cannot be converted into Unicode.\n";
}
60
# Second argument to decompose() that selects the compatibility mapping.
use constant COMPAT => 1;

# Normalization Form D: decompose (canonical mapping), then reorder.
sub NFD ($) {
    return reorder(decompose($_[0]));
}

# Normalization Form KD: decompose (compatibility mapping), then reorder.
sub NFKD ($) {
    return reorder(decompose($_[0], COMPAT));
}

# Normalization Form C: as NFD, followed by composition.
sub NFC ($) {
    return compose(reorder(decompose($_[0])));
}

# Normalization Form KC: as NFKD, followed by composition.
sub NFKC ($) {
    return compose(reorder(decompose($_[0], COMPAT)));
}
67
# normalize($form_name, $string)
#
# Returns $string converted to the normalization form named by $form_name.
# $form_name is one of 'C', 'D', 'KC' or 'KD', optionally prefixed with
# 'NF' (e.g. 'NFC').  Croaks on any other form name.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Strip the optional 'NF' prefix from a copy only, so that the error
    # message reports the name exactly as the caller passed it (stripping
    # in place would report 'NFX' as 'X' and 'NF' as an empty name).
    (my $key = $form) =~ s/^NF//;

    return
        $key eq 'D'  ? NFD ($str) :
        $key eq 'C'  ? NFC ($str) :
        $key eq 'KD' ? NFKD($str) :
        $key eq 'KC' ? NFKC($str) :
        croak $PACKAGE."::normalize: invalid form name: $form";
}
80
# check($form_name, $string)
#
# Runs the quick check for the normalization form named by $form_name on
# $string and returns the result of the matching checkNF* function.
# $form_name is one of 'C', 'D', 'KC' or 'KD', optionally prefixed with
# 'NF' (e.g. 'NFC').  Croaks on any other form name.
sub check($$)
{
    my ($form, $str) = @_;

    # Strip the optional 'NF' prefix from a copy only, so that the error
    # message reports the name exactly as the caller passed it (stripping
    # in place would report 'NFX' as 'X' and 'NF' as an empty name).
    (my $key = $form) =~ s/^NF//;

    return
        $key eq 'D'  ? checkNFD ($str) :
        $key eq 'C'  ? checkNFC ($str) :
        $key eq 'KD' ? checkNFKD($str) :
        $key eq 'KC' ? checkNFKC($str) :
        croak $PACKAGE."::check: invalid form name: $form";
}
93
ac5ea531 941;
95__END__
2a204b45 96
97=head1 NAME
98
f027f502 99Unicode::Normalize - Unicode Normalization Forms
2a204b45 100
101=head1 SYNOPSIS
102
103 use Unicode::Normalize;
104
8f118dcd 105 $NFD_string = NFD($string); # Normalization Form D
106 $NFC_string = NFC($string); # Normalization Form C
107 $NFKD_string = NFKD($string); # Normalization Form KD
108 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 109
110 or
111
112 use Unicode::Normalize 'normalize';
113
8f118dcd 114 $NFD_string = normalize('D', $string); # Normalization Form D
115 $NFC_string = normalize('C', $string); # Normalization Form C
116 $NFKD_string = normalize('KD', $string); # Normalization Form KD
117 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 118
119=head1 DESCRIPTION
120
d85850a7 121=head2 Normalization Forms
2a204b45 122
123=over 4
124
8f118dcd 125=item C<$NFD_string = NFD($string)>
2a204b45 126
127returns the Normalization Form D (formed by canonical decomposition).
128
8f118dcd 129=item C<$NFC_string = NFC($string)>
2a204b45 130
131returns the Normalization Form C (formed by canonical decomposition
132followed by canonical composition).
133
8f118dcd 134=item C<$NFKD_string = NFKD($string)>
2a204b45 135
136returns the Normalization Form KD (formed by compatibility decomposition).
137
8f118dcd 138=item C<$NFKC_string = NFKC($string)>
2a204b45 139
140returns the Normalization Form KC (formed by compatibility decomposition
141followed by B<canonical> composition).
142
8f118dcd 143=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 144
145As C<$form_name>, one of the following names must be given.
146
147 'C' or 'NFC' for Normalization Form C
148 'D' or 'NFD' for Normalization Form D
149 'KC' or 'NFKC' for Normalization Form KC
150 'KD' or 'NFKD' for Normalization Form KD
151
152=back
153
8f118dcd 154=head2 Decomposition and Composition
155
156=over 4
157
158=item C<$decomposed_string = decompose($string)>
159
160=item C<$decomposed_string = decompose($string, $useCompatMapping)>
161
9f1f04a1 162Decomposes the specified string and returns the result.
8f118dcd 163
164If the second parameter (a boolean) is omitted or false, decomposes it
165using the Canonical Decomposition Mapping.
166If true, decomposes it using the Compatibility Decomposition Mapping.
167
168The string returned is not always in NFD/NFKD.
169Reordering may be required.
170
171 $NFD_string = reorder(decompose($string)); # eq. to NFD()
172 $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
173
174=item C<$reordered_string = reorder($string)>
175
9f1f04a1 176Reorders the combining characters and the like in the canonical ordering
8f118dcd 177and returns the result.
178
179E.g., when you have a list of NFD/NFKD strings,
180you can get the concatenated NFD/NFKD string from them, saying
181
182 $concat_NFD = reorder(join '', @NFD_strings);
183 $concat_NFKD = reorder(join '', @NFKD_strings);
184
185=item C<$composed_string = compose($string)>
186
187Returns the string where composable pairs are composed.
188
189E.g., when you have a NFD/NFKD string,
190you can get its NFC/NFKC string, saying
191
192 $NFC_string = compose($NFD_string);
193 $NFKC_string = compose($NFKD_string);
194
195=back
196
197=head2 Quick Check
198
6c941e0c 199(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
8f118dcd 200
201The following functions check whether the string is in that normalization form.
202
203The result returned will be:
204
205 YES The string is in that normalization form.
206 NO The string is not in that normalization form.
207 MAYBE Dubious. Maybe yes, maybe no.
208
209=over 4
210
211=item C<$result = checkNFD($string)>
212
f027f502 213returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 214
215=item C<$result = checkNFC($string)>
216
f027f502 217returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 218
219=item C<$result = checkNFKD($string)>
220
f027f502 221returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 222
223=item C<$result = checkNFKC($string)>
224
f027f502 225returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 226
227=item C<$result = check($form_name, $string)>
228
f027f502 229returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 230
231C<$form_name> is alike to that for C<normalize()>.
232
233=back
234
235B<Note>
236
237In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
238The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
239
240A MAYBE-NFC/NFKC string should contain at least
241one combining character or the like.
242For example, C<COMBINING ACUTE ACCENT> has
243the MAYBE_NFC/MAYBE_NFKC property.
244Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
245and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 246C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd 247(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
248while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
249
250If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
251
252 $string eq NFC($string) # more thorough than checkNFC($string)
253 $string eq NFKC($string) # more thorough than checkNFKC($string)
254
2a204b45 255=head2 Character Data
256
257These functions are an interface to the character data used internally.
d0ed0342 258If you only want to get Unicode normalization forms, you don't need
259to call them yourself.
2a204b45 260
261=over 4
262
263=item C<$canonical_decomposed = getCanon($codepoint)>
264
8f118dcd 265If the character of the specified codepoint is canonically
266decomposable (including Hangul Syllables),
267returns the B<completely decomposed> string canonically equivalent to it.
268
f027f502 269If it is not decomposable, returns C<undef>.
8f118dcd 270
2a204b45 271=item C<$compatibility_decomposed = getCompat($codepoint)>
272
8f118dcd 273If the character of the specified codepoint is compatibility
274decomposable (including Hangul Syllables),
275returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45 276
f027f502 277If it is not decomposable, returns C<undef>.
2a204b45 278
8f118dcd 279=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45 280
d85850a7 281If two characters here and next (as codepoints) are composable
8f118dcd 282(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45 283returns the codepoint of the composite.
284
f027f502 285If they are not composable, returns C<undef>.
2a204b45 286
287=item C<$combining_class = getCombinClass($codepoint)>
288
8f118dcd 289Returns the combining class of the character as an integer.
2a204b45 290
291=item C<$is_exclusion = isExclusion($codepoint)>
292
8f118dcd 293Returns a boolean whether the character of the specified codepoint
294is a composition exclusion.
295
296=item C<$is_singleton = isSingleton($codepoint)>
297
2a204b45 298Returns a boolean whether the character of the specified codepoint is
8f118dcd 299a singleton.
300
6c941e0c 301=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd 302
303Returns a boolean whether the canonical decomposition
304of the character of the specified codepoint
305is a Non-Starter Decomposition.
306
307=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
308
309Returns a boolean whether the character of the specified codepoint
310may be composed with the previous one in a certain composition
311(including Hangul Compositions, but excluding
312Composition Exclusions and Non-Starter Decompositions).
2a204b45 313
314=back
315
316=head2 EXPORT
317
318C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
319
320C<normalize> and some other functions: on request.
321
322=head1 AUTHOR
323
324SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
325
326 http://homepage1.nifty.com/nomenclator/perl/
327
6c941e0c 328 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 329
6c941e0c 330 This module is free software; you can redistribute it
331 and/or modify it under the same terms as Perl itself.
2a204b45 332
333=head1 SEE ALSO
334
335=over 4
336
337=item http://www.unicode.org/unicode/reports/tr15/
338
339Unicode Normalization Forms - UAX #15
340
14e6b36c 341=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd 342
343Derived Normalization Properties
344
2a204b45 345=back
346
347=cut
348