Possible fix for the problems in EBCDIC from Sadahiro Tomoyuki.
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
BEGIN {
    # Compile-time sanity check: the letter "A" must be obtainable from
    # pack('U', ...) using either the literal code point 0x41 or the
    # platform's own ord("A").  If neither round-trips, refuse to load.
    my $from_literal = pack('U', 0x41);
    my $from_native  = pack('U', ord("A"));

    if ("A" ne $from_literal && "A" ne $from_native) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
8
ac5ea531 9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
# Module version; also passed to bootstrap() at the end of this section.
our $VERSION = '0.21';

# Package name, interpolated into the messages croaked by normalize()
# and check().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;
require AutoLoader;

our @ISA = qw(Exporter DynaLoader);

# The four normalization-form functions are exported by default.
our @EXPORT = qw(NFC NFD NFKC NFKD);

# Everything else is exported only on request.
our @EXPORT_OK = (
    qw(normalize decompose reorder compose),
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
);

# Export tags: ':all', ':normalize' and ':check'.
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
);

# Method-call spelling of "bootstrap Unicode::Normalize $VERSION";
# bootstrap() is inherited through @ISA.
Unicode::Normalize->bootstrap($VERSION);
37
# pack_U(@codepoints)
# Returns the string built by packing every argument with the 'U*' template.
sub pack_U {
    my @codepoints = @_;
    return pack 'U*', @codepoints;
}
41
# unpack_U($string)
# Returns the values obtained by unpacking the first argument with the
# 'U*' template.
sub unpack_U {
    # An empty 'U*' pack is prepended to the argument before unpacking.
    # NOTE(review): presumably this forces character (UTF-8) semantics
    # onto the string -- confirm against the pack/unpack documentation.
    my $empty_u = pack 'U*';
    return unpack 'U*', $empty_u . shift(@_);
}
45
# Passed as the second argument of decompose() to request the
# compatibility decomposition mapping instead of the canonical one.
use constant COMPAT => 1;

# NFD($string): canonical decomposition followed by reordering.
sub NFD ($) {
    return reorder(decompose($_[0]));
}

# NFKD($string): compatibility decomposition followed by reordering.
sub NFKD ($) {
    return reorder(decompose($_[0], COMPAT));
}

# NFC($string): canonical decomposition, reordering, then composition.
sub NFC ($) {
    return compose(reorder(decompose($_[0])));
}

# NFKC($string): compatibility decomposition, reordering, then composition.
sub NFKC ($) {
    return compose(reorder(decompose($_[0], COMPAT)));
}
52
# normalize($form_name, $string)
# Returns $string converted to the normalization form named by $form_name:
# 'D', 'C', 'KD' or 'KC', each optionally preceded by 'NF'.
# Croaks on any other name; the message shows the name after a leading
# 'NF' has been removed.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Accept both the short ('C') and the long ('NFC') spelling.
    $form =~ s/^NF//;

    return NFD ($str) if $form eq 'D';
    return NFC ($str) if $form eq 'C';
    return NFKD($str) if $form eq 'KD';
    return NFKC($str) if $form eq 'KC';

    croak $PACKAGE."::normalize: invalid form name: $form";
}
65
# check($form_name, $string)
# Runs the quick check for the normalization form named by $form_name
# ('D', 'C', 'KD' or 'KC', each optionally preceded by 'NF') on $string
# and returns that check's result.
# Croaks on any other name; the message shows the name after a leading
# 'NF' has been removed.
sub check($$)
{
    my ($form, $str) = @_;

    # Accept both the short ('C') and the long ('NFC') spelling.
    $form =~ s/^NF//;

    return checkNFD ($str) if $form eq 'D';
    return checkNFC ($str) if $form eq 'C';
    return checkNFKD($str) if $form eq 'KD';
    return checkNFKC($str) if $form eq 'KC';

    croak $PACKAGE."::check: invalid form name: $form";
}
78
ac5ea531 791;
80__END__
2a204b45 81
82=head1 NAME
83
f027f502 84Unicode::Normalize - Unicode Normalization Forms
2a204b45 85
86=head1 SYNOPSIS
87
88 use Unicode::Normalize;
89
8f118dcd 90 $NFD_string = NFD($string); # Normalization Form D
91 $NFC_string = NFC($string); # Normalization Form C
92 $NFKD_string = NFKD($string); # Normalization Form KD
93 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 94
95 or
96
97 use Unicode::Normalize 'normalize';
98
8f118dcd 99 $NFD_string = normalize('D', $string); # Normalization Form D
100 $NFC_string = normalize('C', $string); # Normalization Form C
101 $NFKD_string = normalize('KD', $string); # Normalization Form KD
102 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 103
104=head1 DESCRIPTION
105
d85850a7 106=head2 Normalization Forms
2a204b45 107
108=over 4
109
8f118dcd 110=item C<$NFD_string = NFD($string)>
2a204b45 111
112returns the Normalization Form D (formed by canonical decomposition).
113
8f118dcd 114=item C<$NFC_string = NFC($string)>
2a204b45 115
116returns the Normalization Form C (formed by canonical decomposition
117followed by canonical composition).
118
8f118dcd 119=item C<$NFKD_string = NFKD($string)>
2a204b45 120
121returns the Normalization Form KD (formed by compatibility decomposition).
122
8f118dcd 123=item C<$NFKC_string = NFKC($string)>
2a204b45 124
125returns the Normalization Form KC (formed by compatibility decomposition
126followed by B<canonical> composition).
127
8f118dcd 128=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 129
130As C<$form_name>, one of the following names must be given.
131
132 'C' or 'NFC' for Normalization Form C
133 'D' or 'NFD' for Normalization Form D
134 'KC' or 'NFKC' for Normalization Form KC
135 'KD' or 'NFKD' for Normalization Form KD
136
137=back
138
8f118dcd 139=head2 Decomposition and Composition
140
141=over 4
142
143=item C<$decomposed_string = decompose($string)>
144
145=item C<$decomposed_string = decompose($string, $useCompatMapping)>
146
9f1f04a1 147Decomposes the specified string and returns the result.
8f118dcd 148
149If the second parameter (a boolean) is omitted or false, decomposes it
150using the Canonical Decomposition Mapping.
151If true, decomposes it using the Compatibility Decomposition Mapping.
152
153The string returned is not always in NFD/NFKD.
154Reordering may be required.
155
156 $NFD_string = reorder(decompose($string)); # eq. to NFD()
157 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
158
159=item C<$reordered_string = reorder($string)>
160
9f1f04a1 161Reorders the combining characters and the like in the canonical ordering
8f118dcd 162and returns the result.
163
164E.g., when you have a list of NFD/NFKD strings,
165you can get the concatenated NFD/NFKD string from them, saying
166
167 $concat_NFD = reorder(join '', @NFD_strings);
168 $concat_NFKD = reorder(join '', @NFKD_strings);
169
170=item C<$composed_string = compose($string)>
171
172Returns the string where composable pairs are composed.
173
174E.g., when you have a NFD/NFKD string,
175you can get its NFC/NFKC string, saying
176
177 $NFC_string = compose($NFD_string);
178 $NFKC_string = compose($NFKD_string);
179
180=back
181
182=head2 Quick Check
183
6c941e0c 184(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
8f118dcd 185
186The following functions check whether the string is in that normalization form.
187
188The result returned will be:
189
190 YES The string is in that normalization form.
191 NO The string is not in that normalization form.
192 MAYBE Dubious. Maybe yes, maybe no.
193
194=over 4
195
196=item C<$result = checkNFD($string)>
197
f027f502 198returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 199
200=item C<$result = checkNFC($string)>
201
f027f502 202returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 203
204=item C<$result = checkNFKD($string)>
205
f027f502 206returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 207
208=item C<$result = checkNFKC($string)>
209
f027f502 210returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 211
212=item C<$result = check($form_name, $string)>
213
f027f502 214returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 215
216C<$form_name> is the same as that for C<normalize()>.
217
218=back
219
220B<Note>
221
222In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
223The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
224
225A MAYBE-NFC/NFKC string should contain at least
226one combining character or the like.
227For example, C<COMBINING ACUTE ACCENT> has
228the MAYBE_NFC/MAYBE_NFKC property.
229Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
230and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 231C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd 232(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
233while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
234
235If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
236
237 $string eq NFC($string) # more thorough than checkNFC($string)
238 $string eq NFKC($string) # more thorough than checkNFKC($string)
239
2a204b45 240=head2 Character Data
241
242These functions provide an interface to the character data used internally.
d0ed0342 243If you only want to get Unicode normalization forms, you don't need
244to call them yourself.
2a204b45 245
246=over 4
247
248=item C<$canonical_decomposed = getCanon($codepoint)>
249
8f118dcd 250If the character of the specified codepoint is canonically
251decomposable (including Hangul Syllables),
252returns the B<completely decomposed> string canonically equivalent to it.
253
f027f502 254If it is not decomposable, returns C<undef>.
8f118dcd 255
2a204b45 256=item C<$compatibility_decomposed = getCompat($codepoint)>
257
8f118dcd 258If the character of the specified codepoint is compatibility
259decomposable (including Hangul Syllables),
260returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45 261
f027f502 262If it is not decomposable, returns C<undef>.
2a204b45 263
8f118dcd 264=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45 265
d85850a7 266If two characters here and next (as codepoints) are composable
8f118dcd 267(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45 268returns the codepoint of the composite.
269
f027f502 270If they are not composable, returns C<undef>.
2a204b45 271
272=item C<$combining_class = getCombinClass($codepoint)>
273
8f118dcd 274Returns the combining class of the character as an integer.
2a204b45 275
276=item C<$is_exclusion = isExclusion($codepoint)>
277
8f118dcd 278Returns a boolean whether the character of the specified codepoint
279is a composition exclusion.
280
281=item C<$is_singleton = isSingleton($codepoint)>
282
2a204b45 283Returns a boolean whether the character of the specified codepoint is
8f118dcd 284a singleton.
285
6c941e0c 286=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd 287
288Returns a boolean whether the canonical decomposition
289of the character of the specified codepoint
290is a Non-Starter Decomposition.
291
292=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
293
294Returns a boolean whether the character of the specified codepoint
295may be composed with the previous one in a certain composition
296(including Hangul Compositions, but excluding
297Composition Exclusions and Non-Starter Decompositions).
2a204b45 298
299=back
300
301=head2 EXPORT
302
303C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
304
305C<normalize> and some other functions: on request.
306
307=head1 AUTHOR
308
309SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
310
311 http://homepage1.nifty.com/nomenclator/perl/
312
6c941e0c 313 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 314
6c941e0c 315 This module is free software; you can redistribute it
316 and/or modify it under the same terms as Perl itself.
2a204b45 317
318=head1 SEE ALSO
319
320=over 4
321
322=item http://www.unicode.org/unicode/reports/tr15/
323
324Unicode Normalization Forms - UAX #15
325
14e6b36c 326=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd 327
328Derived Normalization Properties
329
2a204b45 330=back
331
332=cut
333