Sync with Unicode::Collate 0.30
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
# Compile-time sanity check: refuse to load unless pack('U', ...) turns the
# code point 0x41 into the one-character string "A".
BEGIN {
    my $probe = pack('U', 0x41);
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
        if $probe ne "A";
}
8
ac5ea531 9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
00f2676f 14our $VERSION = '0.23';
ac5ea531 15our $PACKAGE = __PACKAGE__;
16
17require Exporter;
18require DynaLoader;
19require AutoLoader;
20
# Parent classes of this package.
our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization form functions.
our @EXPORT = qw/NFC NFD NFKC NFKD/;

# Exported only on request, grouped by purpose.
our @EXPORT_OK = (
    # generic normalization and its building blocks
    qw/normalize decompose reorder compose/,
    # quick checks
    qw/checkNFD checkNFKD checkNFC checkNFKC check/,
    # character data lookups
    qw/getCanon getCompat getComposite getCombinClass/,
    # character property predicates
    qw/isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex/,
    qw/isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE/,
);

# Tag groups usable as ":all", ":normalize" and ":check" on import.
our %EXPORT_TAGS = (
    'all'       => [ @EXPORT, @EXPORT_OK ],
    'normalize' => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    'check'     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
);
ac5ea531 35
36bootstrap Unicode::Normalize $VERSION;
37
# Builds a string from a list of code points.
#   @_      : code points (integers), one per resulting character
#   returns : the string produced by pack('U*', ...)
sub pack_U {
    my @codepoints = @_;
    return pack('U*', @codepoints);
}
41
# Splits a string into its code points.
#   $_[0]   : the string to take apart
#   returns : whatever unpack('U*', ...) yields for it (a list of integers
#             in list context)
sub unpack_U {
    my $text = shift;
    # The argument is appended to the result of an empty pack('U*') before
    # unpacking -- presumably to force character semantics on the string;
    # NOTE(review): confirm against the pack/unpack documentation.
    my $prefixed = pack('U*') . $text;
    return unpack('U*', $prefixed);
}
45
# Second argument to decompose() that selects the compatibility mapping.
use constant COMPAT => 1;

# Normalization Form D: canonical decomposition, then reordering.
sub NFD ($) {
    return reorder(decompose($_[0]));
}

# Normalization Form KD: compatibility decomposition, then reordering.
sub NFKD ($) {
    return reorder(decompose($_[0], COMPAT));
}

# Normalization Form C: the NFD pipeline followed by composition.
sub NFC ($) {
    return compose(NFD($_[0]));
}

# Normalization Form KC: the NFKD pipeline followed by composition.
sub NFKC ($) {
    return compose(NFKD($_[0]));
}
52
# Normalizes a string into the form named by the first argument.
#   $form : 'C', 'D', 'KC' or 'KD', optionally prefixed with 'NF'
#   $str  : the string to normalize
# Returns the result of NFC/NFD/NFKC/NFKD for that string.
# Croaks when the form name (after stripping a leading 'NF') is not one of
# the four above.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Accept both 'NFC' and 'C' spellings.
    $form =~ s/^NF//;

    return NFD ($str) if $form eq 'D';
    return NFC ($str) if $form eq 'C';
    return NFKD($str) if $form eq 'KD';
    return NFKC($str) if $form eq 'KC';

    croak $PACKAGE."::normalize: invalid form name: $form";
}
65
# Runs the quick check for the normalization form named by the first argument.
#   $form : 'C', 'D', 'KC' or 'KD', optionally prefixed with 'NF'
#   $str  : the string to examine
# Returns the result of checkNFC/checkNFD/checkNFKC/checkNFKD for that string.
# Croaks when the form name (after stripping a leading 'NF') is not one of
# the four above.
sub check($$)
{
    my ($form, $str) = @_;

    # Accept both 'NFC' and 'C' spellings.
    $form =~ s/^NF//;

    return checkNFD ($str) if $form eq 'D';
    return checkNFC ($str) if $form eq 'C';
    return checkNFKD($str) if $form eq 'KD';
    return checkNFKC($str) if $form eq 'KC';

    croak $PACKAGE."::check: invalid form name: $form";
}
78
ac5ea531 791;
80__END__
2a204b45 81
82=head1 NAME
83
f027f502 84Unicode::Normalize - Unicode Normalization Forms
2a204b45 85
86=head1 SYNOPSIS
87
88 use Unicode::Normalize;
89
8f118dcd 90 $NFD_string = NFD($string); # Normalization Form D
91 $NFC_string = NFC($string); # Normalization Form C
92 $NFKD_string = NFKD($string); # Normalization Form KD
93 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 94
95 or
96
97 use Unicode::Normalize 'normalize';
98
8f118dcd 99 $NFD_string = normalize('D', $string); # Normalization Form D
100 $NFC_string = normalize('C', $string); # Normalization Form C
101 $NFKD_string = normalize('KD', $string); # Normalization Form KD
102 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 103
104=head1 DESCRIPTION
105
00f2676f 106Parameters:
107
108C<$string> is used as a string under character semantics
109(see F<perlunicode>).
110
111C<$codepoint> should be an unsigned integer
112representing a Unicode code point.
113
Note: The XS edition and the pure Perl edition are incompatible
in how they interpret C<$codepoint> as a decimal number.
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not pass a floating-point number or a negative number as C<$codepoint>.
118
d85850a7 119=head2 Normalization Forms
2a204b45 120
121=over 4
122
8f118dcd 123=item C<$NFD_string = NFD($string)>
2a204b45 124
125returns the Normalization Form D (formed by canonical decomposition).
126
8f118dcd 127=item C<$NFC_string = NFC($string)>
2a204b45 128
129returns the Normalization Form C (formed by canonical decomposition
130followed by canonical composition).
131
8f118dcd 132=item C<$NFKD_string = NFKD($string)>
2a204b45 133
134returns the Normalization Form KD (formed by compatibility decomposition).
135
8f118dcd 136=item C<$NFKC_string = NFKC($string)>
2a204b45 137
138returns the Normalization Form KC (formed by compatibility decomposition
139followed by B<canonical> composition).
140
8f118dcd 141=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 142
143As C<$form_name>, one of the following names must be given.
144
145 'C' or 'NFC' for Normalization Form C
146 'D' or 'NFD' for Normalization Form D
147 'KC' or 'NFKC' for Normalization Form KC
148 'KD' or 'NFKD' for Normalization Form KD
149
150=back
151
8f118dcd 152=head2 Decomposition and Composition
153
154=over 4
155
156=item C<$decomposed_string = decompose($string)>
157
158=item C<$decomposed_string = decompose($string, $useCompatMapping)>
159
9f1f04a1 160Decomposes the specified string and returns the result.
8f118dcd 161
162If the second parameter (a boolean) is omitted or false, decomposes it
163using the Canonical Decomposition Mapping.
164If true, decomposes it using the Compatibility Decomposition Mapping.
165
166The string returned is not always in NFD/NFKD.
167Reordering may be required.
168
169 $NFD_string = reorder(decompose($string)); # eq. to NFD()
170 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
171
172=item C<$reordered_string = reorder($string)>
173
9f1f04a1 174Reorders the combining characters and the like in the canonical ordering
8f118dcd 175and returns the result.
176
177E.g., when you have a list of NFD/NFKD strings,
178you can get the concatenated NFD/NFKD string from them, saying
179
180 $concat_NFD = reorder(join '', @NFD_strings);
181 $concat_NFKD = reorder(join '', @NFKD_strings);
182
183=item C<$composed_string = compose($string)>
184
185Returns the string where composable pairs are composed.
186
187E.g., when you have a NFD/NFKD string,
188you can get its NFC/NFKC string, saying
189
190 $NFC_string = compose($NFD_string);
191 $NFKC_string = compose($NFKD_string);
192
193=back
194
195=head2 Quick Check
196
6c941e0c 197(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
8f118dcd 198
199The following functions check whether the string is in that normalization form.
200
201The result returned will be:
202
203 YES The string is in that normalization form.
204 NO The string is not in that normalization form.
205 MAYBE Dubious. Maybe yes, maybe no.
206
207=over 4
208
209=item C<$result = checkNFD($string)>
210
f027f502 211returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 212
213=item C<$result = checkNFC($string)>
214
f027f502 215returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 216
217=item C<$result = checkNFKD($string)>
218
f027f502 219returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 220
221=item C<$result = checkNFKC($string)>
222
f027f502 223returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 224
225=item C<$result = check($form_name, $string)>
226
f027f502 227returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 228
C<$form_name> takes the same values as for C<normalize()>.
230
231=back
232
233B<Note>
234
235In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
236The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
237
238A MAYBE-NFC/NFKC string should contain at least
239one combining character or the like.
240For example, C<COMBINING ACUTE ACCENT> has
241the MAYBE_NFC/MAYBE_NFKC property.
242Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
243and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 244C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd 245(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
246while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
247
248If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
249
250 $string eq NFC($string) # more thorough than checkNFC($string)
251 $string eq NFKC($string) # more thorough than checkNFKC($string)
252
2a204b45 253=head2 Character Data
254
These functions are the interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.
2a204b45 258
259=over 4
260
261=item C<$canonical_decomposed = getCanon($codepoint)>
262
8f118dcd 263If the character of the specified codepoint is canonically
264decomposable (including Hangul Syllables),
265returns the B<completely decomposed> string canonically equivalent to it.
266
f027f502 267If it is not decomposable, returns C<undef>.
8f118dcd 268
2a204b45 269=item C<$compatibility_decomposed = getCompat($codepoint)>
270
8f118dcd 271If the character of the specified codepoint is compatibility
272decomposable (including Hangul Syllables),
273returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45 274
f027f502 275If it is not decomposable, returns C<undef>.
2a204b45 276
8f118dcd 277=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45 278
d85850a7 279If two characters here and next (as codepoints) are composable
8f118dcd 280(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45 281returns the codepoint of the composite.
282
f027f502 283If they are not composable, returns C<undef>.
2a204b45 284
285=item C<$combining_class = getCombinClass($codepoint)>
286
8f118dcd 287Returns the combining class of the character as an integer.
2a204b45 288
289=item C<$is_exclusion = isExclusion($codepoint)>
290
8f118dcd 291Returns a boolean whether the character of the specified codepoint
292is a composition exclusion.
293
294=item C<$is_singleton = isSingleton($codepoint)>
295
2a204b45 296Returns a boolean whether the character of the specified codepoint is
8f118dcd 297a singleton.
298
6c941e0c 299=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd 300
301Returns a boolean whether the canonical decomposition
302of the character of the specified codepoint
303is a Non-Starter Decomposition.
304
305=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
306
307Returns a boolean whether the character of the specified codepoint
308may be composed with the previous one in a certain composition
309(including Hangul Compositions, but excluding
310Composition Exclusions and Non-Starter Decompositions).
2a204b45 311
312=back
313
314=head2 EXPORT
315
316C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
317
C<normalize> and some other functions: on request.
319
320=head1 AUTHOR
321
322SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
323
324 http://homepage1.nifty.com/nomenclator/perl/
325
6c941e0c 326 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 327
6c941e0c 328 This module is free software; you can redistribute it
329 and/or modify it under the same terms as Perl itself.
2a204b45 330
331=head1 SEE ALSO
332
333=over 4
334
335=item http://www.unicode.org/unicode/reports/tr15/
336
337Unicode Normalization Forms - UAX #15
338
14e6b36c 339=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd 340
341Derived Normalization Properties
342
2a204b45 343=back
344
345=cut
346