Upgrade to Encode 1.89. The enc_module.t required
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
# Compile-time platform guard: refuse to load where "A" has ordinal 193
# (the EBCDIC value; it is 65 on ASCII-based platforms), because this
# module has not been ported to EBCDIC.
BEGIN {
    die "Unicode::Normalize not ported to EBCDIC\n"
        if ord("A") == 193;
}
8
ac5ea531 9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
# Module version; also handed to bootstrap at the end of this setup section.
f027f502 14our $VERSION = '0.17';
# Package name, used as the prefix of the croak messages in normalize()
# and check().
ac5ea531 15our $PACKAGE = __PACKAGE__;
16
# NOTE(review): AutoLoader is loaded here but is not listed in @ISA below;
# confirm whether it is still needed.
17require Exporter;
18require DynaLoader;
19require AutoLoader;
20
# Inherit import() from Exporter and bootstrap() from DynaLoader.
21our @ISA = qw(Exporter DynaLoader);
# Exported by default: the four normalization-form functions.
22our @EXPORT = qw( NFC NFD NFKC NFKD );
# Exported on request only.
2a204b45 23our @EXPORT_OK = qw(
24 normalize decompose reorder compose
8f118dcd 25 checkNFD checkNFKD checkNFC checkNFKC check
26 getCanon getCompat getComposite getCombinClass
27 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
28 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
29);
# Export tags: "all" is everything exportable, "normalize" is the default
# exports plus the generic/primitive normalization functions, and "check"
# is the quick-check functions.
30our %EXPORT_TAGS = (
31 all => [ @EXPORT, @EXPORT_OK ],
32 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
33 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
2a204b45 34);
ac5ea531 35
# Load the compiled part of the module through DynaLoader, passing $VERSION.
# Presumably this is what provides decompose(), reorder(), compose() and the
# checkNF*/get*/is* functions, none of which are defined in this file.
36bootstrap Unicode::Normalize $VERSION;
37
# Flag passed as the second argument of decompose() to select the
# compatibility decomposition instead of the canonical one.
use constant COMPAT => 1;

# The four normalization forms, each expressed as a pipeline over the
# decompose / reorder / compose primitives.  Every function takes one
# string (prototype "$") and returns the normalized string.

# Normalization Form D: canonical decomposition, then canonical reordering.
sub NFD ($)
{
    return reorder(decompose($_[0]));
}

# Normalization Form KD: compatibility decomposition, then reordering.
sub NFKD ($)
{
    return reorder(decompose($_[0], COMPAT));
}

# Normalization Form C: the NFD pipeline followed by composition.
sub NFC ($)
{
    return compose(reorder(decompose($_[0])));
}

# Normalization Form KC: the NFKD pipeline followed by composition.
sub NFKC ($)
{
    return compose(reorder(decompose($_[0], COMPAT)));
}
44
# normalize($form_name, $string)
#
# Returns $string converted to the normalization form named by $form_name.
# Accepted names are 'C', 'D', 'KC' and 'KD', each optionally prefixed with
# 'NF' ('NFC', 'NFD', 'NFKC', 'NFKD').
#
# Croaks with "<package>::normalize: invalid form name: <name>" for any
# other name; <name> is the form name exactly as the caller passed it.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Strip the optional 'NF' prefix on a copy, so that the error message
    # below quotes the caller's original argument rather than the stripped
    # one (e.g. 'NFX' instead of a confusing 'X').
    (my $key = $form) =~ s/^NF//;

    return NFD ($str) if $key eq 'D';
    return NFC ($str) if $key eq 'C';
    return NFKD($str) if $key eq 'KD';
    return NFKC($str) if $key eq 'KC';

    croak $PACKAGE."::normalize: invalid form name: $form";
}
57
# check($form_name, $string)
#
# Quick-checks whether $string is in the normalization form named by
# $form_name by delegating to checkNFD / checkNFC / checkNFKD / checkNFKC,
# and returns that function's result unchanged.
# Accepted names are 'C', 'D', 'KC' and 'KD', each optionally prefixed with
# 'NF' ('NFC', 'NFD', 'NFKC', 'NFKD').
#
# Croaks with "<package>::check: invalid form name: <name>" for any other
# name; <name> is the form name exactly as the caller passed it.
sub check($$)
{
    my ($form, $str) = @_;

    # Strip the optional 'NF' prefix on a copy, so that the error message
    # below quotes the caller's original argument rather than the stripped
    # one (e.g. 'NFX' instead of a confusing 'X').
    (my $key = $form) =~ s/^NF//;

    return checkNFD ($str) if $key eq 'D';
    return checkNFC ($str) if $key eq 'C';
    return checkNFKD($str) if $key eq 'KD';
    return checkNFKC($str) if $key eq 'KC';

    croak $PACKAGE."::check: invalid form name: $form";
}
70
ac5ea531 711;
72__END__
2a204b45 73
74=head1 NAME
75
f027f502 76Unicode::Normalize - Unicode Normalization Forms
2a204b45 77
78=head1 SYNOPSIS
79
80 use Unicode::Normalize;
81
8f118dcd 82 $NFD_string = NFD($string); # Normalization Form D
83 $NFC_string = NFC($string); # Normalization Form C
84 $NFKD_string = NFKD($string); # Normalization Form KD
85 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 86
87 or
88
89 use Unicode::Normalize 'normalize';
90
8f118dcd 91 $NFD_string = normalize('D', $string); # Normalization Form D
92 $NFC_string = normalize('C', $string); # Normalization Form C
93 $NFKD_string = normalize('KD', $string); # Normalization Form KD
94 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 95
96=head1 DESCRIPTION
97
d85850a7 98=head2 Normalization Forms
2a204b45 99
100=over 4
101
8f118dcd 102=item C<$NFD_string = NFD($string)>
2a204b45 103
104returns the Normalization Form D (formed by canonical decomposition).
105
8f118dcd 106=item C<$NFC_string = NFC($string)>
2a204b45 107
108returns the Normalization Form C (formed by canonical decomposition
109followed by canonical composition).
110
8f118dcd 111=item C<$NFKD_string = NFKD($string)>
2a204b45 112
113returns the Normalization Form KD (formed by compatibility decomposition).
114
8f118dcd 115=item C<$NFKC_string = NFKC($string)>
2a204b45 116
117returns the Normalization Form KC (formed by compatibility decomposition
118followed by B<canonical> composition).
119
8f118dcd 120=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 121
122As C<$form_name>, one of the following names must be given.
123
124 'C' or 'NFC' for Normalization Form C
125 'D' or 'NFD' for Normalization Form D
126 'KC' or 'NFKC' for Normalization Form KC
127 'KD' or 'NFKD' for Normalization Form KD
128
129=back
130
8f118dcd 131=head2 Decomposition and Composition
132
133=over 4
134
135=item C<$decomposed_string = decompose($string)>
136
137=item C<$decomposed_string = decompose($string, $useCompatMapping)>
138
139Decomposes the specified string and returns the result.
140
141If the second parameter (a boolean) is omitted or false, decomposes it
142using the Canonical Decomposition Mapping.
143If true, decomposes it using the Compatibility Decomposition Mapping.
144
145The string returned is not always in NFD/NFKD.
146Reordering may be required.
147
148 $NFD_string = reorder(decompose($string)); # eq. to NFD()
149 $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
150
151=item C<$reordered_string = reorder($string)>
152
153Reorders the combining characters and the like into the canonical ordering
154and returns the result.
155
156E.g., when you have a list of NFD/NFKD strings,
157you can get the concatenated NFD/NFKD string from them, saying
158
159 $concat_NFD = reorder(join '', @NFD_strings);
160 $concat_NFKD = reorder(join '', @NFKD_strings);
161
162=item C<$composed_string = compose($string)>
163
164Returns the string where composable pairs are composed.
165
166E.g., when you have a NFD/NFKD string,
167you can get its NFC/NFKC string, saying
168
169 $NFC_string = compose($NFD_string);
170 $NFKC_string = compose($NFKD_string);
171
172=back
173
174=head2 Quick Check
175
14e6b36c 176(see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)
8f118dcd 177
178The following functions check whether the string is in that normalization form.
179
180The result returned will be:
181
182 YES The string is in that normalization form.
183 NO The string is not in that normalization form.
184 MAYBE Dubious. Maybe yes, maybe no.
185
186=over 4
187
188=item C<$result = checkNFD($string)>
189
f027f502 190returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 191
192=item C<$result = checkNFC($string)>
193
f027f502 194returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 195
196=item C<$result = checkNFKD($string)>
197
f027f502 198returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 199
200=item C<$result = checkNFKC($string)>
201
f027f502 202returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 203
204=item C<$result = check($form_name, $string)>
205
f027f502 206returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 207
208C<$form_name> takes the same values as for C<normalize()>.
209
210=back
211
212B<Note>
213
214In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
215The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
216
217A MAYBE-NFC/NFKC string should contain at least
218one combining character or the like.
219For example, C<COMBINING ACUTE ACCENT> has
220the MAYBE_NFC/MAYBE_NFKC property.
221Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
222and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 223C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd 224(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
225while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
226
227If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
228
229 $string eq NFC($string) # more thorough than checkNFC($string)
230 $string eq NFKC($string) # more thorough than checkNFKC($string)
231
2a204b45 232=head2 Character Data
233
234These functions are an interface to the character data used internally.
d0ed0342 235If you only want to get Unicode normalization forms, you don't need
236to call them yourself.
2a204b45 237
238=over 4
239
240=item C<$canonical_decomposed = getCanon($codepoint)>
241
8f118dcd 242If the character of the specified codepoint is canonically
243decomposable (including Hangul Syllables),
244returns the B<completely decomposed> string canonically equivalent to it.
245
f027f502 246If it is not decomposable, returns C<undef>.
8f118dcd 247
2a204b45 248=item C<$compatibility_decomposed = getCompat($codepoint)>
249
8f118dcd 250If the character of the specified codepoint is compatibility
251decomposable (including Hangul Syllables),
252returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45 253
f027f502 254If it is not decomposable, returns C<undef>.
2a204b45 255
8f118dcd 256=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45 257
d85850a7 258If two characters here and next (as codepoints) are composable
8f118dcd 259(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45 260returns the codepoint of the composite.
261
f027f502 262If they are not composable, returns C<undef>.
2a204b45 263
264=item C<$combining_class = getCombinClass($codepoint)>
265
8f118dcd 266Returns the combining class of the character as an integer.
2a204b45 267
268=item C<$is_exclusion = isExclusion($codepoint)>
269
8f118dcd 270Returns a boolean whether the character of the specified codepoint
271is a composition exclusion.
272
273=item C<$is_singleton = isSingleton($codepoint)>
274
2a204b45 275Returns a boolean whether the character of the specified codepoint is
8f118dcd 276a singleton.
277
278=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
279
280Returns a boolean whether the canonical decomposition
281of the character of the specified codepoint
282is a Non-Starter Decomposition.
283
284=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
285
286Returns a boolean whether the character of the specified codepoint
287may be composed with the previous one in a certain composition
288(including Hangul Compositions, but excluding
289Composition Exclusions and Non-Starter Decompositions).
2a204b45 290
291=back
292
293=head2 EXPORT
294
295C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
296
297C<normalize> and some other functions: on request.
298
299=head1 AUTHOR
300
301SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
302
303 http://homepage1.nifty.com/nomenclator/perl/
304
ab8fe378 305 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 306
307 This program is free software; you can redistribute it and/or
308 modify it under the same terms as Perl itself.
309
310=head1 SEE ALSO
311
312=over 4
313
314=item http://www.unicode.org/unicode/reports/tr15/
315
316Unicode Normalization Forms - UAX #15
317
14e6b36c 318=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd 319
320Derived Normalization Properties
321
2a204b45 322=back
323
324=cut
325