D'oh! This has been moved to lib/Digest/t/digest.t but not deleted.
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
# Compile-time sanity check: refuse to load the module when
# pack('U', 0x41) does not yield the one-character string "A".
BEGIN {
    my $probe = pack('U', 0x41);
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
        if $probe ne "A";
}
8
ac5ea531 9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
82e740b6 14our $VERSION = '0.25';
ac5ea531 15our $PACKAGE = __PACKAGE__;
16
17require Exporter;
18require DynaLoader;
ac5ea531 19
20our @ISA = qw(Exporter DynaLoader);
21our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45 22our @EXPORT_OK = qw(
23 normalize decompose reorder compose
8f118dcd 24 checkNFD checkNFKD checkNFC checkNFKC check
25 getCanon getCompat getComposite getCombinClass
26 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
27 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
82e740b6 28 FCD checkFCD FCC checkFCC composeContiguous
29 splitOnLastStarter
8f118dcd 30);
31our %EXPORT_TAGS = (
32 all => [ @EXPORT, @EXPORT_OK ],
33 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
34 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
82e740b6 35 fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
2a204b45 36);
ac5ea531 37
82e740b6 38######
39
ac5ea531 40bootstrap Unicode::Normalize $VERSION;
41
82e740b6 42######
43
# pack_U(@codepoints)
# Builds a string from a list of code points using pack's 'U*' template
# and returns it. An empty argument list yields an empty string.
sub pack_U {
    my @codepoints = @_;
    return pack('U*', @codepoints);
}
47
# unpack_U($string)
# Returns the values obtained by unpacking the string with the 'U*'
# template. The argument is first appended to the (empty) result of
# pack('U*'), so it is the concatenated string that gets unpacked,
# not the caller's scalar directly.
sub unpack_U {
    my $text = shift;
    my $prefixed = pack('U*') . $text;
    return unpack('U*', $prefixed);
}
51
82e740b6 52
53##
54## normalization forms
55##
56
ac5ea531 57use constant COMPAT => 1;
58
# NFD($string): Normalization Form D.
# Decomposes the string (canonical mapping) and then reorders the result.
sub NFD ($) {
    return reorder(decompose($_[0]));
}
# NFKD($string): Normalization Form KD.
# Same pipeline as NFD, but decompose() is passed the COMPAT flag so the
# compatibility mapping is used.
sub NFKD ($) {
    return reorder(decompose($_[0], COMPAT));
}
# NFC($string): Normalization Form C.
# Decomposes (canonical mapping), reorders, then composes the result.
sub NFC ($) {
    return compose(reorder(decompose($_[0])));
}
# NFKC($string): Normalization Form KC.
# Decomposes with the COMPAT flag (compatibility mapping), reorders,
# then composes the result.
sub NFKC ($) {
    return compose(reorder(decompose($_[0], COMPAT)));
}
63
# FCD($string)
# Returns the string unchanged when checkFCD() reports a true value for
# it; otherwise returns NFD($string).
sub FCD ($) {
    my ($str) = @_;
    return $str if checkFCD($str);
    return NFD($str);
}
# FCC($string)
# Decomposes (canonical mapping), reorders, then applies
# composeContiguous() to the result.
sub FCC ($) {
    return composeContiguous(reorder(decompose($_[0])));
}
69
# Dispatch table consulted by normalize(): maps every accepted form name
# (the long names plus the short aliases C, D, KC and KD) to the
# subroutine implementing that form.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,
    NFD  => \&NFD,
    D    => \&NFD,
    NFKC => \&NFKC,
    KC   => \&NFKC,
    NFKD => \&NFKD,
    KD   => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
);
77
# normalize($form_name, $string)
# Looks $form_name up in %formNorm and returns the result of calling the
# matching subroutine on $string. Croaks with an "invalid form name"
# message when the name is not a key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::normalize: invalid form name: $form"
	unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}
86
82e740b6 87
88##
89## quick check
90##
91
# Dispatch table consulted by check(): maps every accepted form name
# (the long names plus the short aliases C, D, KC and KD) to the
# corresponding quick-check subroutine.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);
99
# check($form_name, $string)
# Looks $form_name up in %formCheck and returns the result of calling the
# matching quick-check subroutine on $string. Croaks with an
# "invalid form name" message when the name is not a key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::check: invalid form name: $form"
	unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}
108
ac5ea531 1091;
110__END__
2a204b45 111
112=head1 NAME
113
f027f502 114Unicode::Normalize - Unicode Normalization Forms
2a204b45 115
116=head1 SYNOPSIS
117
118 use Unicode::Normalize;
119
8f118dcd 120 $NFD_string = NFD($string); # Normalization Form D
121 $NFC_string = NFC($string); # Normalization Form C
122 $NFKD_string = NFKD($string); # Normalization Form KD
123 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 124
125 or
126
127 use Unicode::Normalize 'normalize';
128
8f118dcd 129 $NFD_string = normalize('D', $string); # Normalization Form D
130 $NFC_string = normalize('C', $string); # Normalization Form C
131 $NFKD_string = normalize('KD', $string); # Normalization Form KD
132 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 133
134=head1 DESCRIPTION
135
00f2676f 136Parameters:
137
138C<$string> is used as a string under character semantics
139(see F<perlunicode>).
140
141C<$codepoint> should be an unsigned integer
142representing a Unicode code point.
143
Note: The XS edition and the pure Perl edition interpret a C<$codepoint>
given as a decimal number differently:
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating point number or a negative sign in C<$codepoint>.
148
d85850a7 149=head2 Normalization Forms
2a204b45 150
151=over 4
152
8f118dcd 153=item C<$NFD_string = NFD($string)>
2a204b45 154
155returns the Normalization Form D (formed by canonical decomposition).
156
8f118dcd 157=item C<$NFC_string = NFC($string)>
2a204b45 158
159returns the Normalization Form C (formed by canonical decomposition
160followed by canonical composition).
161
8f118dcd 162=item C<$NFKD_string = NFKD($string)>
2a204b45 163
164returns the Normalization Form KD (formed by compatibility decomposition).
165
8f118dcd 166=item C<$NFKC_string = NFKC($string)>
2a204b45 167
168returns the Normalization Form KC (formed by compatibility decomposition
169followed by B<canonical> composition).
170
82e740b6 171=item C<$FCD_string = FCD($string)>
172
173If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
174returns it without modification; otherwise returns an FCD string.
175
Note: FCD is not always unique; several different forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
178
179=item C<$FCC_string = FCC($string)>
180
181returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
182
Note: FCC is unique, as are the four normalization forms (NF*).
184
8f118dcd 185=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 186
187As C<$form_name>, one of the following names must be given.
188
82e740b6 189 'C' or 'NFC' for Normalization Form C (UAX #15)
190 'D' or 'NFD' for Normalization Form D (UAX #15)
191 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
192 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
193
194 'FCD' for "Fast C or D" Form (UTN #5)
195 'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45 196
197=back
198
8f118dcd 199=head2 Decomposition and Composition
200
201=over 4
202
203=item C<$decomposed_string = decompose($string)>
204
205=item C<$decomposed_string = decompose($string, $useCompatMapping)>
206
9f1f04a1 207Decomposes the specified string and returns the result.
8f118dcd 208
209If the second parameter (a boolean) is omitted or false, decomposes it
210using the Canonical Decomposition Mapping.
211If true, decomposes it using the Compatibility Decomposition Mapping.
212
213The string returned is not always in NFD/NFKD.
214Reordering may be required.
215
216 $NFD_string = reorder(decompose($string)); # eq. to NFD()
217 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
218
219=item C<$reordered_string = reorder($string)>
220
9f1f04a1 221Reorders the combining characters and the like in the canonical ordering
8f118dcd 222and returns the result.
223
224E.g., when you have a list of NFD/NFKD strings,
225you can get the concatenated NFD/NFKD string from them, saying
226
227 $concat_NFD = reorder(join '', @NFD_strings);
228 $concat_NFKD = reorder(join '', @NFKD_strings);
229
230=item C<$composed_string = compose($string)>
231
232Returns the string where composable pairs are composed.
233
234E.g., when you have a NFD/NFKD string,
235you can get its NFC/NFKC string, saying
236
237 $NFC_string = compose($NFD_string);
238 $NFKC_string = compose($NFKD_string);
239
240=back
241
242=head2 Quick Check
243
82e740b6 244(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd 245
246The following functions check whether the string is in that normalization form.
247
248The result returned will be:
249
250 YES The string is in that normalization form.
251 NO The string is not in that normalization form.
252 MAYBE Dubious. Maybe yes, maybe no.
253
254=over 4
255
256=item C<$result = checkNFD($string)>
257
f027f502 258returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 259
260=item C<$result = checkNFC($string)>
261
f027f502 262returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 263
264=item C<$result = checkNFKD($string)>
265
f027f502 266returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd 267
268=item C<$result = checkNFKC($string)>
269
f027f502 270returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 271
82e740b6 272=item C<$result = checkFCD($string)>
273
274returns C<YES> (C<1>) or C<NO> (C<empty string>).
275
276=item C<$result = checkFCC($string)>
277
278returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
279
If a string is not in C<FCD>, it cannot be in C<FCC>.
So C<checkFCC($not_FCD_string)> should return C<NO>.
282
8f118dcd 283=item C<$result = check($form_name, $string)>
284
f027f502 285returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd 286
C<$form_name> takes the same values as for C<normalize()>.
288
289=back
290
291B<Note>
292
82e740b6 293In the cases of NFD, NFKD, and FCD, the answer must be
294either C<YES> or C<NO>. The answer C<MAYBE> may be returned
295in the cases of NFC, NFKC, and FCC.
8f118dcd 296
82e740b6 297A C<MAYBE> string should contain at least one combining character
298or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd 299the MAYBE_NFC/MAYBE_NFKC property.
82e740b6 300
8f118dcd 301Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
302and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 303C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd 304(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
305while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
306
82e740b6 307If you want to check exactly, compare the string with its NFC/NFKC/FCC;
308i.e.,
8f118dcd 309
  $string eq NFC($string)    # more thorough than checkNFC($string)
  $string eq NFKC($string)   # more thorough than checkNFKC($string)
  $string eq FCC($string)    # more thorough than checkFCC($string)
8f118dcd 313
2a204b45 314=head2 Character Data
315
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.
2a204b45 319
320=over 4
321
322=item C<$canonical_decomposed = getCanon($codepoint)>
323
8f118dcd 324If the character of the specified codepoint is canonically
325decomposable (including Hangul Syllables),
326returns the B<completely decomposed> string canonically equivalent to it.
327
f027f502 328If it is not decomposable, returns C<undef>.
8f118dcd 329
2a204b45 330=item C<$compatibility_decomposed = getCompat($codepoint)>
331
8f118dcd 332If the character of the specified codepoint is compatibility
333decomposable (including Hangul Syllables),
334returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45 335
f027f502 336If it is not decomposable, returns C<undef>.
2a204b45 337
8f118dcd 338=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45 339
d85850a7 340If two characters here and next (as codepoints) are composable
8f118dcd 341(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45 342returns the codepoint of the composite.
343
f027f502 344If they are not composable, returns C<undef>.
2a204b45 345
346=item C<$combining_class = getCombinClass($codepoint)>
347
8f118dcd 348Returns the combining class of the character as an integer.
2a204b45 349
350=item C<$is_exclusion = isExclusion($codepoint)>
351
8f118dcd 352Returns a boolean whether the character of the specified codepoint
353is a composition exclusion.
354
355=item C<$is_singleton = isSingleton($codepoint)>
356
2a204b45 357Returns a boolean whether the character of the specified codepoint is
8f118dcd 358a singleton.
359
6c941e0c 360=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd 361
362Returns a boolean whether the canonical decomposition
363of the character of the specified codepoint
364is a Non-Starter Decomposition.
365
366=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
367
368Returns a boolean whether the character of the specified codepoint
369may be composed with the previous one in a certain composition
370(including Hangul Compositions, but excluding
371Composition Exclusions and Non-Starter Decompositions).
2a204b45 372
373=back
374
375=head2 EXPORT
376
377C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
378
C<normalize> and some other functions: on request.
380
381=head1 AUTHOR
382
82e740b6 383SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
2a204b45 384
385 http://homepage1.nifty.com/nomenclator/perl/
386
6c941e0c 387 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 388
6c941e0c 389 This module is free software; you can redistribute it
390 and/or modify it under the same terms as Perl itself.
2a204b45 391
392=head1 SEE ALSO
393
394=over 4
395
396=item http://www.unicode.org/unicode/reports/tr15/
397
398Unicode Normalization Forms - UAX #15
399
14e6b36c 400=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd 401
402Derived Normalization Properties
403
82e740b6 404=item http://www.unicode.org/notes/tn5/
405
406Canonical Equivalence in Applications - UTN #5
407
2a204b45 408=back
409
410=cut
411