1 package Unicode::Normalize;
5 die "Unicode::Normalize not ported to EBCDIC\n";
14 our $VERSION = '0.14';
15 our $PACKAGE = __PACKAGE__;
21 our @ISA = qw(Exporter DynaLoader);
22 our @EXPORT = qw( NFC NFD NFKC NFKD );
24 normalize decompose reorder compose
25 getCanon getCompat getComposite getCombinClass isExclusion
27 our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
29 bootstrap Unicode::Normalize $VERSION;
31 use constant COMPAT => 1;
# NFD -- Normalization Form D.
# Runs decompose() on the single string argument without the COMPAT flag
# (canonical decomposition), then hands the result to reorder().
# $_[0] is passed straight through, so the caller's string is not copied here.
sub NFD ($) {
    return reorder( decompose( $_[0] ) );
}
# NFKD -- Normalization Form KD.
# Same pipeline as NFD, except that COMPAT is given as decompose()'s second
# argument to request compatibility decomposition before reorder() is applied.
sub NFKD ($) {
    return reorder( decompose( $_[0], COMPAT ) );
}
# NFC -- Normalization Form C.
# Three nested stages, innermost first: decompose() without the COMPAT flag
# (canonical decomposition), reorder(), and finally compose().
sub NFC ($) {
    return compose(
        reorder( decompose( $_[0] ) )
    );
}
# NFKC -- Normalization Form KC.
# Same three stages as NFC, but decompose() receives COMPAT as its second
# argument (compatibility decomposition); the result is then passed through
# reorder() and compose().
sub NFKC ($) {
    return compose(
        reorder( decompose( $_[0], COMPAT ) )
    );
}
44 $form eq 'D' ? NFD ($_[0]) :
45 $form eq 'C' ? NFC ($_[0]) :
46 $form eq 'KD' ? NFKD($_[0]) :
47 $form eq 'KC' ? NFKC($_[0]) :
48 croak $PACKAGE."::normalize: invalid form name: $form";
56 Unicode::Normalize - normalized forms of Unicode text
60 use Unicode::Normalize;
62 $string_NFD = NFD($raw_string); # Normalization Form D
63 $string_NFC = NFC($raw_string); # Normalization Form C
64 $string_NFKD = NFKD($raw_string); # Normalization Form KD
65 $string_NFKC = NFKC($raw_string); # Normalization Form KC
69 use Unicode::Normalize 'normalize';
71 $string_NFD = normalize('D', $raw_string); # Normalization Form D
72 $string_NFC = normalize('C', $raw_string); # Normalization Form C
73 $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
74 $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
78 =head2 Normalization Forms
82 =item C<$string_NFD = NFD($raw_string)>
84 returns the Normalization Form D (formed by canonical decomposition).
87 =item C<$string_NFC = NFC($raw_string)>
89 returns the Normalization Form C (formed by canonical decomposition
90 followed by canonical composition).
92 =item C<$string_NFKD = NFKD($raw_string)>
94 returns the Normalization Form KD (formed by compatibility decomposition).
96 =item C<$string_NFKC = NFKC($raw_string)>
98 returns the Normalization Form KC (formed by compatibility decomposition
99 followed by B<canonical> composition).
101 =item C<$normalized_string = normalize($form_name, $raw_string)>
C<$form_name> must be one of the following names:
105 'C' or 'NFC' for Normalization Form C
106 'D' or 'NFD' for Normalization Form D
107 'KC' or 'NFKC' for Normalization Form KC
108 'KD' or 'NFKD' for Normalization Form KD
112 =head2 Character Data
These functions are an interface to the character data used internally.
115 If you want only to get Unicode normalization forms, you don't need
120 =item C<$canonical_decomposed = getCanon($codepoint)>
122 =item C<$compatibility_decomposed = getCompat($codepoint)>
124 If the character of the specified codepoint is canonically or
125 compatibility decomposable (including Hangul Syllables),
126 returns the B<completely decomposed> string equivalent to it.
128 If it is not decomposable, returns undef.
130 =item C<$uv_composite = getComposite($uv_here, $uv_next)>
132 If two characters here and next (as codepoints) are composable
133 (including Hangul Jamo/Syllables and Exclusions),
134 returns the codepoint of the composite.
136 If they are not composable, returns undef.
138 =item C<$combining_class = getCombinClass($codepoint)>
Returns the combining class of the character as an integer.
142 =item C<$is_exclusion = isExclusion($codepoint)>
Returns a boolean indicating whether the character of the specified codepoint is
a composition exclusion.
151 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
C<normalize> and some other functions: on request.
Unicode::Normalize has not been ported to EBCDIC. The code would mostly
work just fine, but a decision needs to be made: how should the module
work in EBCDIC? Should the low 256 characters be understood as
160 Unicode or as EBCDIC code points? Should one be chosen or should
161 there be a way to do either? Or should such translation be left
162 outside the module for the user to do, for example by using
167 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
169 http://homepage1.nifty.com/nomenclator/perl/
171 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
173 This program is free software; you can redistribute it and/or
174 modify it under the same terms as Perl itself.
180 =item http://www.unicode.org/unicode/reports/tr15/
182 Unicode Normalization Forms - UAX #15