Commit | Line | Data |
ac5ea531 |
1 | package Unicode::Normalize; |
2 | |
4a2e806c |
3 | BEGIN { |
4 |     # 193 is the EBCDIC code of "A"; refuse to compile the module there. |
5 |     die "Unicode::Normalize not ported to EBCDIC\n" |
6 |         if ord("A") == 193; |
7 | } |
8 | |
ac5ea531 |
9 | use 5.006; |
10 | use strict; |
11 | use warnings; |
12 | use Carp; |
13 | |
d85850a7 |
14 | our $VERSION = '0.13'; |
ac5ea531 |
15 | our $PACKAGE = __PACKAGE__; # prefix for error messages raised via croak |
16 | |
17 | require Exporter; |
18 | require DynaLoader; |
19 | require AutoLoader; # NOTE(review): loaded but not listed in @ISA below - confirm it is still needed |
20 | |
21 | our @ISA = qw(Exporter DynaLoader); |
22 | our @EXPORT = qw( NFC NFD NFKC NFKD ); # exported by default |
2a204b45 |
23 | our @EXPORT_OK = qw( |
24 | normalize decompose reorder compose |
25 | getCanon getCompat getComposite getCombinClass isExclusion |
26 | ); # the names above are exported only on request |
ac5ea531 |
27 | our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); # ':all' = default plus on-request names |
28 | |
29 | bootstrap Unicode::Normalize $VERSION; # presumably loads the compiled (XS) half of the module via DynaLoader - confirm |
30 | |
ac5ea531 |
31 | use constant COMPAT => 1; # passed as decompose()'s second argument by the NFK* forms |
32 | |
d85850a7 |
33 | sub NFD  ($) { return reorder(decompose($_[0])) }                  # Normalization Form D |
ac5ea531 |
34 | sub NFKD ($) { return reorder(decompose($_[0], COMPAT)) }          # Normalization Form KD |
35 | |
d85850a7 |
36 | sub NFC  ($) { return compose(reorder(decompose($_[0]))) }         # Normalization Form C |
ac5ea531 |
37 | sub NFKC ($) { return compose(reorder(decompose($_[0], COMPAT))) } # Normalization Form KC |
38 | |
39 | # normalize($form_name, $string): apply form 'C', 'D', 'KC' or 'KD' (an 'NF' prefix is optional); croaks on any other name. |
40 | sub normalize($$) { |
d85850a7 |
41 |     my ($form, $str) = @_; |
42 |     (my $key = $form) =~ s/^NF//;    # work on a copy so 'NFC' and 'C' select the same form |
43 |     return NFD($str)  if $key eq 'D'; |
44 |     return NFC($str)  if $key eq 'C'; |
45 |     return NFKD($str) if $key eq 'KD'; |
46 |     return NFKC($str) if $key eq 'KC'; |
47 |     # Report the name exactly as the caller passed it, not the stripped key. |
48 |     croak $PACKAGE."::normalize: invalid form name: $form"; |
ac5ea531 |
49 | } |
50 | |
51 | 1; |
52 | __END__ |
2a204b45 |
53 | |
54 | =head1 NAME |
55 | |
56 | Unicode::Normalize - normalized forms of Unicode text |
57 | |
58 | =head1 SYNOPSIS |
59 | |
60 | use Unicode::Normalize; |
61 | |
62 | $string_NFD = NFD($raw_string); # Normalization Form D |
63 | $string_NFC = NFC($raw_string); # Normalization Form C |
64 | $string_NFKD = NFKD($raw_string); # Normalization Form KD |
65 | $string_NFKC = NFKC($raw_string); # Normalization Form KC |
66 | |
67 | or |
68 | |
69 | use Unicode::Normalize 'normalize'; |
70 | |
71 | $string_NFD = normalize('D', $raw_string); # Normalization Form D |
72 | $string_NFC = normalize('C', $raw_string); # Normalization Form C |
73 | $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD |
74 | $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC |
75 | |
76 | =head1 DESCRIPTION |
77 | |
d85850a7 |
78 | =head2 Normalization Forms |
2a204b45 |
79 | |
80 | =over 4 |
81 | |
82 | =item C<$string_NFD = NFD($raw_string)> |
83 | |
84 | returns the Normalization Form D (formed by canonical decomposition). |
85 | |
86 | |
87 | =item C<$string_NFC = NFC($raw_string)> |
88 | |
89 | returns the Normalization Form C (formed by canonical decomposition |
90 | followed by canonical composition). |
91 | |
92 | =item C<$string_NFKD = NFKD($raw_string)> |
93 | |
94 | returns the Normalization Form KD (formed by compatibility decomposition). |
95 | |
96 | =item C<$string_NFKC = NFKC($raw_string)> |
97 | |
98 | returns the Normalization Form KC (formed by compatibility decomposition |
99 | followed by B<canonical> composition). |
100 | |
101 | =item C<$normalized_string = normalize($form_name, $raw_string)> |
102 | |
103 | As C<$form_name>, one of the following names must be given. |
104 | |
105 | 'C' or 'NFC' for Normalization Form C |
106 | 'D' or 'NFD' for Normalization Form D |
107 | 'KC' or 'NFKC' for Normalization Form KC |
108 | 'KD' or 'NFKD' for Normalization Form KD |
109 | |
110 | =back |
111 | |
112 | =head2 Character Data |
113 | |
114 | These functions are an interface to the character data used internally. |
d0ed0342 |
115 | If you only want to get Unicode normalization forms, you don't need to |
116 | call them yourself. |
2a204b45 |
117 | |
118 | =over 4 |
119 | |
120 | =item C<$canonical_decomposed = getCanon($codepoint)> |
121 | |
122 | =item C<$compatibility_decomposed = getCompat($codepoint)> |
123 | |
124 | If the character of the specified codepoint is canonically or |
125 | compatibility decomposable (including Hangul Syllables), |
126 | returns the B<completely decomposed> string equivalent to it. |
127 | |
128 | If it is not decomposable, returns undef. |
129 | |
130 | =item C<$uv_composite = getComposite($uv_here, $uv_next)> |
131 | |
d85850a7 |
132 | If two characters here and next (as codepoints) are composable |
2a204b45 |
133 | (including Hangul Jamo/Syllables and Exclusions), |
134 | returns the codepoint of the composite. |
135 | |
136 | If they are not composable, returns undef. |
137 | |
138 | =item C<$combining_class = getCombinClass($codepoint)> |
139 | |
140 | Returns the combining class of the character as an integer. |
141 | |
142 | =item C<$is_exclusion = isExclusion($codepoint)> |
143 | |
144 | Returns a boolean indicating whether the character of the specified |
145 | codepoint is a composition exclusion. |
146 | |
147 | =back |
148 | |
149 | =head2 EXPORT |
150 | |
151 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. |
152 | |
153 | C<normalize> and some other functions: on request. |
154 | |
4a2e806c |
155 | =head2 TODO |
156 | |
157 | Unicode::Normalize has not been ported to EBCDIC. The code mostly |
158 | would work just fine but a decision needs to be made: how should the |
159 | module work in EBCDIC? Should the low 256 characters be understood as |
160 | Unicode or as EBCDIC code points? Should one be chosen or should |
161 | there be a way to do either? Or should such translation be left |
162 | outside the module for the user to do, for example by using |
163 | Encode::from_to()? |
164 | |
2a204b45 |
165 | =head1 AUTHOR |
166 | |
167 | SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> |
168 | |
169 | http://homepage1.nifty.com/nomenclator/perl/ |
170 | |
171 | Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. |
172 | |
173 | This program is free software; you can redistribute it and/or |
174 | modify it under the same terms as Perl itself. |
175 | |
176 | =head1 SEE ALSO |
177 | |
178 | =over 4 |
179 | |
180 | =item http://www.unicode.org/unicode/reports/tr15/ |
181 | |
182 | Unicode Normalization Forms - UAX #15 |
183 | |
184 | =back |
185 | |
186 | =cut |
187 | |