Commit | Line | Data |
ac5ea531 |
1 | package Unicode::Normalize; |
2 | |
3 | use 5.006; |
4 | use strict; |
5 | use warnings; |
6 | use Carp; |
7 | |
2a204b45 |
our $VERSION = '0.12';
our $PACKAGE = __PACKAGE__;    # prefix used in this module's error messages

require Exporter;
require DynaLoader;
require AutoLoader;

our @ISA = ('Exporter', 'DynaLoader');

# The four normalization-form functions are exported by default.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Everything else is exported on request only; the "all" tag below
# bundles the default and the optional names together.
our @EXPORT_OK = (
    'normalize', 'decompose', 'reorder', 'compose',
    'getCanon', 'getCompat', 'getComposite', 'getCombinClass', 'isExclusion',
);
our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]);

# Load the dynamically loaded part of the module through DynaLoader,
# handing it the module version.
__PACKAGE__->bootstrap($VERSION);

# Decomposition kinds, passed as the second argument of decompose().
use constant CANON  => 0;
use constant COMPAT => 1;
2a204b45 |
# Normalization Form D: canonical decomposition followed by reordering.
sub NFD ($) {
    return reorder( decompose( $_[0], CANON ) );
}
ac5ea531 |
# Normalization Form KD: compatibility decomposition followed by reordering.
sub NFKD ($) {
    return reorder( decompose( $_[0], COMPAT ) );
}
30 | |
2a204b45 |
# Normalization Form C: canonical decomposition, reordering, then composition.
sub NFC ($) {
    return compose( reorder( decompose( $_[0], CANON ) ) );
}
ac5ea531 |
# Normalization Form KC: compatibility decomposition, reordering, then
# composition.
sub NFKC ($) {
    return compose( reorder( decompose( $_[0], COMPAT ) ) );
}
33 | |
# normalize($form_name, $raw_string)
#
# Returns $raw_string in the normalization form selected by $form_name.
# The name may be given with or without a leading "NF": 'C' or 'NFC',
# 'D' or 'NFD', 'KC' or 'NFKC', 'KD' or 'NFKD'.
# Croaks if $form_name is none of these.
sub normalize($$)
{
    my $form = shift;

    # Strip the optional "NF" prefix from a copy, so that the error message
    # below can still report the name exactly as the caller wrote it.
    (my $key = $form) =~ s/^NF//;

    # After the shift above, $_[0] is the string to be normalized.
    return
        $key eq 'D'  ? NFD ($_[0]) :
        $key eq 'C'  ? NFC ($_[0]) :
        $key eq 'KD' ? NFKD($_[0]) :
        $key eq 'KC' ? NFKC($_[0]) :
        croak $PACKAGE."::normalize: invalid form name: $form";
}
44 | |
45 | 1; |
46 | __END__ |
2a204b45 |
47 | |
48 | =head1 NAME |
49 | |
50 | Unicode::Normalize - normalized forms of Unicode text |
51 | |
52 | =head1 SYNOPSIS |
53 | |
54 | use Unicode::Normalize; |
55 | |
56 | $string_NFD = NFD($raw_string); # Normalization Form D |
57 | $string_NFC = NFC($raw_string); # Normalization Form C |
58 | $string_NFKD = NFKD($raw_string); # Normalization Form KD |
59 | $string_NFKC = NFKC($raw_string); # Normalization Form KC |
60 | |
61 | or |
62 | |
63 | use Unicode::Normalize 'normalize'; |
64 | |
65 | $string_NFD = normalize('D', $raw_string); # Normalization Form D |
66 | $string_NFC = normalize('C', $raw_string); # Normalization Form C |
67 | $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD |
68 | $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC |
69 | |
70 | =head1 DESCRIPTION |
71 | |
72 | =head2 Normalization |
73 | |
74 | =over 4 |
75 | |
76 | =item C<$string_NFD = NFD($raw_string)> |
77 | |
78 | returns the Normalization Form D (formed by canonical decomposition). |
79 | |
80 | |
81 | =item C<$string_NFC = NFC($raw_string)> |
82 | |
83 | returns the Normalization Form C (formed by canonical decomposition |
84 | followed by canonical composition). |
85 | |
86 | =item C<$string_NFKD = NFKD($raw_string)> |
87 | |
88 | returns the Normalization Form KD (formed by compatibility decomposition). |
89 | |
90 | =item C<$string_NFKC = NFKC($raw_string)> |
91 | |
92 | returns the Normalization Form KC (formed by compatibility decomposition |
93 | followed by B<canonical> composition). |
94 | |
95 | =item C<$normalized_string = normalize($form_name, $raw_string)> |
96 | |
97 | As C<$form_name>, one of the following names must be given. |
98 | |
99 | 'C' or 'NFC' for Normalization Form C |
100 | 'D' or 'NFD' for Normalization Form D |
101 | 'KC' or 'NFKC' for Normalization Form KC |
102 | 'KD' or 'NFKD' for Normalization Form KD |
103 | |
104 | =back |
105 | |
106 | =head2 Character Data |
107 | |
108 | These functions provide an interface to the character data used internally. |
109 | If you only want to obtain Unicode normalization forms, |
110 | you do not need to call them yourself. |
111 | |
112 | =over 4 |
113 | |
114 | =item C<$canonical_decomposed = getCanon($codepoint)> |
115 | |
116 | =item C<$compatibility_decomposed = getCompat($codepoint)> |
117 | |
118 | If the character of the specified codepoint is canonically or |
119 | compatibility decomposable (including Hangul Syllables), |
120 | returns the B<completely decomposed> string equivalent to it. |
121 | |
122 | If it is not decomposable, returns undef. |
123 | |
124 | =item C<$uv_composite = getComposite($uv_here, $uv_next)> |
125 | |
126 | If the pair of characters here and next (given as codepoints) is composable |
127 | (including Hangul Jamo/Syllables and Exclusions), |
128 | returns the codepoint of the composite. |
129 | |
130 | If they are not composable, returns undef. |
131 | |
132 | =item C<$combining_class = getCombinClass($codepoint)> |
133 | |
134 | Returns the combining class of the character as an integer. |
135 | |
136 | =item C<$is_exclusion = isExclusion($codepoint)> |
137 | |
138 | Returns a boolean indicating whether the character of the specified codepoint is |
139 | a composition exclusion. |
140 | |
141 | =back |
142 | |
143 | =head2 EXPORT |
144 | |
145 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. |
146 | |
147 | C<normalize> and some other functions: on request. |
148 | |
149 | =head1 AUTHOR |
150 | |
151 | SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> |
152 | |
153 | http://homepage1.nifty.com/nomenclator/perl/ |
154 | |
155 | Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. |
156 | |
157 | This program is free software; you can redistribute it and/or |
158 | modify it under the same terms as Perl itself. |
159 | |
160 | =head1 SEE ALSO |
161 | |
162 | =over 4 |
163 | |
164 | =item http://www.unicode.org/unicode/reports/tr15/ |
165 | |
166 | Unicode Normalization Forms - UAX #15 |
167 | |
168 | =back |
169 | |
170 | =cut |
171 | |