Commit | Line | Data |
45394607 |
1 | package Unicode::Normalize; |
2 | |
3 | use 5.006; |
4 | use strict; |
5 | use warnings; |
6 | use Carp; |
7 | use Lingua::KO::Hangul::Util; |
8 | |
9 | our $VERSION = '0.04'; |
10 | our $PACKAGE = __PACKAGE__; |
11 | |
12 | require Exporter; |
13 | our @ISA = qw(Exporter); |
14 | our @EXPORT = qw( NFC NFD NFKC NFKD ); |
15 | our @EXPORT_OK = qw( normalize ); |
16 | our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); |
17 | |
18 | our $Combin = do "unicore/CombiningClass.pl" |
19 | || do "unicode/CombiningClass.pl" |
20 | || croak "$PACKAGE: CombiningClass.pl not found"; |
21 | |
22 | our $Decomp = do "unicore/Decomposition.pl" |
23 | || do "unicode/Decomposition.pl" |
24 | || croak "$PACKAGE: Decomposition.pl not found"; |
25 | |
26 | our %Combin; # $codepoint => $number : combination class |
27 | our %Canon; # $codepoint => \@codepoints : canonical decomp. |
28 | our %Compat; # $codepoint => \@codepoints : compat. decomp. |
29 | our %Compos; # $string => $codepoint : composite |
30 | our %Exclus; # $codepoint => 1 : composition exclusions |
31 | |
32 | { |
33 | my($f, $fh); |
34 | foreach my $d (@INC) { |
35 | use File::Spec; |
36 | $f = File::Spec->catfile($d, "unicore", "CompExcl.txt"); |
37 | last if open($fh, $f); |
38 | $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); |
39 | last if open($fh, $f); |
40 | $f = undef; |
41 | } |
42 | croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f; |
43 | while(<$fh>){ |
44 | next if /^#/ or /^$/; |
45 | s/#.*//; |
46 | $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/; |
47 | } |
48 | close $fh; |
49 | } |
50 | |
51 | while($Combin =~ /(.+)/g) |
52 | { |
53 | my @tab = split /\t/, $1; |
54 | my $ini = hex $tab[0]; |
55 | if($tab[1] eq '') |
56 | { |
57 | $Combin{ $ini } = $tab[2]; |
58 | } |
59 | else |
60 | { |
61 | $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); |
62 | } |
63 | } |
64 | |
65 | while($Decomp =~ /(.+)/g) |
66 | { |
67 | my @tab = split /\t/, $1; |
68 | my $compat = $tab[2] =~ s/<[^>]+>//; |
69 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition |
70 | my $com = pack('U*', @$dec); # composable sequence |
71 | my $ini = hex($tab[0]); |
72 | if($tab[1] eq '') |
73 | { |
74 | $Compat{ $ini } = $dec; |
75 | if(! $compat){ |
76 | $Canon{ $ini } = $dec; |
77 | $Compos{ $com } = $ini; |
78 | } |
79 | } |
80 | else |
81 | { |
82 | foreach my $u ($ini .. hex($tab[1])){ |
83 | $Compat{ $u } = $dec; |
84 | if(! $compat){ |
85 | $Canon{ $u } = $dec; |
86 | $Compos{ $com } = $ini; |
87 | } |
88 | } |
89 | } |
90 | } |
91 | |
92 | foreach my $key (keys %Canon) # exhaustive decomposition |
93 | { |
94 | $Canon{$key} = [ getCanonList($key) ]; |
95 | } |
96 | |
97 | foreach my $key (keys %Compat) # exhaustive decomposition |
98 | { |
99 | $Compat{$key} = [ getCompatList($key) ]; |
100 | } |
101 | |
102 | sub getCanonList |
103 | { |
104 | my @src = @_; |
105 | my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src; |
106 | join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); |
107 | # condition @src == @dec is not ok. |
108 | } |
109 | |
110 | sub getCompatList |
111 | { |
112 | my @src = @_; |
113 | my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src; |
114 | join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); |
115 | # condition @src == @dec is not ok. |
116 | } |
117 | |
118 | sub NFD($){ _decompose(shift, 0) } |
119 | |
120 | sub NFKD($){ _decompose(shift, 1) } |
121 | |
122 | sub NFC($){ _compose(NFD(shift)) } |
123 | |
124 | sub NFKC($){ _compose(NFKD(shift)) } |
125 | |
126 | sub normalize($$) |
127 | { |
128 | my($form,$str) = @_; |
129 | $form eq 'D' || $form eq 'NFD' ? NFD($str) : |
130 | $form eq 'C' || $form eq 'NFC' ? NFC($str) : |
131 | $form eq 'KD' || $form eq 'NFKD' ? NFKD($str) : |
132 | $form eq 'KC' || $form eq 'NFKC' ? NFKC($str) : |
133 | croak $PACKAGE."::normalize: invalid form name: $form"; |
134 | } |
135 | |
136 | |
137 | ## |
138 | ## string _decompose(string, compat?) |
139 | ## |
140 | sub _decompose |
141 | { |
142 | my $str = $_[0]; |
143 | my $hash = $_[1] ? \%Compat : \%Canon; |
144 | my @ret; |
145 | my $retstr=""; |
146 | foreach my $u (unpack 'U*', $str){ |
147 | push @ret, |
148 | $hash->{ $u } ? @{ $hash->{ $u } } : |
149 | _isHangul($u) ? decomposeHangul($u) : $u; |
150 | } |
151 | for(my $i=0; $i<@ret;){ |
152 | $retstr .= pack('U', $ret[$i++]), next |
153 | unless $Combin{ $ret[$i] } && $i+1 < @ret && $Combin{ $ret[$i+1] }; |
154 | my @tmp; |
155 | push(@tmp, $ret[$i++]) while $i < @ret && $Combin{ $ret[$i] }; |
156 | $retstr .= pack 'U*', @tmp[ |
157 | sort { |
158 | $Combin{ $tmp[$a] } <=> $Combin{ $tmp[$b] } || $a <=> $b |
159 | } 0 .. @tmp - 1, |
160 | ]; |
161 | } |
162 | $retstr; |
163 | } |
164 | |
165 | ## |
166 | ## string _compose(string) |
167 | ## |
168 | ## S : starter; NS : not starter; |
169 | ## |
170 | ## composable sequence begins at S. |
171 | ## S + S or (S + S) + S may be composed. |
172 | ## NS + NS must not be composed. |
173 | ## |
174 | sub _compose |
175 | { |
176 | my @src = unpack('U*', composeHangul shift); # get codepoints |
177 | for(my $s = 0; $s+1 < @src; $s++){ |
178 | next unless defined $src[$s] && ! $Combin{ $src[$s] }; # S only |
179 | my($c, $blocked); |
180 | for(my $j = $s+1; $j < @src && !$blocked; $j++){ |
181 | $blocked = 1 if ! $Combin{ $src[$j] }; |
182 | |
183 | next if $j != $s + 1 && defined $src[$j-1] |
184 | && $Combin{ $src[$j-1] } && $Combin{ $src[$j] } |
185 | && $Combin{ $src[$j-1] } == $Combin{ $src[$j] }; |
186 | |
187 | if( # $c != 0, maybe. |
188 | $c = $Compos{pack('U*', @src[$s,$j])} and ! $Exclus{$c} |
189 | ) |
190 | { |
191 | $src[$s] = $c; $src[$j] = undef; $blocked = 0; |
192 | } |
193 | } |
194 | } |
195 | pack 'U*', grep defined(), @src; |
196 | } |
197 | |
198 | ## |
199 | ## "hhhh hhhh hhhh" to (dddd, dddd, dddd) |
200 | ## |
201 | sub _getHexArray |
202 | { |
203 | my $str = shift; |
204 | map hex(), $str =~ /([0-9A-Fa-f]+)/g; |
205 | } |
206 | |
207 | ## |
208 | ## Hangul Syllables |
209 | ## |
210 | sub _isHangul |
211 | { |
212 | my $code = shift; |
213 | return 0xAC00 <= $code && $code <= 0xD7A3; |
214 | } |
215 | |
216 | ## |
217 | ## for Debug |
218 | ## |
219 | sub _getCombin { wantarray ? %Combin : \%Combin } |
220 | sub _getCanon { wantarray ? %Canon : \%Canon } |
221 | sub _getCompat { wantarray ? %Compat : \%Compat } |
222 | sub _getCompos { wantarray ? %Compos : \%Compos } |
223 | sub _getExclus { wantarray ? %Exclus : \%Exclus } |
224 | 1; |
225 | __END__ |
226 | |
227 | =head1 NAME |
228 | |
229 | Unicode::Normalize - normalized forms of Unicode text |
230 | |
231 | =head1 SYNOPSIS |
232 | |
233 | use Unicode::Normalize; |
234 | |
235 | $string_NFD = NFD($raw_string); # Normalization Form D |
236 | $string_NFC = NFC($raw_string); # Normalization Form C |
237 | $string_NFKD = NFKD($raw_string); # Normalization Form KD |
238 | $string_NFKC = NFKC($raw_string); # Normalization Form KC |
239 | |
240 | or |
241 | |
242 | use Unicode::Normalize 'normalize'; |
243 | |
244 | $string_NFD = normalize('D', $raw_string); # Normalization Form D |
245 | $string_NFC = normalize('C', $raw_string); # Normalization Form C |
246 | $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD |
247 | $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC |
248 | |
249 | =head1 DESCRIPTION |
250 | |
251 | =over 4 |
252 | |
253 | =item C<$string_NFD = NFD($raw_string)> |
254 | |
255 | returns the Normalization Form D (formed by canonical decomposition). |
256 | |
257 | |
258 | =item C<$string_NFC = NFC($raw_string)> |
259 | |
260 | returns the Normalization Form C (formed by canonical decomposition |
261 | followed by canonical composition). |
262 | |
263 | =item C<$string_NFKD = NFKD($raw_string)> |
264 | |
265 | returns the Normalization Form KD (formed by compatibility decomposition). |
266 | |
267 | =item C<$string_NFKC = NFKC($raw_string)> |
268 | |
269 | returns the Normalization Form KC (formed by compatibility decomposition |
270 | followed by B<canonical> composition). |
271 | |
272 | =item C<$normalized_string = normalize($form_name, $raw_string)> |
273 | |
274 | As C<$form_name>, one of the following names must be given. |
275 | |
276 | 'C' or 'NFC' for Normalization Form C |
277 | 'D' or 'NFD' for Normalization Form D |
278 | 'KC' or 'NFKC' for Normalization Form KC |
279 | 'KD' or 'NFKD' for Normalization Form KD |
280 | |
281 | =back |
282 | |
283 | =head2 EXPORT |
284 | |
285 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. |
286 | |
287 | C<normalize>: on request. |
288 | |
289 | =head1 AUTHOR |
290 | |
291 | SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> |
292 | |
293 | http://homepage1.nifty.com/nomenclator/perl/ |
294 | |
295 | Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. |
296 | |
297 | This program is free software; you can redistribute it and/or |
298 | modify it under the same terms as Perl itself. |
299 | |
300 | =head1 SEE ALSO |
301 | |
302 | =over 4 |
303 | |
304 | =item L<Lingua::KO::Hangul::Util> |
305 | |
306 | utility functions for Hangul Syllables |
307 | |
308 | =item http://www.unicode.org/unicode/reports/tr15/ |
309 | |
310 | Unicode Normalization Forms - UAX #15 |
311 | |
312 | =back |
313 | |
314 | =cut |