Commit | Line | Data |
561c79ed |
1 | package Unicode::UCD; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
10a6ecd2 |
6 | our $VERSION = '0.1'; |
561c79ed |
7 | |
8 | require Exporter; |
9 | |
10 | our @ISA = qw(Exporter); |
10a6ecd2 |
11 | our @EXPORT_OK = qw(charinfo |
12 | charblock charscript |
13 | charblocks charscripts |
14 | charinrange); |
561c79ed |
15 | |
16 | use Carp; |
17 | |
18 | =head1 NAME |
19 | |
00f2772c |
20 | Unicode::UCD - Unicode character database |
561c79ed |
21 | |
22 | =head1 SYNOPSIS |
23 | |
561c79ed |
24 | use Unicode::UCD 'charinfo'; |
e882dd67 |
25 | my %charinfo = charinfo($codepoint); |
561c79ed |
26 | |
27 | use Unicode::UCD 'charblock'; |
e882dd67 |
28 | my $charblock = charblock($codepoint); |
29 | |
30 | use Unicode::UCD 'charscript'; |
31 | my $charscript = charblock($codepoint); |
561c79ed |
32 | |
33 | =head1 DESCRIPTION |
34 | |
35 | The Unicode module offers a simple interface to the Unicode Character |
36 | Database. |
37 | |
38 | =cut |
39 | |
10a6ecd2 |
40 | my $UNICODEFH; |
41 | my $BLOCKSFH; |
42 | my $SCRIPTSFH; |
43 | my $VERSIONFH; |
561c79ed |
44 | |
45 | sub openunicode { |
46 | my ($rfh, @path) = @_; |
47 | my $f; |
48 | unless (defined $$rfh) { |
49 | for my $d (@INC) { |
50 | use File::Spec; |
51 | $f = File::Spec->catfile($d, "unicode", @path); |
32c16050 |
52 | last if open($$rfh, $f); |
e882dd67 |
53 | undef $f; |
561c79ed |
54 | } |
e882dd67 |
55 | croak __PACKAGE__, ": failed to find ", |
56 | File::Spec->catfile(@path), " in @INC" |
57 | unless defined $f; |
561c79ed |
58 | } |
59 | return $f; |
60 | } |
61 | |
62 | =head2 charinfo |
63 | |
64 | use Unicode::UCD 'charinfo'; |
65 | |
66 | my %charinfo = charinfo(0x41); |
67 | |
68 | charinfo() returns a hash that has the following fields as defined |
69 | by the Unicode standard: |
70 | |
71 | key |
72 | |
73 | code code point with at least four hexdigits |
74 | name name of the character IN UPPER CASE |
75 | category general category of the character |
76 | combining classes used in the Canonical Ordering Algorithm |
77 | bidi bidirectional category |
78 | decomposition character decomposition mapping |
79 | decimal if decimal digit this is the integer numeric value |
80 | digit if digit this is the numeric value |
81 | numeric if numeric is the integer or rational numeric value |
82 | mirrored if mirrored in bidirectional text |
83 | unicode10 Unicode 1.0 name if existed and different |
84 | comment ISO 10646 comment field |
85 | upper uppercase equivalent mapping |
86 | lower lowercase equivalent mapping |
87 | title titlecase equivalent mapping |
e882dd67 |
88 | |
561c79ed |
89 | block block the character belongs to (used in \p{In...}) |
e882dd67 |
90 | script script the character belongs to |
561c79ed |
91 | |
92 | If no match is found, an empty hash is returned. |
93 | |
32c16050 |
94 | The C<block> property is the same as as returned by charinfo(). It is |
95 | not defined in the Unicode Character Database proper (Chapter 4 of the |
96 | Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14 |
e882dd67 |
97 | of TUS3). Similarly for the C<script> property. |
32c16050 |
98 | |
99 | Note that you cannot do (de)composition and casing based solely on the |
100 | above C<decomposition> and C<lower>, C<upper>, C<title>, properties, |
e882dd67 |
101 | you will need also the I<Composition Exclusions>, I<Case Folding>, and |
102 | I<SpecialCasing> tables, available as files F<CompExcl.txt>, |
103 | F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution. |
561c79ed |
104 | |
105 | =cut |
106 | |
10a6ecd2 |
107 | sub _getcode { |
108 | my $arg = shift; |
109 | |
110 | if ($arg =~ /^\d+$/) { |
111 | return $arg; |
112 | } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) { |
113 | return hex($1); |
114 | } |
115 | |
116 | return; |
117 | } |
118 | |
561c79ed |
119 | sub charinfo { |
10a6ecd2 |
120 | my $arg = shift; |
121 | my $code = _getcode($arg); |
122 | croak __PACKAGE__, "::charinfo: unknown code '$arg'" |
123 | unless defined $code; |
561c79ed |
124 | my $hexk = sprintf("%04X", $code); |
125 | |
10a6ecd2 |
126 | openunicode(\$UNICODEFH, "Unicode.txt"); |
127 | if (defined $UNICODEFH) { |
561c79ed |
128 | use Search::Dict; |
10a6ecd2 |
129 | if (look($UNICODEFH, "$hexk;") >= 0) { |
130 | my $line = <$UNICODEFH>; |
561c79ed |
131 | chomp $line; |
132 | my %prop; |
133 | @prop{qw( |
134 | code name category |
135 | combining bidi decomposition |
136 | decimal digit numeric |
137 | mirrored unicode10 comment |
138 | upper lower title |
139 | )} = split(/;/, $line, -1); |
140 | if ($prop{code} eq $hexk) { |
a196fbfd |
141 | $prop{block} = charblock($code); |
142 | $prop{script} = charscript($code); |
561c79ed |
143 | return %prop; |
144 | } |
145 | } |
146 | } |
147 | return; |
148 | } |
149 | |
e882dd67 |
150 | sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. |
151 | my ($table, $lo, $hi, $code) = @_; |
152 | |
153 | return if $lo > $hi; |
154 | |
155 | my $mid = int(($lo+$hi) / 2); |
156 | |
157 | if ($table->[$mid]->[0] < $code) { |
10a6ecd2 |
158 | if ($table->[$mid]->[1] >= $code) { |
e882dd67 |
159 | return $table->[$mid]->[2]; |
160 | } else { |
161 | _search($table, $mid + 1, $hi, $code); |
162 | } |
163 | } elsif ($table->[$mid]->[0] > $code) { |
164 | _search($table, $lo, $mid - 1, $code); |
165 | } else { |
166 | return $table->[$mid]->[2]; |
167 | } |
168 | } |
169 | |
10a6ecd2 |
170 | sub charinrange { |
171 | my ($range, $arg) = @_; |
172 | my $code = _getcode($arg); |
173 | croak __PACKAGE__, "::charinrange: unknown code '$arg'" |
174 | unless defined $code; |
175 | _search($range, 0, $#$range, $code); |
176 | } |
177 | |
354a27bf |
178 | =head2 charblock |
561c79ed |
179 | |
180 | use Unicode::UCD 'charblock'; |
181 | |
182 | my $charblock = charblock(0x41); |
10a6ecd2 |
183 | my $charblock = charblock(1234); |
184 | my $charblock = charblock("0x263a"); |
185 | my $charblock = charblock("U+263a"); |
186 | |
187 | my $ranges = charblock('Armenian'); |
188 | |
189 | With a B<code point argument> charblock() returns the block the character |
190 | belongs to, e.g. C<Basic Latin>. Note that not all the character |
191 | positions within all blocks are defined. A <code point argument> |
192 | is either a decimal or a hexadecimal scalar, or "U+" followed |
193 | by hexadecimals. |
194 | |
195 | If supplied with an argument that can't be a code point, charblock() |
196 | tries to do the opposite and interpret the argument as a character |
197 | block. The return value is a I<range>: an anonymous list that |
198 | contains anonymous lists, which in turn contain I<start-of-range>, |
199 | I<end-of-range> code point pairs. You can test whether a code point |
200 | is in a range using the L</charinrange> function. If the argument is |
201 | not a known charater block, C<undef> is returned. |
561c79ed |
202 | |
561c79ed |
203 | =cut |
204 | |
205 | my @BLOCKS; |
10a6ecd2 |
206 | my %BLOCKS; |
561c79ed |
207 | |
10a6ecd2 |
208 | sub _charblocks { |
561c79ed |
209 | unless (@BLOCKS) { |
10a6ecd2 |
210 | if (openunicode(\$BLOCKSFH, "Blocks.txt")) { |
211 | while (<$BLOCKSFH>) { |
2796c109 |
212 | if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { |
10a6ecd2 |
213 | my ($lo, $hi) = (hex($1), hex($2)); |
214 | my $subrange = [ $lo, $hi, $3 ]; |
215 | push @BLOCKS, $subrange; |
216 | push @{$BLOCKS{$3}}, $subrange; |
561c79ed |
217 | } |
218 | } |
10a6ecd2 |
219 | close($BLOCKSFH); |
561c79ed |
220 | } |
221 | } |
10a6ecd2 |
222 | } |
223 | |
224 | sub charblock { |
225 | my $arg = shift; |
226 | |
227 | _charblocks() unless @BLOCKS; |
228 | |
229 | my $code = _getcode($arg); |
561c79ed |
230 | |
10a6ecd2 |
231 | if (defined $code) { |
232 | _search(\@BLOCKS, 0, $#BLOCKS, $code); |
233 | } else { |
234 | if (exists $BLOCKS{$arg}) { |
235 | return $BLOCKS{$arg}; |
236 | } else { |
237 | return; |
238 | } |
239 | } |
e882dd67 |
240 | } |
241 | |
242 | =head2 charscript |
243 | |
244 | use Unicode::UCD 'charscript'; |
245 | |
246 | my $charscript = charscript(0x41); |
10a6ecd2 |
247 | my $charscript = charscript(1234); |
248 | my $charscript = charscript("U+263a"); |
e882dd67 |
249 | |
10a6ecd2 |
250 | my $ranges = charscript('Thai'); |
251 | |
252 | With a B<code point argument> charscript() returns the script the |
253 | character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A <code point |
254 | argument> is either a decimal or a hexadecimal scalar, or "U+" |
255 | followed by hexadecimals. |
256 | |
257 | If supplied with an argument that can't be a code point, charscript() |
258 | tries to do the opposite and interpret the argument as a character |
259 | script. The return value is a I<range>: an anonymous list that |
260 | contains anonymous lists, which in turn contain I<start-of-range>, |
261 | I<end-of-range> code point pairs. You can test whether a code point |
262 | is in a range using the L</charinrange> function. If the argument is |
263 | not a known charater script, C<undef> is returned. |
e882dd67 |
264 | |
e882dd67 |
265 | =cut |
266 | |
267 | my @SCRIPTS; |
10a6ecd2 |
268 | my %SCRIPTS; |
e882dd67 |
269 | |
10a6ecd2 |
270 | sub _charscripts { |
e882dd67 |
271 | unless (@SCRIPTS) { |
10a6ecd2 |
272 | if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { |
273 | while (<$SCRIPTSFH>) { |
e882dd67 |
274 | if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { |
10a6ecd2 |
275 | my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); |
276 | my $script = lc($3); |
277 | $script =~ s/\b(\w)/uc($1)/ge; |
278 | my $subrange = [ $lo, $hi, $script ]; |
279 | push @SCRIPTS, $subrange; |
280 | push @{$SCRIPTS{$script}}, $subrange; |
e882dd67 |
281 | } |
282 | } |
10a6ecd2 |
283 | close($SCRIPTSFH); |
e882dd67 |
284 | @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; |
285 | } |
286 | } |
10a6ecd2 |
287 | } |
288 | |
289 | sub charscript { |
290 | my $arg = shift; |
291 | |
292 | _charscripts() unless @SCRIPTS; |
e882dd67 |
293 | |
10a6ecd2 |
294 | my $code = _getcode($arg); |
295 | |
296 | if (defined $code) { |
297 | _search(\@SCRIPTS, 0, $#SCRIPTS, $code); |
298 | } else { |
299 | if (exists $SCRIPTS{$arg}) { |
300 | return $SCRIPTS{$arg}; |
301 | } else { |
302 | return; |
303 | } |
304 | } |
305 | } |
306 | |
307 | =head2 charblocks |
308 | |
309 | use Unicode::UCD 'charblocks'; |
310 | |
311 | my %charblocks = charblocks(); |
312 | |
313 | charblocks() returns a hash with the known block names as the keys, |
314 | and the code point ranges (see L</charblock>) as the values. |
315 | |
316 | =cut |
317 | |
318 | sub charblocks { |
319 | _charblocks() unless @BLOCKS; |
320 | return %BLOCKS; |
321 | } |
322 | |
323 | =head2 charscripts |
324 | |
325 | use Unicode::UCD 'charscripts'; |
326 | |
327 | my %charscripts = charscripts(); |
328 | |
329 | charscripts() returns a hash with the known script names as the keys, |
330 | and the code point ranges (see L</charscript>) as the values. |
331 | |
332 | =cut |
333 | |
334 | sub charscripts { |
335 | _charscripts() unless @SCRIPTS; |
336 | return %SCRIPTS; |
561c79ed |
337 | } |
338 | |
10a6ecd2 |
339 | =head2 Blocks versus Scripts |
ad9cab37 |
340 | |
10a6ecd2 |
341 | The difference between a block and a script is that scripts are closer |
342 | to the linguistic notion of a set of characters required to present |
343 | languages, while block is more of an artifact of the Unicode character |
344 | numbering and separation into blocks of 256 characters. |
3aa957f9 |
345 | |
346 | For example the Latin B<script> is spread over several B<blocks>, such |
347 | as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and |
348 | C<Latin Extended-B>. On the other hand, the Latin script does not |
349 | contain all the characters of the C<Basic Latin> block (also known as |
350 | the ASCII): it includes only the letters, not for example the digits |
351 | or the punctuation. |
ad9cab37 |
352 | |
3aa957f9 |
353 | For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt |
ad9cab37 |
354 | |
355 | For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ |
356 | |
3aa957f9 |
357 | =head2 Matching Scripts and Blocks |
358 | |
359 | Both scripts and blocks can be matched using the regular expression |
360 | construct C<\p{In...}> and its negation C<\P{In...}>. |
361 | |
362 | The name of the script or the block comes after the C<In>, for example |
363 | C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are |
10a6ecd2 |
364 | removed from the names for the C<\p{In...}>, for example |
365 | C<LatinExtendedA> instead of C<Latin Extended-A>. |
366 | |
367 | There are a few cases where there exists both a script and a block by |
368 | the same name, in these cases the block version has C<Block> appended: |
369 | C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block. |
370 | |
371 | =head2 charinrange |
372 | |
373 | In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you |
374 | can also test whether a code point is in the I<range> as returned by |
375 | L</charblock> and L</charscript> or as the values of the hash returned |
376 | by L</charblocks> and </charscripts> by using charinrange(): |
377 | |
378 | use Unicode::UCD qw(charscript charinrange); |
379 | |
380 | $range = charscript('Hiragana'); |
381 | print "looks like hiragana\n" if charinrange($range, $code); |
382 | |
383 | =cut |
384 | |
385 | =head2 Unicode::UCD::UnicodeVersion |
386 | |
387 | Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character |
388 | Database, in other words, the version of the Unicode standard the |
389 | database implements. |
390 | |
391 | =cut |
392 | |
393 | my $UNICODEVERSION; |
394 | |
395 | sub UnicodeVersion { |
396 | unless (defined $UNICODEVERSION) { |
397 | openunicode(\$VERSIONFH, "version"); |
398 | chomp($UNICODEVERSION = <$VERSIONFH>); |
399 | close($VERSIONFH); |
400 | croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" |
401 | unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; |
402 | } |
403 | return $UNICODEVERSION; |
404 | } |
3aa957f9 |
405 | |
406 | =head2 Implementation Note |
32c16050 |
407 | |
ad9cab37 |
408 | The first use of charinfo() opens a read-only filehandle to the Unicode |
409 | Character Database (the database is included in the Perl distribution). |
410 | The filehandle is then kept open for further queries. |
32c16050 |
411 | |
561c79ed |
412 | =head1 AUTHOR |
413 | |
414 | Jarkko Hietaniemi |
415 | |
416 | =cut |
417 | |
418 | 1; |