More encoding mapping magic.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed 1package Unicode::UCD;
2
3use strict;
4use warnings;
5
10a6ecd2 6our $VERSION = '0.1';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
10a6ecd2 11our @EXPORT_OK = qw(charinfo
12 charblock charscript
13 charblocks charscripts
14 charinrange);
561c79ed 15
16use Carp;
17
18=head1 NAME
19
00f2772c 20Unicode::UCD - Unicode character database
561c79ed 21
22=head1 SYNOPSIS
23
561c79ed 24 use Unicode::UCD 'charinfo';
e882dd67 25 my %charinfo = charinfo($codepoint);
561c79ed 26
27 use Unicode::UCD 'charblock';
e882dd67 28 my $charblock = charblock($codepoint);
29
30 use Unicode::UCD 'charscript';
31 my $charscript = charblock($codepoint);
561c79ed 32
33=head1 DESCRIPTION
34
35The Unicode module offers a simple interface to the Unicode Character
36Database.
37
38=cut
39
10a6ecd2 40my $UNICODEFH;
41my $BLOCKSFH;
42my $SCRIPTSFH;
43my $VERSIONFH;
561c79ed 44
45sub openunicode {
46 my ($rfh, @path) = @_;
47 my $f;
48 unless (defined $$rfh) {
49 for my $d (@INC) {
50 use File::Spec;
51 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 52 last if open($$rfh, $f);
e882dd67 53 undef $f;
561c79ed 54 }
e882dd67 55 croak __PACKAGE__, ": failed to find ",
56 File::Spec->catfile(@path), " in @INC"
57 unless defined $f;
561c79ed 58 }
59 return $f;
60}
61
62=head2 charinfo
63
64 use Unicode::UCD 'charinfo';
65
66 my %charinfo = charinfo(0x41);
67
68charinfo() returns a hash that has the following fields as defined
69by the Unicode standard:
70
71 key
72
73 code code point with at least four hexdigits
74 name name of the character IN UPPER CASE
75 category general category of the character
76 combining classes used in the Canonical Ordering Algorithm
77 bidi bidirectional category
78 decomposition character decomposition mapping
79 decimal if decimal digit this is the integer numeric value
80 digit if digit this is the numeric value
81 numeric if numeric is the integer or rational numeric value
82 mirrored if mirrored in bidirectional text
83 unicode10 Unicode 1.0 name if existed and different
84 comment ISO 10646 comment field
85 upper uppercase equivalent mapping
86 lower lowercase equivalent mapping
87 title titlecase equivalent mapping
e882dd67 88
561c79ed 89 block block the character belongs to (used in \p{In...})
e882dd67 90 script script the character belongs to
561c79ed 91
92If no match is found, an empty hash is returned.
93
32c16050 94The C<block> property is the same as as returned by charinfo(). It is
95not defined in the Unicode Character Database proper (Chapter 4 of the
96Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 97of TUS3). Similarly for the C<script> property.
32c16050 98
99Note that you cannot do (de)composition and casing based solely on the
100above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
e882dd67 101you will need also the I<Composition Exclusions>, I<Case Folding>, and
102I<SpecialCasing> tables, available as files F<CompExcl.txt>,
103F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed 104
105=cut
106
10a6ecd2 107sub _getcode {
108 my $arg = shift;
109
110 if ($arg =~ /^\d+$/) {
111 return $arg;
112 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
113 return hex($1);
114 }
115
116 return;
117}
118
561c79ed 119sub charinfo {
10a6ecd2 120 my $arg = shift;
121 my $code = _getcode($arg);
122 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
123 unless defined $code;
561c79ed 124 my $hexk = sprintf("%04X", $code);
125
10a6ecd2 126 openunicode(\$UNICODEFH, "Unicode.txt");
127 if (defined $UNICODEFH) {
561c79ed 128 use Search::Dict;
10a6ecd2 129 if (look($UNICODEFH, "$hexk;") >= 0) {
130 my $line = <$UNICODEFH>;
561c79ed 131 chomp $line;
132 my %prop;
133 @prop{qw(
134 code name category
135 combining bidi decomposition
136 decimal digit numeric
137 mirrored unicode10 comment
138 upper lower title
139 )} = split(/;/, $line, -1);
140 if ($prop{code} eq $hexk) {
a196fbfd 141 $prop{block} = charblock($code);
142 $prop{script} = charscript($code);
561c79ed 143 return %prop;
144 }
145 }
146 }
147 return;
148}
149
e882dd67 150sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
151 my ($table, $lo, $hi, $code) = @_;
152
153 return if $lo > $hi;
154
155 my $mid = int(($lo+$hi) / 2);
156
157 if ($table->[$mid]->[0] < $code) {
10a6ecd2 158 if ($table->[$mid]->[1] >= $code) {
e882dd67 159 return $table->[$mid]->[2];
160 } else {
161 _search($table, $mid + 1, $hi, $code);
162 }
163 } elsif ($table->[$mid]->[0] > $code) {
164 _search($table, $lo, $mid - 1, $code);
165 } else {
166 return $table->[$mid]->[2];
167 }
168}
169
10a6ecd2 170sub charinrange {
171 my ($range, $arg) = @_;
172 my $code = _getcode($arg);
173 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
174 unless defined $code;
175 _search($range, 0, $#$range, $code);
176}
177
354a27bf 178=head2 charblock
561c79ed 179
180 use Unicode::UCD 'charblock';
181
182 my $charblock = charblock(0x41);
10a6ecd2 183 my $charblock = charblock(1234);
184 my $charblock = charblock("0x263a");
185 my $charblock = charblock("U+263a");
186
187 my $ranges = charblock('Armenian');
188
189With a B<code point argument> charblock() returns the block the character
190belongs to, e.g. C<Basic Latin>. Note that not all the character
191positions within all blocks are defined. A <code point argument>
192is either a decimal or a hexadecimal scalar, or "U+" followed
193by hexadecimals.
194
195If supplied with an argument that can't be a code point, charblock()
196tries to do the opposite and interpret the argument as a character
197block. The return value is a I<range>: an anonymous list that
198contains anonymous lists, which in turn contain I<start-of-range>,
199I<end-of-range> code point pairs. You can test whether a code point
200is in a range using the L</charinrange> function. If the argument is
201not a known charater block, C<undef> is returned.
561c79ed 202
561c79ed 203=cut
204
205my @BLOCKS;
10a6ecd2 206my %BLOCKS;
561c79ed 207
10a6ecd2 208sub _charblocks {
561c79ed 209 unless (@BLOCKS) {
10a6ecd2 210 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
211 while (<$BLOCKSFH>) {
2796c109 212 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 213 my ($lo, $hi) = (hex($1), hex($2));
214 my $subrange = [ $lo, $hi, $3 ];
215 push @BLOCKS, $subrange;
216 push @{$BLOCKS{$3}}, $subrange;
561c79ed 217 }
218 }
10a6ecd2 219 close($BLOCKSFH);
561c79ed 220 }
221 }
10a6ecd2 222}
223
224sub charblock {
225 my $arg = shift;
226
227 _charblocks() unless @BLOCKS;
228
229 my $code = _getcode($arg);
561c79ed 230
10a6ecd2 231 if (defined $code) {
232 _search(\@BLOCKS, 0, $#BLOCKS, $code);
233 } else {
234 if (exists $BLOCKS{$arg}) {
235 return $BLOCKS{$arg};
236 } else {
237 return;
238 }
239 }
e882dd67 240}
241
242=head2 charscript
243
244 use Unicode::UCD 'charscript';
245
246 my $charscript = charscript(0x41);
10a6ecd2 247 my $charscript = charscript(1234);
248 my $charscript = charscript("U+263a");
e882dd67 249
10a6ecd2 250 my $ranges = charscript('Thai');
251
252With a B<code point argument> charscript() returns the script the
253character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A <code point
254argument> is either a decimal or a hexadecimal scalar, or "U+"
255followed by hexadecimals.
256
257If supplied with an argument that can't be a code point, charscript()
258tries to do the opposite and interpret the argument as a character
259script. The return value is a I<range>: an anonymous list that
260contains anonymous lists, which in turn contain I<start-of-range>,
261I<end-of-range> code point pairs. You can test whether a code point
262is in a range using the L</charinrange> function. If the argument is
263not a known charater script, C<undef> is returned.
e882dd67 264
e882dd67 265=cut
266
267my @SCRIPTS;
10a6ecd2 268my %SCRIPTS;
e882dd67 269
10a6ecd2 270sub _charscripts {
e882dd67 271 unless (@SCRIPTS) {
10a6ecd2 272 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
273 while (<$SCRIPTSFH>) {
e882dd67 274 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 275 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
276 my $script = lc($3);
277 $script =~ s/\b(\w)/uc($1)/ge;
278 my $subrange = [ $lo, $hi, $script ];
279 push @SCRIPTS, $subrange;
280 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 281 }
282 }
10a6ecd2 283 close($SCRIPTSFH);
e882dd67 284 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
285 }
286 }
10a6ecd2 287}
288
289sub charscript {
290 my $arg = shift;
291
292 _charscripts() unless @SCRIPTS;
e882dd67 293
10a6ecd2 294 my $code = _getcode($arg);
295
296 if (defined $code) {
297 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
298 } else {
299 if (exists $SCRIPTS{$arg}) {
300 return $SCRIPTS{$arg};
301 } else {
302 return;
303 }
304 }
305}
306
307=head2 charblocks
308
309 use Unicode::UCD 'charblocks';
310
311 my %charblocks = charblocks();
312
313charblocks() returns a hash with the known block names as the keys,
314and the code point ranges (see L</charblock>) as the values.
315
316=cut
317
318sub charblocks {
319 _charblocks() unless @BLOCKS;
320 return %BLOCKS;
321}
322
323=head2 charscripts
324
325 use Unicode::UCD 'charscripts';
326
327 my %charscripts = charscripts();
328
329charscripts() returns a hash with the known script names as the keys,
330and the code point ranges (see L</charscript>) as the values.
331
332=cut
333
334sub charscripts {
335 _charscripts() unless @SCRIPTS;
336 return %SCRIPTS;
561c79ed 337}
338
10a6ecd2 339=head2 Blocks versus Scripts
ad9cab37 340
10a6ecd2 341The difference between a block and a script is that scripts are closer
342to the linguistic notion of a set of characters required to present
343languages, while block is more of an artifact of the Unicode character
344numbering and separation into blocks of 256 characters.
3aa957f9 345
346For example the Latin B<script> is spread over several B<blocks>, such
347as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
348C<Latin Extended-B>. On the other hand, the Latin script does not
349contain all the characters of the C<Basic Latin> block (also known as
350the ASCII): it includes only the letters, not for example the digits
351or the punctuation.
ad9cab37 352
3aa957f9 353For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 354
355For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
356
3aa957f9 357=head2 Matching Scripts and Blocks
358
359Both scripts and blocks can be matched using the regular expression
360construct C<\p{In...}> and its negation C<\P{In...}>.
361
362The name of the script or the block comes after the C<In>, for example
363C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2 364removed from the names for the C<\p{In...}>, for example
365C<LatinExtendedA> instead of C<Latin Extended-A>.
366
367There are a few cases where there exists both a script and a block by
368the same name, in these cases the block version has C<Block> appended:
369C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
370
371=head2 charinrange
372
373In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
374can also test whether a code point is in the I<range> as returned by
375L</charblock> and L</charscript> or as the values of the hash returned
376by L</charblocks> and </charscripts> by using charinrange():
377
378 use Unicode::UCD qw(charscript charinrange);
379
380 $range = charscript('Hiragana');
381 print "looks like hiragana\n" if charinrange($range, $code);
382
383=cut
384
385=head2 Unicode::UCD::UnicodeVersion
386
387Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
388Database, in other words, the version of the Unicode standard the
389database implements.
390
391=cut
392
393my $UNICODEVERSION;
394
395sub UnicodeVersion {
396 unless (defined $UNICODEVERSION) {
397 openunicode(\$VERSIONFH, "version");
398 chomp($UNICODEVERSION = <$VERSIONFH>);
399 close($VERSIONFH);
400 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
401 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
402 }
403 return $UNICODEVERSION;
404}
3aa957f9 405
406=head2 Implementation Note
32c16050 407
ad9cab37 408The first use of charinfo() opens a read-only filehandle to the Unicode
409Character Database (the database is included in the Perl distribution).
410The filehandle is then kept open for further queries.
32c16050 411
561c79ed 412=head1 AUTHOR
413
414Jarkko Hietaniemi
415
416=cut
417
4181;