Workaround on Cwd bootstrap problem.
[p5sagit/p5-mst-13.2.git] / lib / UnicodeCD.pm
CommitLineData
1189d1e4 1package UnicodeCD;
561c79ed 2
3use strict;
4use warnings;
5
10a6ecd2 6our $VERSION = '0.1';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
10a6ecd2 11our @EXPORT_OK = qw(charinfo
12 charblock charscript
13 charblocks charscripts
b08cd201 14 charinrange
15 compexcl
16 casefold casespec);
561c79ed 17
18use Carp;
19
20=head1 NAME
21
1189d1e4 22UnicodeCD - Unicode character database
561c79ed 23
24=head1 SYNOPSIS
25
1189d1e4 26 use UnicodeCD 'charinfo';
b08cd201 27 my $charinfo = charinfo($codepoint);
561c79ed 28
1189d1e4 29 use UnicodeCD 'charblock';
e882dd67 30 my $charblock = charblock($codepoint);
31
1189d1e4 32 use UnicodeCD 'charscript';
e882dd67 33 my $charscript = charblock($codepoint);
561c79ed 34
e145285f 35 use UnicodeCD 'charblocks';
36 my $charblocks = charblocks();
37
38 use UnicodeCD 'charscripts';
39 my %charscripts = charscripts();
40
41 use UnicodeCD qw(charscript charinrange);
42 my $range = charscript($script);
43 print "looks like $script\n" if charinrange($range, $codepoint);
44
45 use UnicodeCD 'compexcl';
46 my $compexcl = compexcl($codepoint);
47
48 my $unicode_version = UnicodeCD::UnicodeVersion();
49
561c79ed 50=head1 DESCRIPTION
51
e145285f 52The UnicodeCD module offers a simple interface to the Unicode Character
561c79ed 53Database.
54
55=cut
56
10a6ecd2 57my $UNICODEFH;
58my $BLOCKSFH;
59my $SCRIPTSFH;
60my $VERSIONFH;
b08cd201 61my $COMPEXCLFH;
62my $CASEFOLDFH;
63my $CASESPECFH;
561c79ed 64
65sub openunicode {
66 my ($rfh, @path) = @_;
67 my $f;
68 unless (defined $$rfh) {
69 for my $d (@INC) {
70 use File::Spec;
71 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 72 last if open($$rfh, $f);
e882dd67 73 undef $f;
561c79ed 74 }
e882dd67 75 croak __PACKAGE__, ": failed to find ",
76 File::Spec->catfile(@path), " in @INC"
77 unless defined $f;
561c79ed 78 }
79 return $f;
80}
81
82=head2 charinfo
83
1189d1e4 84 use UnicodeCD 'charinfo';
561c79ed 85
b08cd201 86 my $charinfo = charinfo(0x41);
561c79ed 87
b08cd201 88charinfo() returns a reference to a hash that has the following fields
89as defined by the Unicode standard:
561c79ed 90
91 key
92
93 code code point with at least four hexdigits
94 name name of the character IN UPPER CASE
95 category general category of the character
96 combining classes used in the Canonical Ordering Algorithm
97 bidi bidirectional category
98 decomposition character decomposition mapping
99 decimal if decimal digit this is the integer numeric value
100 digit if digit this is the numeric value
101 numeric if numeric is the integer or rational numeric value
102 mirrored if mirrored in bidirectional text
103 unicode10 Unicode 1.0 name if existed and different
104 comment ISO 10646 comment field
105 upper uppercase equivalent mapping
106 lower lowercase equivalent mapping
107 title titlecase equivalent mapping
e882dd67 108
561c79ed 109 block block the character belongs to (used in \p{In...})
e882dd67 110 script script the character belongs to
561c79ed 111
b08cd201 112If no match is found, a reference to an empty hash is returned.
561c79ed 113
32c16050 114The C<block> property is the same as as returned by charinfo(). It is
115not defined in the Unicode Character Database proper (Chapter 4 of the
116Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 117of TUS3). Similarly for the C<script> property.
32c16050 118
119Note that you cannot do (de)composition and casing based solely on the
120above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 121you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 122
123=cut
124
10a6ecd2 125sub _getcode {
126 my $arg = shift;
127
128 if ($arg =~ /^\d+$/) {
129 return $arg;
130 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
131 return hex($1);
132 }
133
134 return;
135}
136
561c79ed 137sub charinfo {
10a6ecd2 138 my $arg = shift;
139 my $code = _getcode($arg);
140 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
141 unless defined $code;
561c79ed 142 my $hexk = sprintf("%04X", $code);
143
10a6ecd2 144 openunicode(\$UNICODEFH, "Unicode.txt");
145 if (defined $UNICODEFH) {
561c79ed 146 use Search::Dict;
10a6ecd2 147 if (look($UNICODEFH, "$hexk;") >= 0) {
148 my $line = <$UNICODEFH>;
561c79ed 149 chomp $line;
150 my %prop;
151 @prop{qw(
152 code name category
153 combining bidi decomposition
154 decimal digit numeric
155 mirrored unicode10 comment
156 upper lower title
157 )} = split(/;/, $line, -1);
158 if ($prop{code} eq $hexk) {
a196fbfd 159 $prop{block} = charblock($code);
160 $prop{script} = charscript($code);
b08cd201 161 return \%prop;
561c79ed 162 }
163 }
164 }
165 return;
166}
167
e882dd67 168sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
169 my ($table, $lo, $hi, $code) = @_;
170
171 return if $lo > $hi;
172
173 my $mid = int(($lo+$hi) / 2);
174
175 if ($table->[$mid]->[0] < $code) {
10a6ecd2 176 if ($table->[$mid]->[1] >= $code) {
e882dd67 177 return $table->[$mid]->[2];
178 } else {
179 _search($table, $mid + 1, $hi, $code);
180 }
181 } elsif ($table->[$mid]->[0] > $code) {
182 _search($table, $lo, $mid - 1, $code);
183 } else {
184 return $table->[$mid]->[2];
185 }
186}
187
10a6ecd2 188sub charinrange {
189 my ($range, $arg) = @_;
190 my $code = _getcode($arg);
191 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
192 unless defined $code;
193 _search($range, 0, $#$range, $code);
194}
195
354a27bf 196=head2 charblock
561c79ed 197
1189d1e4 198 use UnicodeCD 'charblock';
561c79ed 199
200 my $charblock = charblock(0x41);
10a6ecd2 201 my $charblock = charblock(1234);
202 my $charblock = charblock("0x263a");
203 my $charblock = charblock("U+263a");
204
205 my $ranges = charblock('Armenian');
206
207With a B<code point argument> charblock() returns the block the character
208belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 209positions within all blocks are defined.
10a6ecd2 210
211If supplied with an argument that can't be a code point, charblock()
212tries to do the opposite and interpret the argument as a character
213block. The return value is a I<range>: an anonymous list that
214contains anonymous lists, which in turn contain I<start-of-range>,
215I<end-of-range> code point pairs. You can test whether a code point
216is in a range using the L</charinrange> function. If the argument is
217not a known charater block, C<undef> is returned.
561c79ed 218
561c79ed 219=cut
220
221my @BLOCKS;
10a6ecd2 222my %BLOCKS;
561c79ed 223
10a6ecd2 224sub _charblocks {
561c79ed 225 unless (@BLOCKS) {
10a6ecd2 226 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
227 while (<$BLOCKSFH>) {
2796c109 228 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 229 my ($lo, $hi) = (hex($1), hex($2));
230 my $subrange = [ $lo, $hi, $3 ];
231 push @BLOCKS, $subrange;
232 push @{$BLOCKS{$3}}, $subrange;
561c79ed 233 }
234 }
10a6ecd2 235 close($BLOCKSFH);
561c79ed 236 }
237 }
10a6ecd2 238}
239
240sub charblock {
241 my $arg = shift;
242
243 _charblocks() unless @BLOCKS;
244
245 my $code = _getcode($arg);
561c79ed 246
10a6ecd2 247 if (defined $code) {
248 _search(\@BLOCKS, 0, $#BLOCKS, $code);
249 } else {
250 if (exists $BLOCKS{$arg}) {
251 return $BLOCKS{$arg};
252 } else {
253 return;
254 }
255 }
e882dd67 256}
257
258=head2 charscript
259
1189d1e4 260 use UnicodeCD 'charscript';
e882dd67 261
262 my $charscript = charscript(0x41);
10a6ecd2 263 my $charscript = charscript(1234);
264 my $charscript = charscript("U+263a");
e882dd67 265
10a6ecd2 266 my $ranges = charscript('Thai');
267
268With a B<code point argument> charscript() returns the script the
b08cd201 269character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 270
271If supplied with an argument that can't be a code point, charscript()
272tries to do the opposite and interpret the argument as a character
273script. The return value is a I<range>: an anonymous list that
274contains anonymous lists, which in turn contain I<start-of-range>,
275I<end-of-range> code point pairs. You can test whether a code point
276is in a range using the L</charinrange> function. If the argument is
277not a known charater script, C<undef> is returned.
e882dd67 278
e882dd67 279=cut
280
281my @SCRIPTS;
10a6ecd2 282my %SCRIPTS;
e882dd67 283
10a6ecd2 284sub _charscripts {
e882dd67 285 unless (@SCRIPTS) {
10a6ecd2 286 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
287 while (<$SCRIPTSFH>) {
e882dd67 288 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 289 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
290 my $script = lc($3);
291 $script =~ s/\b(\w)/uc($1)/ge;
292 my $subrange = [ $lo, $hi, $script ];
293 push @SCRIPTS, $subrange;
294 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 295 }
296 }
10a6ecd2 297 close($SCRIPTSFH);
e882dd67 298 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
299 }
300 }
10a6ecd2 301}
302
303sub charscript {
304 my $arg = shift;
305
306 _charscripts() unless @SCRIPTS;
e882dd67 307
10a6ecd2 308 my $code = _getcode($arg);
309
310 if (defined $code) {
311 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
312 } else {
313 if (exists $SCRIPTS{$arg}) {
314 return $SCRIPTS{$arg};
315 } else {
316 return;
317 }
318 }
319}
320
321=head2 charblocks
322
1189d1e4 323 use UnicodeCD 'charblocks';
10a6ecd2 324
b08cd201 325 my $charblocks = charblocks();
10a6ecd2 326
b08cd201 327charblocks() returns a reference to a hash with the known block names
328as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 329
330=cut
331
332sub charblocks {
b08cd201 333 _charblocks() unless %BLOCKS;
334 return \%BLOCKS;
10a6ecd2 335}
336
337=head2 charscripts
338
1189d1e4 339 use UnicodeCD 'charscripts';
10a6ecd2 340
341 my %charscripts = charscripts();
342
343charscripts() returns a hash with the known script names as the keys,
344and the code point ranges (see L</charscript>) as the values.
345
346=cut
347
348sub charscripts {
b08cd201 349 _charscripts() unless %SCRIPTS;
350 return \%SCRIPTS;
561c79ed 351}
352
10a6ecd2 353=head2 Blocks versus Scripts
ad9cab37 354
10a6ecd2 355The difference between a block and a script is that scripts are closer
356to the linguistic notion of a set of characters required to present
357languages, while block is more of an artifact of the Unicode character
358numbering and separation into blocks of 256 characters.
3aa957f9 359
360For example the Latin B<script> is spread over several B<blocks>, such
361as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
362C<Latin Extended-B>. On the other hand, the Latin script does not
363contain all the characters of the C<Basic Latin> block (also known as
364the ASCII): it includes only the letters, not for example the digits
365or the punctuation.
ad9cab37 366
3aa957f9 367For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 368
369For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
370
3aa957f9 371=head2 Matching Scripts and Blocks
372
373Both scripts and blocks can be matched using the regular expression
374construct C<\p{In...}> and its negation C<\P{In...}>.
375
376The name of the script or the block comes after the C<In>, for example
377C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2 378removed from the names for the C<\p{In...}>, for example
379C<LatinExtendedA> instead of C<Latin Extended-A>.
380
381There are a few cases where there exists both a script and a block by
382the same name, in these cases the block version has C<Block> appended:
383C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
384
b08cd201 385=head2 Code Point Arguments
386
387A <code point argument> is either a decimal or a hexadecimal scalar,
388or "U+" followed by hexadecimals.
389
10a6ecd2 390=head2 charinrange
391
392In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
393can also test whether a code point is in the I<range> as returned by
394L</charblock> and L</charscript> or as the values of the hash returned
e618509d 395by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 396
1189d1e4 397 use UnicodeCD qw(charscript charinrange);
10a6ecd2 398
399 $range = charscript('Hiragana');
e145285f 400 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 401
402=cut
403
b08cd201 404=head2 compexcl
405
1189d1e4 406 use UnicodeCD 'compexcl';
b08cd201 407
408 my $compexcl = compexcl("09dc");
409
410The compexcl() returns the composition exclusion (that is, if the
9046a8ae 411character should not be produced during a precomposition) of the
412character specified by a B<code point argument>.
b08cd201 413
414If there is a composition exclusion for the character, true is
415returned. Otherwise, false is returned.
416
417=cut
418
419my %COMPEXCL;
420
421sub _compexcl {
422 unless (%COMPEXCL) {
423 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
424 while (<$COMPEXCLFH>) {
425 if (/^([0-9A-F]+) \# /) {
426 my $code = hex($1);
427 $COMPEXCL{$code} = undef;
428 }
429 }
430 close($COMPEXCLFH);
431 }
432 }
433}
434
435sub compexcl {
436 my $arg = shift;
437 my $code = _getcode($arg);
438
439 _compexcl() unless %COMPEXCL;
440
441 return exists $COMPEXCL{$code};
442}
443
444=head2 casefold
445
1189d1e4 446 use UnicodeCD 'casefold';
b08cd201 447
448 my %casefold = casefold("09dc");
449
450The casefold() returns the locale-independent case folding of the
451character specified by a B<code point argument>.
452
453If there is a case folding for that character, a reference to a hash
454with the following fields is returned:
455
456 key
457
458 code code point with at least four hexdigits
459 status "C", "F", "S", or "I"
460 mapping one or more codes separated by spaces
461
462The meaning of the I<status> is as follows:
463
464 C common case folding, common mappings shared
465 by both simple and full mappings
466 F full case folding, mappings that cause strings
467 to grow in length. Multiple characters are separated
468 by spaces
469 S simple case folding, mappings to single characters
470 where different from F
471 I special case for dotted uppercase I and
472 dotless lowercase i
473 - If this mapping is included, the result is
474 case-insensitive, but dotless and dotted I's
475 are not distinguished
476 - If this mapping is excluded, the result is not
477 fully case-insensitive, but dotless and dotted
478 I's are distinguished
479
480If there is no case folding for that character, C<undef> is returned.
481
482For more information about case mappings see
483http://www.unicode.org/unicode/reports/tr21/
484
485=cut
486
487my %CASEFOLD;
488
489sub _casefold {
490 unless (%CASEFOLD) {
491 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
492 while (<$CASEFOLDFH>) {
493 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
494 my $code = hex($1);
495 $CASEFOLD{$code} = { code => $1,
496 status => $2,
497 mapping => $3 };
498 }
499 }
500 close($CASEFOLDFH);
501 }
502 }
503}
504
505sub casefold {
506 my $arg = shift;
507 my $code = _getcode($arg);
508
509 _casefold() unless %CASEFOLD;
510
511 return $CASEFOLD{$code};
512}
513
514=head2 casespec
515
1189d1e4 516 use UnicodeCD 'casespec';
b08cd201 517
518 my %casespec = casespec("09dc");
519
520The casespec() returns the potentially locale-dependent case mapping
521of the character specified by a B<code point argument>. The mapping
522may change the length of the string (which the basic Unicode case
523mappings as returned by charinfo() never do).
524
525If there is a case folding for that character, a reference to a hash
526with the following fields is returned:
527
528 key
529
530 code code point with at least four hexdigits
531 lower lowercase
532 title titlecase
533 upper uppercase
534 condition condition list (may be undef)
535
536The C<condition> is optional. Where present, it consists of one or
537more I<locales> or I<contexts>, separated by spaces (other than as
538used to separate elements, spaces are to be ignored). A condition
539list overrides the normal behavior if all of the listed conditions are
540true. Case distinctions in the condition list are not significant.
541Conditions preceded by "NON_" represent the negation of the condition
542
543A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 544followed by a "_" and a 2-letter ISO language code (possibly followed
545by a "_" and a variant code). You can find the lists of those codes,
546see L<Locale::Country> and L<Locale::Language>.
b08cd201 547
548A I<context> is one of the following choices:
549
550 FINAL The letter is not followed by a letter of
551 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
552 MODERN The mapping is only used for modern text
e618509d 553 AFTER_i The last base character was "i" (U+0069)
b08cd201 554
555For more information about case mappings see
556http://www.unicode.org/unicode/reports/tr21/
557
558=cut
559
560my %CASESPEC;
561
562sub _casespec {
563 unless (%CASESPEC) {
564 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
565 while (<$CASESPECFH>) {
566 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
567 my $code = hex($1);
568 $CASESPEC{$code} = { code => $1,
569 lower => $2,
570 title => $3,
571 upper => $4,
572 condition => $5 };
573 }
574 }
575 close($CASESPECFH);
576 }
577 }
578}
579
580sub casespec {
581 my $arg = shift;
582 my $code = _getcode($arg);
583
584 _casespec() unless %CASESPEC;
585
586 return $CASESPEC{$code};
587}
588
1189d1e4 589=head2 UnicodeCD::UnicodeVersion
10a6ecd2 590
1189d1e4 591UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
10a6ecd2 592Database, in other words, the version of the Unicode standard the
593database implements.
594
595=cut
596
597my $UNICODEVERSION;
598
599sub UnicodeVersion {
600 unless (defined $UNICODEVERSION) {
601 openunicode(\$VERSIONFH, "version");
602 chomp($UNICODEVERSION = <$VERSIONFH>);
603 close($VERSIONFH);
604 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
605 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
606 }
607 return $UNICODEVERSION;
608}
3aa957f9 609
610=head2 Implementation Note
32c16050 611
ad9cab37 612The first use of charinfo() opens a read-only filehandle to the Unicode
613Character Database (the database is included in the Perl distribution).
614The filehandle is then kept open for further queries.
32c16050 615
561c79ed 616=head1 AUTHOR
617
618Jarkko Hietaniemi
619
620=cut
621
6221;