Add compexcl(), casefold(), and casespec() interfaces;
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed 1package Unicode::UCD;
2
3use strict;
4use warnings;
5
10a6ecd2 6our $VERSION = '0.1';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
10a6ecd2 11our @EXPORT_OK = qw(charinfo
12 charblock charscript
13 charblocks charscripts
b08cd201 14 charinrange
15 compexcl
16 casefold casespec);
561c79ed 17
18use Carp;
19
20=head1 NAME
21
00f2772c 22Unicode::UCD - Unicode character database
561c79ed 23
24=head1 SYNOPSIS
25
561c79ed 26 use Unicode::UCD 'charinfo';
b08cd201 27 my $charinfo = charinfo($codepoint);
561c79ed 28
29 use Unicode::UCD 'charblock';
e882dd67 30 my $charblock = charblock($codepoint);
31
32 use Unicode::UCD 'charscript';
33 my $charscript = charblock($codepoint);
561c79ed 34
35=head1 DESCRIPTION
36
37The Unicode module offers a simple interface to the Unicode Character
38Database.
39
40=cut
41
10a6ecd2 42my $UNICODEFH;
43my $BLOCKSFH;
44my $SCRIPTSFH;
45my $VERSIONFH;
b08cd201 46my $COMPEXCLFH;
47my $CASEFOLDFH;
48my $CASESPECFH;
561c79ed 49
50sub openunicode {
51 my ($rfh, @path) = @_;
52 my $f;
53 unless (defined $$rfh) {
54 for my $d (@INC) {
55 use File::Spec;
56 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 57 last if open($$rfh, $f);
e882dd67 58 undef $f;
561c79ed 59 }
e882dd67 60 croak __PACKAGE__, ": failed to find ",
61 File::Spec->catfile(@path), " in @INC"
62 unless defined $f;
561c79ed 63 }
64 return $f;
65}
66
67=head2 charinfo
68
69 use Unicode::UCD 'charinfo';
70
b08cd201 71 my $charinfo = charinfo(0x41);
561c79ed 72
b08cd201 73charinfo() returns a reference to a hash that has the following fields
74as defined by the Unicode standard:
561c79ed 75
76 key
77
78 code code point with at least four hexdigits
79 name name of the character IN UPPER CASE
80 category general category of the character
81 combining classes used in the Canonical Ordering Algorithm
82 bidi bidirectional category
83 decomposition character decomposition mapping
84 decimal if decimal digit this is the integer numeric value
85 digit if digit this is the numeric value
86 numeric if numeric is the integer or rational numeric value
87 mirrored if mirrored in bidirectional text
88 unicode10 Unicode 1.0 name if existed and different
89 comment ISO 10646 comment field
90 upper uppercase equivalent mapping
91 lower lowercase equivalent mapping
92 title titlecase equivalent mapping
e882dd67 93
561c79ed 94 block block the character belongs to (used in \p{In...})
e882dd67 95 script script the character belongs to
561c79ed 96
b08cd201 97If no match is found, a reference to an empty hash is returned.
561c79ed 98
32c16050 99The C<block> property is the same as as returned by charinfo(). It is
100not defined in the Unicode Character Database proper (Chapter 4 of the
101Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 102of TUS3). Similarly for the C<script> property.
32c16050 103
104Note that you cannot do (de)composition and casing based solely on the
105above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 106you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 107
108=cut
109
10a6ecd2 110sub _getcode {
111 my $arg = shift;
112
113 if ($arg =~ /^\d+$/) {
114 return $arg;
115 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
116 return hex($1);
117 }
118
119 return;
120}
121
561c79ed 122sub charinfo {
10a6ecd2 123 my $arg = shift;
124 my $code = _getcode($arg);
125 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
126 unless defined $code;
561c79ed 127 my $hexk = sprintf("%04X", $code);
128
10a6ecd2 129 openunicode(\$UNICODEFH, "Unicode.txt");
130 if (defined $UNICODEFH) {
561c79ed 131 use Search::Dict;
10a6ecd2 132 if (look($UNICODEFH, "$hexk;") >= 0) {
133 my $line = <$UNICODEFH>;
561c79ed 134 chomp $line;
135 my %prop;
136 @prop{qw(
137 code name category
138 combining bidi decomposition
139 decimal digit numeric
140 mirrored unicode10 comment
141 upper lower title
142 )} = split(/;/, $line, -1);
143 if ($prop{code} eq $hexk) {
a196fbfd 144 $prop{block} = charblock($code);
145 $prop{script} = charscript($code);
b08cd201 146 return \%prop;
561c79ed 147 }
148 }
149 }
150 return;
151}
152
e882dd67 153sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
154 my ($table, $lo, $hi, $code) = @_;
155
156 return if $lo > $hi;
157
158 my $mid = int(($lo+$hi) / 2);
159
160 if ($table->[$mid]->[0] < $code) {
10a6ecd2 161 if ($table->[$mid]->[1] >= $code) {
e882dd67 162 return $table->[$mid]->[2];
163 } else {
164 _search($table, $mid + 1, $hi, $code);
165 }
166 } elsif ($table->[$mid]->[0] > $code) {
167 _search($table, $lo, $mid - 1, $code);
168 } else {
169 return $table->[$mid]->[2];
170 }
171}
172
10a6ecd2 173sub charinrange {
174 my ($range, $arg) = @_;
175 my $code = _getcode($arg);
176 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
177 unless defined $code;
178 _search($range, 0, $#$range, $code);
179}
180
354a27bf 181=head2 charblock
561c79ed 182
183 use Unicode::UCD 'charblock';
184
185 my $charblock = charblock(0x41);
10a6ecd2 186 my $charblock = charblock(1234);
187 my $charblock = charblock("0x263a");
188 my $charblock = charblock("U+263a");
189
190 my $ranges = charblock('Armenian');
191
192With a B<code point argument> charblock() returns the block the character
193belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 194positions within all blocks are defined.
10a6ecd2 195
196If supplied with an argument that can't be a code point, charblock()
197tries to do the opposite and interpret the argument as a character
198block. The return value is a I<range>: an anonymous list that
199contains anonymous lists, which in turn contain I<start-of-range>,
200I<end-of-range> code point pairs. You can test whether a code point
201is in a range using the L</charinrange> function. If the argument is
202not a known charater block, C<undef> is returned.
561c79ed 203
561c79ed 204=cut
205
206my @BLOCKS;
10a6ecd2 207my %BLOCKS;
561c79ed 208
10a6ecd2 209sub _charblocks {
561c79ed 210 unless (@BLOCKS) {
10a6ecd2 211 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
212 while (<$BLOCKSFH>) {
2796c109 213 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 214 my ($lo, $hi) = (hex($1), hex($2));
215 my $subrange = [ $lo, $hi, $3 ];
216 push @BLOCKS, $subrange;
217 push @{$BLOCKS{$3}}, $subrange;
561c79ed 218 }
219 }
10a6ecd2 220 close($BLOCKSFH);
561c79ed 221 }
222 }
10a6ecd2 223}
224
225sub charblock {
226 my $arg = shift;
227
228 _charblocks() unless @BLOCKS;
229
230 my $code = _getcode($arg);
561c79ed 231
10a6ecd2 232 if (defined $code) {
233 _search(\@BLOCKS, 0, $#BLOCKS, $code);
234 } else {
235 if (exists $BLOCKS{$arg}) {
236 return $BLOCKS{$arg};
237 } else {
238 return;
239 }
240 }
e882dd67 241}
242
243=head2 charscript
244
245 use Unicode::UCD 'charscript';
246
247 my $charscript = charscript(0x41);
10a6ecd2 248 my $charscript = charscript(1234);
249 my $charscript = charscript("U+263a");
e882dd67 250
10a6ecd2 251 my $ranges = charscript('Thai');
252
253With a B<code point argument> charscript() returns the script the
b08cd201 254character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 255
256If supplied with an argument that can't be a code point, charscript()
257tries to do the opposite and interpret the argument as a character
258script. The return value is a I<range>: an anonymous list that
259contains anonymous lists, which in turn contain I<start-of-range>,
260I<end-of-range> code point pairs. You can test whether a code point
261is in a range using the L</charinrange> function. If the argument is
262not a known charater script, C<undef> is returned.
e882dd67 263
e882dd67 264=cut
265
266my @SCRIPTS;
10a6ecd2 267my %SCRIPTS;
e882dd67 268
10a6ecd2 269sub _charscripts {
e882dd67 270 unless (@SCRIPTS) {
10a6ecd2 271 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
272 while (<$SCRIPTSFH>) {
e882dd67 273 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 274 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
275 my $script = lc($3);
276 $script =~ s/\b(\w)/uc($1)/ge;
277 my $subrange = [ $lo, $hi, $script ];
278 push @SCRIPTS, $subrange;
279 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 280 }
281 }
10a6ecd2 282 close($SCRIPTSFH);
e882dd67 283 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
284 }
285 }
10a6ecd2 286}
287
288sub charscript {
289 my $arg = shift;
290
291 _charscripts() unless @SCRIPTS;
e882dd67 292
10a6ecd2 293 my $code = _getcode($arg);
294
295 if (defined $code) {
296 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
297 } else {
298 if (exists $SCRIPTS{$arg}) {
299 return $SCRIPTS{$arg};
300 } else {
301 return;
302 }
303 }
304}
305
306=head2 charblocks
307
308 use Unicode::UCD 'charblocks';
309
b08cd201 310 my $charblocks = charblocks();
10a6ecd2 311
b08cd201 312charblocks() returns a reference to a hash with the known block names
313as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 314
315=cut
316
317sub charblocks {
b08cd201 318 _charblocks() unless %BLOCKS;
319 return \%BLOCKS;
10a6ecd2 320}
321
322=head2 charscripts
323
324 use Unicode::UCD 'charscripts';
325
326 my %charscripts = charscripts();
327
328charscripts() returns a hash with the known script names as the keys,
329and the code point ranges (see L</charscript>) as the values.
330
331=cut
332
333sub charscripts {
b08cd201 334 _charscripts() unless %SCRIPTS;
335 return \%SCRIPTS;
561c79ed 336}
337
10a6ecd2 338=head2 Blocks versus Scripts
ad9cab37 339
10a6ecd2 340The difference between a block and a script is that scripts are closer
341to the linguistic notion of a set of characters required to present
342languages, while block is more of an artifact of the Unicode character
343numbering and separation into blocks of 256 characters.
3aa957f9 344
345For example the Latin B<script> is spread over several B<blocks>, such
346as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
347C<Latin Extended-B>. On the other hand, the Latin script does not
348contain all the characters of the C<Basic Latin> block (also known as
349the ASCII): it includes only the letters, not for example the digits
350or the punctuation.
ad9cab37 351
3aa957f9 352For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 353
354For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
355
3aa957f9 356=head2 Matching Scripts and Blocks
357
358Both scripts and blocks can be matched using the regular expression
359construct C<\p{In...}> and its negation C<\P{In...}>.
360
361The name of the script or the block comes after the C<In>, for example
362C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2 363removed from the names for the C<\p{In...}>, for example
364C<LatinExtendedA> instead of C<Latin Extended-A>.
365
366There are a few cases where there exists both a script and a block by
367the same name, in these cases the block version has C<Block> appended:
368C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
369
b08cd201 370=head2 Code Point Arguments
371
372A <code point argument> is either a decimal or a hexadecimal scalar,
373or "U+" followed by hexadecimals.
374
10a6ecd2 375=head2 charinrange
376
377In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
378can also test whether a code point is in the I<range> as returned by
379L</charblock> and L</charscript> or as the values of the hash returned
380by L</charblocks> and </charscripts> by using charinrange():
381
382 use Unicode::UCD qw(charscript charinrange);
383
384 $range = charscript('Hiragana');
385 print "looks like hiragana\n" if charinrange($range, $code);
386
387=cut
388
b08cd201 389=head2 compexcl
390
391 use Unicode::UCD 'compexcl';
392
393 my $compexcl = compexcl("09dc");
394
395The compexcl() returns the composition exclusion (that is, if the
396character cannot be decomposed) of the character specified by a B<code
397point argument>.
398
399If there is a composition exclusion for the character, true is
400returned. Otherwise, false is returned.
401
402=cut
403
404my %COMPEXCL;
405
406sub _compexcl {
407 unless (%COMPEXCL) {
408 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
409 while (<$COMPEXCLFH>) {
410 if (/^([0-9A-F]+) \# /) {
411 my $code = hex($1);
412 $COMPEXCL{$code} = undef;
413 }
414 }
415 close($COMPEXCLFH);
416 }
417 }
418}
419
420sub compexcl {
421 my $arg = shift;
422 my $code = _getcode($arg);
423
424 _compexcl() unless %COMPEXCL;
425
426 return exists $COMPEXCL{$code};
427}
428
429=head2 casefold
430
431 use Unicode::UCD 'casefold';
432
433 my %casefold = casefold("09dc");
434
435The casefold() returns the locale-independent case folding of the
436character specified by a B<code point argument>.
437
438If there is a case folding for that character, a reference to a hash
439with the following fields is returned:
440
441 key
442
443 code code point with at least four hexdigits
444 status "C", "F", "S", or "I"
445 mapping one or more codes separated by spaces
446
447The meaning of the I<status> is as follows:
448
449 C common case folding, common mappings shared
450 by both simple and full mappings
451 F full case folding, mappings that cause strings
452 to grow in length. Multiple characters are separated
453 by spaces
454 S simple case folding, mappings to single characters
455 where different from F
456 I special case for dotted uppercase I and
457 dotless lowercase i
458 - If this mapping is included, the result is
459 case-insensitive, but dotless and dotted I's
460 are not distinguished
461 - If this mapping is excluded, the result is not
462 fully case-insensitive, but dotless and dotted
463 I's are distinguished
464
465If there is no case folding for that character, C<undef> is returned.
466
467For more information about case mappings see
468http://www.unicode.org/unicode/reports/tr21/
469
470=cut
471
472my %CASEFOLD;
473
474sub _casefold {
475 unless (%CASEFOLD) {
476 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
477 while (<$CASEFOLDFH>) {
478 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
479 my $code = hex($1);
480 $CASEFOLD{$code} = { code => $1,
481 status => $2,
482 mapping => $3 };
483 }
484 }
485 close($CASEFOLDFH);
486 }
487 }
488}
489
490sub casefold {
491 my $arg = shift;
492 my $code = _getcode($arg);
493
494 _casefold() unless %CASEFOLD;
495
496 return $CASEFOLD{$code};
497}
498
499=head2 casespec
500
501 use Unicode::UCD 'casespec';
502
503 my %casespec = casespec("09dc");
504
505The casespec() returns the potentially locale-dependent case mapping
506of the character specified by a B<code point argument>. The mapping
507may change the length of the string (which the basic Unicode case
508mappings as returned by charinfo() never do).
509
510If there is a case folding for that character, a reference to a hash
511with the following fields is returned:
512
513 key
514
515 code code point with at least four hexdigits
516 lower lowercase
517 title titlecase
518 upper uppercase
519 condition condition list (may be undef)
520
521The C<condition> is optional. Where present, it consists of one or
522more I<locales> or I<contexts>, separated by spaces (other than as
523used to separate elements, spaces are to be ignored). A condition
524list overrides the normal behavior if all of the listed conditions are
525true. Case distinctions in the condition list are not significant.
526Conditions preceded by "NON_" represent the negation of the condition
527
528A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
529followed by a "_" and a 2-letter ISO language code (, possibly followed
530by a "_" and a variant code). You can find the list of those codes
531in L<Locale::Country> and L<Locale::Language>.
532
533A I<context> is one of the following choices:
534
535 FINAL The letter is not followed by a letter of
536 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
537 MODERN The mapping is only used for modern text
538 AFTER_i The last base character was "i" 0069
539
540For more information about case mappings see
541http://www.unicode.org/unicode/reports/tr21/
542
543=cut
544
545my %CASESPEC;
546
547sub _casespec {
548 unless (%CASESPEC) {
549 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
550 while (<$CASESPECFH>) {
551 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
552 my $code = hex($1);
553 $CASESPEC{$code} = { code => $1,
554 lower => $2,
555 title => $3,
556 upper => $4,
557 condition => $5 };
558 }
559 }
560 close($CASESPECFH);
561 }
562 }
563}
564
565sub casespec {
566 my $arg = shift;
567 my $code = _getcode($arg);
568
569 _casespec() unless %CASESPEC;
570
571 return $CASESPEC{$code};
572}
573
10a6ecd2 574=head2 Unicode::UCD::UnicodeVersion
575
576Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
577Database, in other words, the version of the Unicode standard the
578database implements.
579
580=cut
581
582my $UNICODEVERSION;
583
584sub UnicodeVersion {
585 unless (defined $UNICODEVERSION) {
586 openunicode(\$VERSIONFH, "version");
587 chomp($UNICODEVERSION = <$VERSIONFH>);
588 close($VERSIONFH);
589 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
590 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
591 }
592 return $UNICODEVERSION;
593}
3aa957f9 594
595=head2 Implementation Note
32c16050 596
ad9cab37 597The first use of charinfo() opens a read-only filehandle to the Unicode
598Character Database (the database is included in the Perl distribution).
599The filehandle is then kept open for further queries.
32c16050 600
561c79ed 601=head1 AUTHOR
602
603Jarkko Hietaniemi
604
605=cut
606
6071;