Typo fix in overload docs
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
98225a64 6our $VERSION = '0.24';
561c79ed 7
741297c1 8use Storable qw(dclone);
9
561c79ed 10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2 14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
18 compexcl
a2bd7410 19 casefold casespec
20 namedseq);
561c79ed 21
22use Carp;
23
24=head1 NAME
25
55d7b906 26Unicode::UCD - Unicode character database
561c79ed 27
28=head1 SYNOPSIS
29
55d7b906 30 use Unicode::UCD 'charinfo';
b08cd201 31 my $charinfo = charinfo($codepoint);
561c79ed 32
55d7b906 33 use Unicode::UCD 'charblock';
e882dd67 34 my $charblock = charblock($codepoint);
35
55d7b906 36 use Unicode::UCD 'charscript';
65044554 37 my $charscript = charscript($codepoint);
561c79ed 38
55d7b906 39 use Unicode::UCD 'charblocks';
e145285f 40 my $charblocks = charblocks();
41
55d7b906 42 use Unicode::UCD 'charscripts';
e145285f 43 my %charscripts = charscripts();
44
55d7b906 45 use Unicode::UCD qw(charscript charinrange);
e145285f 46 my $range = charscript($script);
47 print "looks like $script\n" if charinrange($range, $codepoint);
48
55d7b906 49 use Unicode::UCD 'compexcl';
e145285f 50 my $compexcl = compexcl($codepoint);
51
a2bd7410 52 use Unicode::UCD 'namedseq';
53 my $namedseq = namedseq($named_sequence_name);
54
55d7b906 55 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 56
561c79ed 57=head1 DESCRIPTION
58
8b731da2 59The Unicode::UCD module offers a simple interface to the Unicode
60Character Database.
561c79ed 61
62=cut
63
10a6ecd2 64my $UNICODEFH;
65my $BLOCKSFH;
66my $SCRIPTSFH;
67my $VERSIONFH;
b08cd201 68my $COMPEXCLFH;
69my $CASEFOLDFH;
70my $CASESPECFH;
a2bd7410 71my $NAMEDSEQFH;
561c79ed 72
73sub openunicode {
74 my ($rfh, @path) = @_;
75 my $f;
76 unless (defined $$rfh) {
77 for my $d (@INC) {
78 use File::Spec;
55d7b906 79 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 80 last if open($$rfh, $f);
e882dd67 81 undef $f;
561c79ed 82 }
e882dd67 83 croak __PACKAGE__, ": failed to find ",
84 File::Spec->catfile(@path), " in @INC"
85 unless defined $f;
561c79ed 86 }
87 return $f;
88}
89
90=head2 charinfo
91
55d7b906 92 use Unicode::UCD 'charinfo';
561c79ed 93
b08cd201 94 my $charinfo = charinfo(0x41);
561c79ed 95
b08cd201 96charinfo() returns a reference to a hash that has the following fields
97as defined by the Unicode standard:
561c79ed 98
99 key
100
101 code code point with at least four hexdigits
102 name name of the character IN UPPER CASE
103 category general category of the character
104 combining classes used in the Canonical Ordering Algorithm
105 bidi bidirectional category
106 decomposition character decomposition mapping
107 decimal if decimal digit this is the integer numeric value
108 digit if digit this is the numeric value
109 numeric if numeric is the integer or rational numeric value
110 mirrored if mirrored in bidirectional text
111 unicode10 Unicode 1.0 name if existed and different
112 comment ISO 10646 comment field
113 upper uppercase equivalent mapping
114 lower lowercase equivalent mapping
115 title titlecase equivalent mapping
e882dd67 116
561c79ed 117 block block the character belongs to (used in \p{In...})
eb0cc9e3 118 script script the character belongs to
561c79ed 119
b08cd201 120If no match is found, a reference to an empty hash is returned.
561c79ed 121
d1be9408 122The C<block> property is the same as returned by charinfo(). It is
32c16050 123not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 124Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
125(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 126
127Note that you cannot do (de)composition and casing based solely on the
128above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 129you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 130
131=cut
132
0616d9cf 133# NB: This function is duplicated in charnames.pm
10a6ecd2 134sub _getcode {
135 my $arg = shift;
136
dc0a4417 137 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 138 return $arg;
dc0a4417 139 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2 140 return hex($1);
141 }
142
143 return;
144}
145
ac5ea531 146# Lingua::KO::Hangul::Util not part of the standard distribution
147# but it will be used if available.
148
149eval { require Lingua::KO::Hangul::Util };
150my $hasHangulUtil = ! $@;
151if ($hasHangulUtil) {
152 Lingua::KO::Hangul::Util->import();
153}
9087a70b 154
155sub hangul_decomp { # internal: called from charinfo
ac5ea531 156 if ($hasHangulUtil) {
157 my @tmp = decomposeHangul(shift);
158 return sprintf("%04X %04X", @tmp) if @tmp == 2;
159 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
160 }
161 return;
162}
163
164sub hangul_charname { # internal: called from charinfo
165 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 166}
167
9087a70b 168sub han_charname { # internal: called from charinfo
169 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 170}
171
172my @CharinfoRanges = (
173# block name
174# [ first, last, coderef to name, coderef to decompose ],
175# CJK Ideographs Extension A
176 [ 0x3400, 0x4DB5, \&han_charname, undef ],
177# CJK Ideographs
178 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
179# Hangul Syllables
ac5ea531 180 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 181# Non-Private Use High Surrogates
182 [ 0xD800, 0xDB7F, undef, undef ],
183# Private Use High Surrogates
184 [ 0xDB80, 0xDBFF, undef, undef ],
185# Low Surrogates
186 [ 0xDC00, 0xDFFF, undef, undef ],
187# The Private Use Area
188 [ 0xE000, 0xF8FF, undef, undef ],
189# CJK Ideographs Extension B
190 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
191# Plane 15 Private Use Area
192 [ 0xF0000, 0xFFFFD, undef, undef ],
193# Plane 16 Private Use Area
194 [ 0x100000, 0x10FFFD, undef, undef ],
195);
196
561c79ed 197sub charinfo {
10a6ecd2 198 my $arg = shift;
199 my $code = _getcode($arg);
200 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
201 unless defined $code;
e63dbbf9 202 my $hexk = sprintf("%06X", $code);
a6fa416b 203 my($rcode,$rname,$rdec);
204 foreach my $range (@CharinfoRanges){
74f8133e 205 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 206 $rcode = $hexk;
e63dbbf9 207 $rcode =~ s/^0+//;
208 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 209 $rname = $range->[2] ? $range->[2]->($code) : '';
210 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 211 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 212 last;
213 }
214 }
551b6b6f 215 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 216 if (defined $UNICODEFH) {
e63dbbf9 217 use Search::Dict 1.02;
218 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 219 my $line = <$UNICODEFH>;
c5a29f40 220 return unless defined $line;
561c79ed 221 chomp $line;
222 my %prop;
223 @prop{qw(
224 code name category
225 combining bidi decomposition
226 decimal digit numeric
227 mirrored unicode10 comment
228 upper lower title
229 )} = split(/;/, $line, -1);
e63dbbf9 230 $hexk =~ s/^0+//;
231 $hexk = sprintf("%04X", hex($hexk));
561c79ed 232 if ($prop{code} eq $hexk) {
a196fbfd 233 $prop{block} = charblock($code);
234 $prop{script} = charscript($code);
a6fa416b 235 if(defined $rname){
236 $prop{code} = $rcode;
237 $prop{name} = $rname;
238 $prop{decomposition} = $rdec;
239 }
b08cd201 240 return \%prop;
561c79ed 241 }
242 }
243 }
244 return;
245}
246
e882dd67 247sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
248 my ($table, $lo, $hi, $code) = @_;
249
250 return if $lo > $hi;
251
252 my $mid = int(($lo+$hi) / 2);
253
254 if ($table->[$mid]->[0] < $code) {
10a6ecd2 255 if ($table->[$mid]->[1] >= $code) {
e882dd67 256 return $table->[$mid]->[2];
257 } else {
258 _search($table, $mid + 1, $hi, $code);
259 }
260 } elsif ($table->[$mid]->[0] > $code) {
261 _search($table, $lo, $mid - 1, $code);
262 } else {
263 return $table->[$mid]->[2];
264 }
265}
266
10a6ecd2 267sub charinrange {
268 my ($range, $arg) = @_;
269 my $code = _getcode($arg);
270 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
271 unless defined $code;
272 _search($range, 0, $#$range, $code);
273}
274
354a27bf 275=head2 charblock
561c79ed 276
55d7b906 277 use Unicode::UCD 'charblock';
561c79ed 278
279 my $charblock = charblock(0x41);
10a6ecd2 280 my $charblock = charblock(1234);
281 my $charblock = charblock("0x263a");
282 my $charblock = charblock("U+263a");
283
78bf21c2 284 my $range = charblock('Armenian');
10a6ecd2 285
78bf21c2 286With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 287belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 288positions within all blocks are defined.
10a6ecd2 289
78bf21c2 290See also L</Blocks versus Scripts>.
291
eb0cc9e3 292If supplied with an argument that can't be a code point, charblock() tries
293to do the opposite and interpret the argument as a character block. The
294return value is a I<range>: an anonymous list of lists that contain
a2bd7410 295I<start-of-range>, I<end-of-range> code point pairs. You can test whether
296a code point is in a range using the L</charinrange> function. If the
3c4b39be 297argument is not a known character block, C<undef> is returned.
561c79ed 298
561c79ed 299=cut
300
301my @BLOCKS;
10a6ecd2 302my %BLOCKS;
561c79ed 303
10a6ecd2 304sub _charblocks {
561c79ed 305 unless (@BLOCKS) {
10a6ecd2 306 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 307 local $_;
10a6ecd2 308 while (<$BLOCKSFH>) {
2796c109 309 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 310 my ($lo, $hi) = (hex($1), hex($2));
311 my $subrange = [ $lo, $hi, $3 ];
312 push @BLOCKS, $subrange;
313 push @{$BLOCKS{$3}}, $subrange;
561c79ed 314 }
315 }
10a6ecd2 316 close($BLOCKSFH);
561c79ed 317 }
318 }
10a6ecd2 319}
320
321sub charblock {
322 my $arg = shift;
323
324 _charblocks() unless @BLOCKS;
325
326 my $code = _getcode($arg);
561c79ed 327
10a6ecd2 328 if (defined $code) {
329 _search(\@BLOCKS, 0, $#BLOCKS, $code);
330 } else {
331 if (exists $BLOCKS{$arg}) {
741297c1 332 return dclone $BLOCKS{$arg};
10a6ecd2 333 } else {
334 return;
335 }
336 }
e882dd67 337}
338
339=head2 charscript
340
55d7b906 341 use Unicode::UCD 'charscript';
e882dd67 342
343 my $charscript = charscript(0x41);
10a6ecd2 344 my $charscript = charscript(1234);
345 my $charscript = charscript("U+263a");
e882dd67 346
78bf21c2 347 my $range = charscript('Thai');
10a6ecd2 348
78bf21c2 349With a B<code point argument> charscript() returns the I<script> the
b08cd201 350character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 351
78bf21c2 352See also L</Blocks versus Scripts>.
353
eb0cc9e3 354If supplied with an argument that can't be a code point, charscript() tries
355to do the opposite and interpret the argument as a character script. The
356return value is a I<range>: an anonymous list of lists that contain
357I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
358code point is in a range using the L</charinrange> function. If the
3c4b39be 359argument is not a known character script, C<undef> is returned.
e882dd67 360
e882dd67 361=cut
362
363my @SCRIPTS;
10a6ecd2 364my %SCRIPTS;
e882dd67 365
10a6ecd2 366sub _charscripts {
e882dd67 367 unless (@SCRIPTS) {
10a6ecd2 368 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 369 local $_;
10a6ecd2 370 while (<$SCRIPTSFH>) {
e882dd67 371 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 372 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
373 my $script = lc($3);
374 $script =~ s/\b(\w)/uc($1)/ge;
375 my $subrange = [ $lo, $hi, $script ];
376 push @SCRIPTS, $subrange;
377 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 378 }
379 }
10a6ecd2 380 close($SCRIPTSFH);
e882dd67 381 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
382 }
383 }
10a6ecd2 384}
385
386sub charscript {
387 my $arg = shift;
388
389 _charscripts() unless @SCRIPTS;
e882dd67 390
10a6ecd2 391 my $code = _getcode($arg);
392
393 if (defined $code) {
394 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
395 } else {
396 if (exists $SCRIPTS{$arg}) {
741297c1 397 return dclone $SCRIPTS{$arg};
10a6ecd2 398 } else {
399 return;
400 }
401 }
402}
403
404=head2 charblocks
405
55d7b906 406 use Unicode::UCD 'charblocks';
10a6ecd2 407
b08cd201 408 my $charblocks = charblocks();
10a6ecd2 409
b08cd201 410charblocks() returns a reference to a hash with the known block names
411as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 412
78bf21c2 413See also L</Blocks versus Scripts>.
414
10a6ecd2 415=cut
416
417sub charblocks {
b08cd201 418 _charblocks() unless %BLOCKS;
741297c1 419 return dclone \%BLOCKS;
10a6ecd2 420}
421
422=head2 charscripts
423
55d7b906 424 use Unicode::UCD 'charscripts';
10a6ecd2 425
426 my %charscripts = charscripts();
427
428charscripts() returns a hash with the known script names as the keys,
429and the code point ranges (see L</charscript>) as the values.
430
78bf21c2 431See also L</Blocks versus Scripts>.
432
10a6ecd2 433=cut
434
435sub charscripts {
b08cd201 436 _charscripts() unless %SCRIPTS;
741297c1 437 return dclone \%SCRIPTS;
561c79ed 438}
439
10a6ecd2 440=head2 Blocks versus Scripts
ad9cab37 441
10a6ecd2 442The difference between a block and a script is that scripts are closer
443to the linguistic notion of a set of characters required to present
444languages, while block is more of an artifact of the Unicode character
eb0cc9e3 445numbering and separation into blocks of (mostly) 256 characters.
3aa957f9 446
447For example the Latin B<script> is spread over several B<blocks>, such
448as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
449C<Latin Extended-B>. On the other hand, the Latin script does not
450contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 451the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 452or the punctuation.
ad9cab37 453
3aa957f9 454For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 455
456For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
457
3aa957f9 458=head2 Matching Scripts and Blocks
459
eb0cc9e3 460Scripts are matched with the regular-expression construct
461C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
462while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
463any of the 256 code points in the Tibetan block).
10a6ecd2 464
b08cd201 465=head2 Code Point Arguments
466
92e830a9 467A I<code point argument> is either a decimal or a hexadecimal scalar
468designating a Unicode character, or C<U+> followed by hexadecimals
dc0a4417 469designating a Unicode character. In other words, if you want a code
470point to be interpreted as a hexadecimal number, you must prefix it
43adb1d9 471with either C<0x> or C<U+>, because a string like e.g. C<123> will
dc0a4417 472be interpreted as a decimal code point. Also note that Unicode is
473B<not> limited to 16 bits (the number of Unicode characters is
474open-ended, in theory unlimited): you may have more than 4 hexdigits.
b08cd201 475
10a6ecd2 476=head2 charinrange
477
478In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
479can also test whether a code point is in the I<range> as returned by
480L</charblock> and L</charscript> or as the values of the hash returned
e618509d 481by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 482
55d7b906 483 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 484
485 $range = charscript('Hiragana');
e145285f 486 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 487
488=cut
489
b08cd201 490=head2 compexcl
491
55d7b906 492 use Unicode::UCD 'compexcl';
b08cd201 493
494 my $compexcl = compexcl("09dc");
495
496The compexcl() returns the composition exclusion (that is, if the
9046a8ae 497character should not be produced during a precomposition) of the
498character specified by a B<code point argument>.
b08cd201 499
500If there is a composition exclusion for the character, true is
501returned. Otherwise, false is returned.
502
503=cut
504
505my %COMPEXCL;
506
507sub _compexcl {
508 unless (%COMPEXCL) {
551b6b6f 509 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 510 local $_;
b08cd201 511 while (<$COMPEXCLFH>) {
822ebcc8 512 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201 513 my $code = hex($1);
514 $COMPEXCL{$code} = undef;
515 }
516 }
517 close($COMPEXCLFH);
518 }
519 }
520}
521
522sub compexcl {
523 my $arg = shift;
524 my $code = _getcode($arg);
74f8133e 525 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
526 unless defined $code;
b08cd201 527
528 _compexcl() unless %COMPEXCL;
529
530 return exists $COMPEXCL{$code};
531}
532
533=head2 casefold
534
55d7b906 535 use Unicode::UCD 'casefold';
b08cd201 536
82c0b05b 537 my $casefold = casefold("00DF");
b08cd201 538
539The casefold() returns the locale-independent case folding of the
540character specified by a B<code point argument>.
541
542If there is a case folding for that character, a reference to a hash
543with the following fields is returned:
544
545 key
546
547 code code point with at least four hexdigits
548 status "C", "F", "S", or "I"
549 mapping one or more codes separated by spaces
550
551The meaning of the I<status> is as follows:
552
553 C common case folding, common mappings shared
554 by both simple and full mappings
555 F full case folding, mappings that cause strings
556 to grow in length. Multiple characters are separated
557 by spaces
558 S simple case folding, mappings to single characters
559 where different from F
560 I special case for dotted uppercase I and
561 dotless lowercase i
562 - If this mapping is included, the result is
563 case-insensitive, but dotless and dotted I's
564 are not distinguished
565 - If this mapping is excluded, the result is not
566 fully case-insensitive, but dotless and dotted
567 I's are distinguished
568
569If there is no case folding for that character, C<undef> is returned.
570
571For more information about case mappings see
572http://www.unicode.org/unicode/reports/tr21/
573
574=cut
575
576my %CASEFOLD;
577
578sub _casefold {
579 unless (%CASEFOLD) {
551b6b6f 580 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 581 local $_;
b08cd201 582 while (<$CASEFOLDFH>) {
583 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
584 my $code = hex($1);
585 $CASEFOLD{$code} = { code => $1,
586 status => $2,
587 mapping => $3 };
588 }
589 }
590 close($CASEFOLDFH);
591 }
592 }
593}
594
595sub casefold {
596 my $arg = shift;
597 my $code = _getcode($arg);
74f8133e 598 croak __PACKAGE__, "::casefold: unknown code '$arg'"
599 unless defined $code;
b08cd201 600
601 _casefold() unless %CASEFOLD;
602
603 return $CASEFOLD{$code};
604}
605
606=head2 casespec
607
55d7b906 608 use Unicode::UCD 'casespec';
b08cd201 609
82c0b05b 610 my $casespec = casespec("FB00");
b08cd201 611
612The casespec() returns the potentially locale-dependent case mapping
613of the character specified by a B<code point argument>. The mapping
614may change the length of the string (which the basic Unicode case
615mappings as returned by charinfo() never do).
616
617If there is a case folding for that character, a reference to a hash
618with the following fields is returned:
619
620 key
621
622 code code point with at least four hexdigits
623 lower lowercase
624 title titlecase
625 upper uppercase
626 condition condition list (may be undef)
627
628The C<condition> is optional. Where present, it consists of one or
629more I<locales> or I<contexts>, separated by spaces (other than as
630used to separate elements, spaces are to be ignored). A condition
631list overrides the normal behavior if all of the listed conditions are
632true. Case distinctions in the condition list are not significant.
82c0b05b 633Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 634
f499c386 635Note that when there are multiple case folding definitions for a
636single code point because of different locales, the value returned by
637casespec() is a hash reference which has the locales as the keys and
638hash references as described above as the values.
639
b08cd201 640A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 641followed by a "_" and a 2-letter ISO language code (possibly followed
642by a "_" and a variant code). You can find the lists of those codes,
643see L<Locale::Country> and L<Locale::Language>.
b08cd201 644
645A I<context> is one of the following choices:
646
647 FINAL The letter is not followed by a letter of
648 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
649 MODERN The mapping is only used for modern text
e618509d 650 AFTER_i The last base character was "i" (U+0069)
b08cd201 651
652For more information about case mappings see
653http://www.unicode.org/unicode/reports/tr21/
654
655=cut
656
657my %CASESPEC;
658
659sub _casespec {
660 unless (%CASESPEC) {
551b6b6f 661 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 662 local $_;
b08cd201 663 while (<$CASESPECFH>) {
664 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 665 my ($hexcode, $lower, $title, $upper, $condition) =
666 ($1, $2, $3, $4, $5);
667 my $code = hex($hexcode);
668 if (exists $CASESPEC{$code}) {
669 if (exists $CASESPEC{$code}->{code}) {
670 my ($oldlower,
671 $oldtitle,
672 $oldupper,
673 $oldcondition) =
674 @{$CASESPEC{$code}}{qw(lower
675 title
676 upper
677 condition)};
822ebcc8 678 if (defined $oldcondition) {
679 my ($oldlocale) =
f499c386 680 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386 681 delete $CASESPEC{$code};
682 $CASESPEC{$code}->{$oldlocale} =
683 { code => $hexcode,
684 lower => $oldlower,
685 title => $oldtitle,
686 upper => $oldupper,
687 condition => $oldcondition };
f499c386 688 }
689 }
690 my ($locale) =
691 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
692 $CASESPEC{$code}->{$locale} =
693 { code => $hexcode,
694 lower => $lower,
695 title => $title,
696 upper => $upper,
697 condition => $condition };
698 } else {
699 $CASESPEC{$code} =
700 { code => $hexcode,
701 lower => $lower,
702 title => $title,
703 upper => $upper,
704 condition => $condition };
705 }
b08cd201 706 }
707 }
708 close($CASESPECFH);
709 }
710 }
711}
712
713sub casespec {
714 my $arg = shift;
715 my $code = _getcode($arg);
74f8133e 716 croak __PACKAGE__, "::casespec: unknown code '$arg'"
717 unless defined $code;
b08cd201 718
719 _casespec() unless %CASESPEC;
720
741297c1 721 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201 722}
723
a2bd7410 724=head2 namedseq()
725
726 use Unicode::UCD 'namedseq';
727
728 my $namedseq = namedseq("KATAKANA LETTER AINU P");
729 my @namedseq = namedseq("KATAKANA LETTER AINU P");
730 my %namedseq = namedseq();
731
732If used with a single argument in a scalar context, returns the string
733consisting of the code points of the named sequence, or C<undef> if no
734named sequence by that name exists. If used with a single argument in
735a list context, returns list of the code points. If used with no
736arguments in a list context, returns a hash with the names of the
737named sequences as the keys and the named sequences as strings as
738the values. Otherwise, returns C<undef> or empty list depending
739on the context.
740
741(New from Unicode 4.1.0)
742
743=cut
744
745my %NAMEDSEQ;
746
747sub _namedseq {
748 unless (%NAMEDSEQ) {
749 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
750 local $_;
751 while (<$NAMEDSEQFH>) {
752 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
753 my ($n, $s) = ($1, $2);
754 my @s = map { chr(hex($_)) } split(' ', $s);
755 $NAMEDSEQ{$n} = join("", @s);
756 }
757 }
758 close($NAMEDSEQFH);
759 }
760 }
761}
762
763sub namedseq {
764 _namedseq() unless %NAMEDSEQ;
765 my $wantarray = wantarray();
766 if (defined $wantarray) {
767 if ($wantarray) {
768 if (@_ == 0) {
769 return %NAMEDSEQ;
770 } elsif (@_ == 1) {
771 my $s = $NAMEDSEQ{ $_[0] };
772 return defined $s ? map { ord($_) } split('', $s) : ();
773 }
774 } elsif (@_ == 1) {
775 return $NAMEDSEQ{ $_[0] };
776 }
777 }
778 return;
779}
780
55d7b906 781=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 782
55d7b906 783Unicode::UCD::UnicodeVersion() returns the version of the Unicode
784Character Database, in other words, the version of the Unicode
78bf21c2 785standard the database implements. The version is a string
786of numbers delimited by dots (C<'.'>).
10a6ecd2 787
788=cut
789
790my $UNICODEVERSION;
791
792sub UnicodeVersion {
793 unless (defined $UNICODEVERSION) {
794 openunicode(\$VERSIONFH, "version");
795 chomp($UNICODEVERSION = <$VERSIONFH>);
796 close($VERSIONFH);
797 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
798 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
799 }
800 return $UNICODEVERSION;
801}
3aa957f9 802
803=head2 Implementation Note
32c16050 804
ad9cab37 805The first use of charinfo() opens a read-only filehandle to the Unicode
806Character Database (the database is included in the Perl distribution).
78bf21c2 807The filehandle is then kept open for further queries. In other words,
808if you are wondering where one of your filehandles went, that's where.
32c16050 809
8b731da2 810=head1 BUGS
811
812Does not yet support EBCDIC platforms.
813
561c79ed 814=head1 AUTHOR
815
816Jarkko Hietaniemi
817
818=cut
819
8201;