Clarify the doc (and the code) for Unicode code points.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2 12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201 15 charinrange
16 compexcl
17 casefold casespec);
561c79ed 18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed 24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67 31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f 37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f 40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f 43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f 47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed 51=head1 DESCRIPTION
52
8b731da2 53The Unicode::UCD module offers a simple interface to the Unicode
54Character Database.
561c79ed 55
56=cut
57
10a6ecd2 58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201 62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed 65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67 76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed 79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201 89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed 91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
eb0cc9e3 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
d1be9408 115The C<block> property is the same as returned by charinfo(). It is
32c16050 116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 123
124=cut
125
10a6ecd2 126sub _getcode {
127 my $arg = shift;
128
dc0a4417 129 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 130 return $arg;
dc0a4417 131 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2 132 return hex($1);
133 }
134
135 return;
136}
137
ac5ea531 138# Lingua::KO::Hangul::Util not part of the standard distribution
139# but it will be used if available.
140
141eval { require Lingua::KO::Hangul::Util };
142my $hasHangulUtil = ! $@;
143if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
145}
9087a70b 146
147sub hangul_decomp { # internal: called from charinfo
ac5ea531 148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
152 }
153 return;
154}
155
156sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 158}
159
9087a70b 160sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 162}
163
164my @CharinfoRanges = (
165# block name
166# [ first, last, coderef to name, coderef to decompose ],
167# CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
169# CJK Ideographs
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
171# Hangul Syllables
ac5ea531 172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 173# Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175# Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
177# Low Surrogates
178 [ 0xDC00, 0xDFFF, undef, undef ],
179# The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181# CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183# Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185# Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
187);
188
561c79ed 189sub charinfo {
10a6ecd2 190 my $arg = shift;
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
e63dbbf9 194 my $hexk = sprintf("%06X", $code);
a6fa416b 195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
74f8133e 197 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 198 $rcode = $hexk;
e63dbbf9 199 $rcode =~ s/^0+//;
200 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 204 last;
205 }
206 }
551b6b6f 207 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 208 if (defined $UNICODEFH) {
e63dbbf9 209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 211 my $line = <$UNICODEFH>;
561c79ed 212 chomp $line;
213 my %prop;
214 @prop{qw(
215 code name category
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
219 upper lower title
220 )} = split(/;/, $line, -1);
e63dbbf9 221 $hexk =~ s/^0+//;
222 $hexk = sprintf("%04X", hex($hexk));
561c79ed 223 if ($prop{code} eq $hexk) {
a196fbfd 224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
a6fa416b 226 if(defined $rname){
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
230 }
b08cd201 231 return \%prop;
561c79ed 232 }
233 }
234 }
235 return;
236}
237
e882dd67 238sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
240
241 return if $lo > $hi;
242
243 my $mid = int(($lo+$hi) / 2);
244
245 if ($table->[$mid]->[0] < $code) {
10a6ecd2 246 if ($table->[$mid]->[1] >= $code) {
e882dd67 247 return $table->[$mid]->[2];
248 } else {
249 _search($table, $mid + 1, $hi, $code);
250 }
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
253 } else {
254 return $table->[$mid]->[2];
255 }
256}
257
10a6ecd2 258sub charinrange {
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
264}
265
354a27bf 266=head2 charblock
561c79ed 267
55d7b906 268 use Unicode::UCD 'charblock';
561c79ed 269
270 my $charblock = charblock(0x41);
10a6ecd2 271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
274
78bf21c2 275 my $range = charblock('Armenian');
10a6ecd2 276
78bf21c2 277With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 278belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 279positions within all blocks are defined.
10a6ecd2 280
78bf21c2 281See also L</Blocks versus Scripts>.
282
eb0cc9e3 283If supplied with an argument that can't be a code point, charblock() tries
284to do the opposite and interpret the argument as a character block. The
285return value is a I<range>: an anonymous list of lists that contain
286I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287code point is in a range using the L</charinrange> function. If the
288argument is not a known charater block, C<undef> is returned.
561c79ed 289
561c79ed 290=cut
291
292my @BLOCKS;
10a6ecd2 293my %BLOCKS;
561c79ed 294
10a6ecd2 295sub _charblocks {
561c79ed 296 unless (@BLOCKS) {
10a6ecd2 297 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 298 local $_;
10a6ecd2 299 while (<$BLOCKSFH>) {
2796c109 300 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 301 my ($lo, $hi) = (hex($1), hex($2));
302 my $subrange = [ $lo, $hi, $3 ];
303 push @BLOCKS, $subrange;
304 push @{$BLOCKS{$3}}, $subrange;
561c79ed 305 }
306 }
10a6ecd2 307 close($BLOCKSFH);
561c79ed 308 }
309 }
10a6ecd2 310}
311
312sub charblock {
313 my $arg = shift;
314
315 _charblocks() unless @BLOCKS;
316
317 my $code = _getcode($arg);
561c79ed 318
10a6ecd2 319 if (defined $code) {
320 _search(\@BLOCKS, 0, $#BLOCKS, $code);
321 } else {
322 if (exists $BLOCKS{$arg}) {
323 return $BLOCKS{$arg};
324 } else {
325 return;
326 }
327 }
e882dd67 328}
329
330=head2 charscript
331
55d7b906 332 use Unicode::UCD 'charscript';
e882dd67 333
334 my $charscript = charscript(0x41);
10a6ecd2 335 my $charscript = charscript(1234);
336 my $charscript = charscript("U+263a");
e882dd67 337
78bf21c2 338 my $range = charscript('Thai');
10a6ecd2 339
78bf21c2 340With a B<code point argument> charscript() returns the I<script> the
b08cd201 341character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 342
78bf21c2 343See also L</Blocks versus Scripts>.
344
eb0cc9e3 345If supplied with an argument that can't be a code point, charscript() tries
346to do the opposite and interpret the argument as a character script. The
347return value is a I<range>: an anonymous list of lists that contain
348I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
349code point is in a range using the L</charinrange> function. If the
350argument is not a known charater script, C<undef> is returned.
e882dd67 351
e882dd67 352=cut
353
354my @SCRIPTS;
10a6ecd2 355my %SCRIPTS;
e882dd67 356
10a6ecd2 357sub _charscripts {
e882dd67 358 unless (@SCRIPTS) {
10a6ecd2 359 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 360 local $_;
10a6ecd2 361 while (<$SCRIPTSFH>) {
e882dd67 362 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 363 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
364 my $script = lc($3);
365 $script =~ s/\b(\w)/uc($1)/ge;
366 my $subrange = [ $lo, $hi, $script ];
367 push @SCRIPTS, $subrange;
368 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 369 }
370 }
10a6ecd2 371 close($SCRIPTSFH);
e882dd67 372 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
373 }
374 }
10a6ecd2 375}
376
377sub charscript {
378 my $arg = shift;
379
380 _charscripts() unless @SCRIPTS;
e882dd67 381
10a6ecd2 382 my $code = _getcode($arg);
383
384 if (defined $code) {
385 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
386 } else {
387 if (exists $SCRIPTS{$arg}) {
388 return $SCRIPTS{$arg};
389 } else {
390 return;
391 }
392 }
393}
394
395=head2 charblocks
396
55d7b906 397 use Unicode::UCD 'charblocks';
10a6ecd2 398
b08cd201 399 my $charblocks = charblocks();
10a6ecd2 400
b08cd201 401charblocks() returns a reference to a hash with the known block names
402as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 403
78bf21c2 404See also L</Blocks versus Scripts>.
405
10a6ecd2 406=cut
407
408sub charblocks {
b08cd201 409 _charblocks() unless %BLOCKS;
410 return \%BLOCKS;
10a6ecd2 411}
412
413=head2 charscripts
414
55d7b906 415 use Unicode::UCD 'charscripts';
10a6ecd2 416
417 my %charscripts = charscripts();
418
419charscripts() returns a hash with the known script names as the keys,
420and the code point ranges (see L</charscript>) as the values.
421
78bf21c2 422See also L</Blocks versus Scripts>.
423
10a6ecd2 424=cut
425
426sub charscripts {
b08cd201 427 _charscripts() unless %SCRIPTS;
428 return \%SCRIPTS;
561c79ed 429}
430
10a6ecd2 431=head2 Blocks versus Scripts
ad9cab37 432
10a6ecd2 433The difference between a block and a script is that scripts are closer
434to the linguistic notion of a set of characters required to present
435languages, while block is more of an artifact of the Unicode character
eb0cc9e3 436numbering and separation into blocks of (mostly) 256 characters.
3aa957f9 437
438For example the Latin B<script> is spread over several B<blocks>, such
439as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
440C<Latin Extended-B>. On the other hand, the Latin script does not
441contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 442the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 443or the punctuation.
ad9cab37 444
3aa957f9 445For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 446
447For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
448
3aa957f9 449=head2 Matching Scripts and Blocks
450
eb0cc9e3 451Scripts are matched with the regular-expression construct
452C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
453while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
454any of the 256 code points in the Tibetan block).
10a6ecd2 455
b08cd201 456=head2 Code Point Arguments
457
92e830a9 458A I<code point argument> is either a decimal or a hexadecimal scalar
459designating a Unicode character, or C<U+> followed by hexadecimals
dc0a4417 460designating a Unicode character. In other words, if you want a code
461point to be interpreted as a hexadecimal number, you must prefix it
462with either C<0x> or C<U+>, becauseq a string like e.g. C<123> will
463be interpreted as a decimal code point. Also note that Unicode is
464B<not> limited to 16 bits (the number of Unicode characters is
465open-ended, in theory unlimited): you may have more than 4 hexdigits.
b08cd201 466
10a6ecd2 467=head2 charinrange
468
469In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
470can also test whether a code point is in the I<range> as returned by
471L</charblock> and L</charscript> or as the values of the hash returned
e618509d 472by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 473
55d7b906 474 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 475
476 $range = charscript('Hiragana');
e145285f 477 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 478
479=cut
480
b08cd201 481=head2 compexcl
482
55d7b906 483 use Unicode::UCD 'compexcl';
b08cd201 484
485 my $compexcl = compexcl("09dc");
486
487The compexcl() returns the composition exclusion (that is, if the
9046a8ae 488character should not be produced during a precomposition) of the
489character specified by a B<code point argument>.
b08cd201 490
491If there is a composition exclusion for the character, true is
492returned. Otherwise, false is returned.
493
494=cut
495
496my %COMPEXCL;
497
498sub _compexcl {
499 unless (%COMPEXCL) {
551b6b6f 500 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 501 local $_;
b08cd201 502 while (<$COMPEXCLFH>) {
822ebcc8 503 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201 504 my $code = hex($1);
505 $COMPEXCL{$code} = undef;
506 }
507 }
508 close($COMPEXCLFH);
509 }
510 }
511}
512
513sub compexcl {
514 my $arg = shift;
515 my $code = _getcode($arg);
74f8133e 516 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
517 unless defined $code;
b08cd201 518
519 _compexcl() unless %COMPEXCL;
520
521 return exists $COMPEXCL{$code};
522}
523
524=head2 casefold
525
55d7b906 526 use Unicode::UCD 'casefold';
b08cd201 527
82c0b05b 528 my $casefold = casefold("00DF");
b08cd201 529
530The casefold() returns the locale-independent case folding of the
531character specified by a B<code point argument>.
532
533If there is a case folding for that character, a reference to a hash
534with the following fields is returned:
535
536 key
537
538 code code point with at least four hexdigits
539 status "C", "F", "S", or "I"
540 mapping one or more codes separated by spaces
541
542The meaning of the I<status> is as follows:
543
544 C common case folding, common mappings shared
545 by both simple and full mappings
546 F full case folding, mappings that cause strings
547 to grow in length. Multiple characters are separated
548 by spaces
549 S simple case folding, mappings to single characters
550 where different from F
551 I special case for dotted uppercase I and
552 dotless lowercase i
553 - If this mapping is included, the result is
554 case-insensitive, but dotless and dotted I's
555 are not distinguished
556 - If this mapping is excluded, the result is not
557 fully case-insensitive, but dotless and dotted
558 I's are distinguished
559
560If there is no case folding for that character, C<undef> is returned.
561
562For more information about case mappings see
563http://www.unicode.org/unicode/reports/tr21/
564
565=cut
566
567my %CASEFOLD;
568
569sub _casefold {
570 unless (%CASEFOLD) {
551b6b6f 571 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 572 local $_;
b08cd201 573 while (<$CASEFOLDFH>) {
574 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
575 my $code = hex($1);
576 $CASEFOLD{$code} = { code => $1,
577 status => $2,
578 mapping => $3 };
579 }
580 }
581 close($CASEFOLDFH);
582 }
583 }
584}
585
586sub casefold {
587 my $arg = shift;
588 my $code = _getcode($arg);
74f8133e 589 croak __PACKAGE__, "::casefold: unknown code '$arg'"
590 unless defined $code;
b08cd201 591
592 _casefold() unless %CASEFOLD;
593
594 return $CASEFOLD{$code};
595}
596
597=head2 casespec
598
55d7b906 599 use Unicode::UCD 'casespec';
b08cd201 600
82c0b05b 601 my $casespec = casespec("FB00");
b08cd201 602
603The casespec() returns the potentially locale-dependent case mapping
604of the character specified by a B<code point argument>. The mapping
605may change the length of the string (which the basic Unicode case
606mappings as returned by charinfo() never do).
607
608If there is a case folding for that character, a reference to a hash
609with the following fields is returned:
610
611 key
612
613 code code point with at least four hexdigits
614 lower lowercase
615 title titlecase
616 upper uppercase
617 condition condition list (may be undef)
618
619The C<condition> is optional. Where present, it consists of one or
620more I<locales> or I<contexts>, separated by spaces (other than as
621used to separate elements, spaces are to be ignored). A condition
622list overrides the normal behavior if all of the listed conditions are
623true. Case distinctions in the condition list are not significant.
82c0b05b 624Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 625
f499c386 626Note that when there are multiple case folding definitions for a
627single code point because of different locales, the value returned by
628casespec() is a hash reference which has the locales as the keys and
629hash references as described above as the values.
630
b08cd201 631A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 632followed by a "_" and a 2-letter ISO language code (possibly followed
633by a "_" and a variant code). You can find the lists of those codes,
634see L<Locale::Country> and L<Locale::Language>.
b08cd201 635
636A I<context> is one of the following choices:
637
638 FINAL The letter is not followed by a letter of
639 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
640 MODERN The mapping is only used for modern text
e618509d 641 AFTER_i The last base character was "i" (U+0069)
b08cd201 642
643For more information about case mappings see
644http://www.unicode.org/unicode/reports/tr21/
645
646=cut
647
648my %CASESPEC;
649
650sub _casespec {
651 unless (%CASESPEC) {
551b6b6f 652 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 653 local $_;
b08cd201 654 while (<$CASESPECFH>) {
655 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 656 my ($hexcode, $lower, $title, $upper, $condition) =
657 ($1, $2, $3, $4, $5);
658 my $code = hex($hexcode);
659 if (exists $CASESPEC{$code}) {
660 if (exists $CASESPEC{$code}->{code}) {
661 my ($oldlower,
662 $oldtitle,
663 $oldupper,
664 $oldcondition) =
665 @{$CASESPEC{$code}}{qw(lower
666 title
667 upper
668 condition)};
822ebcc8 669 if (defined $oldcondition) {
670 my ($oldlocale) =
f499c386 671 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386 672 delete $CASESPEC{$code};
673 $CASESPEC{$code}->{$oldlocale} =
674 { code => $hexcode,
675 lower => $oldlower,
676 title => $oldtitle,
677 upper => $oldupper,
678 condition => $oldcondition };
f499c386 679 }
680 }
681 my ($locale) =
682 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
683 $CASESPEC{$code}->{$locale} =
684 { code => $hexcode,
685 lower => $lower,
686 title => $title,
687 upper => $upper,
688 condition => $condition };
689 } else {
690 $CASESPEC{$code} =
691 { code => $hexcode,
692 lower => $lower,
693 title => $title,
694 upper => $upper,
695 condition => $condition };
696 }
b08cd201 697 }
698 }
699 close($CASESPECFH);
700 }
701 }
702}
703
704sub casespec {
705 my $arg = shift;
706 my $code = _getcode($arg);
74f8133e 707 croak __PACKAGE__, "::casespec: unknown code '$arg'"
708 unless defined $code;
b08cd201 709
710 _casespec() unless %CASESPEC;
711
712 return $CASESPEC{$code};
713}
714
55d7b906 715=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 716
55d7b906 717Unicode::UCD::UnicodeVersion() returns the version of the Unicode
718Character Database, in other words, the version of the Unicode
78bf21c2 719standard the database implements. The version is a string
720of numbers delimited by dots (C<'.'>).
10a6ecd2 721
722=cut
723
724my $UNICODEVERSION;
725
726sub UnicodeVersion {
727 unless (defined $UNICODEVERSION) {
728 openunicode(\$VERSIONFH, "version");
729 chomp($UNICODEVERSION = <$VERSIONFH>);
730 close($VERSIONFH);
731 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
732 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
733 }
734 return $UNICODEVERSION;
735}
3aa957f9 736
737=head2 Implementation Note
32c16050 738
ad9cab37 739The first use of charinfo() opens a read-only filehandle to the Unicode
740Character Database (the database is included in the Perl distribution).
78bf21c2 741The filehandle is then kept open for further queries. In other words,
742if you are wondering where one of your filehandles went, that's where.
32c16050 743
8b731da2 744=head1 BUGS
745
746Does not yet support EBCDIC platforms.
747
561c79ed 748=head1 AUTHOR
749
750Jarkko Hietaniemi
751
752=cut
753
7541;