Add Encode's META.yml.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2 12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201 15 charinrange
16 compexcl
17 casefold casespec);
561c79ed 18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed 24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67 31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f 37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f 40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f 43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f 47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed 51=head1 DESCRIPTION
52
8b731da2 53The Unicode::UCD module offers a simple interface to the Unicode
54Character Database.
561c79ed 55
56=cut
57
10a6ecd2 58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201 62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed 65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67 76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed 79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201 89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed 91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
eb0cc9e3 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
d1be9408 115The C<block> property is the same as returned by charinfo(). It is
32c16050 116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 123
124=cut
125
10a6ecd2 126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
ac5ea531 138# Lingua::KO::Hangul::Util not part of the standard distribution
139# but it will be used if available.
140
141eval { require Lingua::KO::Hangul::Util };
142my $hasHangulUtil = ! $@;
143if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
145}
9087a70b 146
147sub hangul_decomp { # internal: called from charinfo
ac5ea531 148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
152 }
153 return;
154}
155
156sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 158}
159
9087a70b 160sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 162}
163
164my @CharinfoRanges = (
165# block name
166# [ first, last, coderef to name, coderef to decompose ],
167# CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
169# CJK Ideographs
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
171# Hangul Syllables
ac5ea531 172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 173# Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175# Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
177# Low Surrogates
178 [ 0xDC00, 0xDFFF, undef, undef ],
179# The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181# CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183# Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185# Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
187);
188
561c79ed 189sub charinfo {
10a6ecd2 190 my $arg = shift;
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
e63dbbf9 194 my $hexk = sprintf("%06X", $code);
a6fa416b 195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
74f8133e 197 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 198 $rcode = $hexk;
e63dbbf9 199 $rcode =~ s/^0+//;
200 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 204 last;
205 }
206 }
551b6b6f 207 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 208 if (defined $UNICODEFH) {
e63dbbf9 209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 211 my $line = <$UNICODEFH>;
561c79ed 212 chomp $line;
213 my %prop;
214 @prop{qw(
215 code name category
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
219 upper lower title
220 )} = split(/;/, $line, -1);
e63dbbf9 221 $hexk =~ s/^0+//;
222 $hexk = sprintf("%04X", hex($hexk));
561c79ed 223 if ($prop{code} eq $hexk) {
a196fbfd 224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
a6fa416b 226 if(defined $rname){
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
230 }
b08cd201 231 return \%prop;
561c79ed 232 }
233 }
234 }
235 return;
236}
237
e882dd67 238sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
240
241 return if $lo > $hi;
242
243 my $mid = int(($lo+$hi) / 2);
244
245 if ($table->[$mid]->[0] < $code) {
10a6ecd2 246 if ($table->[$mid]->[1] >= $code) {
e882dd67 247 return $table->[$mid]->[2];
248 } else {
249 _search($table, $mid + 1, $hi, $code);
250 }
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
253 } else {
254 return $table->[$mid]->[2];
255 }
256}
257
10a6ecd2 258sub charinrange {
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
264}
265
354a27bf 266=head2 charblock
561c79ed 267
55d7b906 268 use Unicode::UCD 'charblock';
561c79ed 269
270 my $charblock = charblock(0x41);
10a6ecd2 271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
274
78bf21c2 275 my $range = charblock('Armenian');
10a6ecd2 276
78bf21c2 277With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 278belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 279positions within all blocks are defined.
10a6ecd2 280
78bf21c2 281See also L</Blocks versus Scripts>.
282
eb0cc9e3 283If supplied with an argument that can't be a code point, charblock() tries
284to do the opposite and interpret the argument as a character block. The
285return value is a I<range>: an anonymous list of lists that contain
286I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287code point is in a range using the L</charinrange> function. If the
288argument is not a known charater block, C<undef> is returned.
561c79ed 289
561c79ed 290=cut
291
292my @BLOCKS;
10a6ecd2 293my %BLOCKS;
561c79ed 294
10a6ecd2 295sub _charblocks {
561c79ed 296 unless (@BLOCKS) {
10a6ecd2 297 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 298 local $_;
10a6ecd2 299 while (<$BLOCKSFH>) {
2796c109 300 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 301 my ($lo, $hi) = (hex($1), hex($2));
302 my $subrange = [ $lo, $hi, $3 ];
303 push @BLOCKS, $subrange;
304 push @{$BLOCKS{$3}}, $subrange;
561c79ed 305 }
306 }
10a6ecd2 307 close($BLOCKSFH);
561c79ed 308 }
309 }
10a6ecd2 310}
311
312sub charblock {
313 my $arg = shift;
314
315 _charblocks() unless @BLOCKS;
316
317 my $code = _getcode($arg);
561c79ed 318
10a6ecd2 319 if (defined $code) {
320 _search(\@BLOCKS, 0, $#BLOCKS, $code);
321 } else {
322 if (exists $BLOCKS{$arg}) {
323 return $BLOCKS{$arg};
324 } else {
325 return;
326 }
327 }
e882dd67 328}
329
330=head2 charscript
331
55d7b906 332 use Unicode::UCD 'charscript';
e882dd67 333
334 my $charscript = charscript(0x41);
10a6ecd2 335 my $charscript = charscript(1234);
336 my $charscript = charscript("U+263a");
e882dd67 337
78bf21c2 338 my $range = charscript('Thai');
10a6ecd2 339
78bf21c2 340With a B<code point argument> charscript() returns the I<script> the
b08cd201 341character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 342
78bf21c2 343See also L</Blocks versus Scripts>.
344
eb0cc9e3 345If supplied with an argument that can't be a code point, charscript() tries
346to do the opposite and interpret the argument as a character script. The
347return value is a I<range>: an anonymous list of lists that contain
348I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
349code point is in a range using the L</charinrange> function. If the
350argument is not a known charater script, C<undef> is returned.
e882dd67 351
e882dd67 352=cut
353
354my @SCRIPTS;
10a6ecd2 355my %SCRIPTS;
e882dd67 356
10a6ecd2 357sub _charscripts {
e882dd67 358 unless (@SCRIPTS) {
10a6ecd2 359 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 360 local $_;
10a6ecd2 361 while (<$SCRIPTSFH>) {
e882dd67 362 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 363 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
364 my $script = lc($3);
365 $script =~ s/\b(\w)/uc($1)/ge;
366 my $subrange = [ $lo, $hi, $script ];
367 push @SCRIPTS, $subrange;
368 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 369 }
370 }
10a6ecd2 371 close($SCRIPTSFH);
e882dd67 372 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
373 }
374 }
10a6ecd2 375}
376
377sub charscript {
378 my $arg = shift;
379
380 _charscripts() unless @SCRIPTS;
e882dd67 381
10a6ecd2 382 my $code = _getcode($arg);
383
384 if (defined $code) {
385 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
386 } else {
387 if (exists $SCRIPTS{$arg}) {
388 return $SCRIPTS{$arg};
389 } else {
390 return;
391 }
392 }
393}
394
395=head2 charblocks
396
55d7b906 397 use Unicode::UCD 'charblocks';
10a6ecd2 398
b08cd201 399 my $charblocks = charblocks();
10a6ecd2 400
b08cd201 401charblocks() returns a reference to a hash with the known block names
402as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 403
78bf21c2 404See also L</Blocks versus Scripts>.
405
10a6ecd2 406=cut
407
408sub charblocks {
b08cd201 409 _charblocks() unless %BLOCKS;
410 return \%BLOCKS;
10a6ecd2 411}
412
413=head2 charscripts
414
55d7b906 415 use Unicode::UCD 'charscripts';
10a6ecd2 416
417 my %charscripts = charscripts();
418
419charscripts() returns a hash with the known script names as the keys,
420and the code point ranges (see L</charscript>) as the values.
421
78bf21c2 422See also L</Blocks versus Scripts>.
423
10a6ecd2 424=cut
425
426sub charscripts {
b08cd201 427 _charscripts() unless %SCRIPTS;
428 return \%SCRIPTS;
561c79ed 429}
430
10a6ecd2 431=head2 Blocks versus Scripts
ad9cab37 432
10a6ecd2 433The difference between a block and a script is that scripts are closer
434to the linguistic notion of a set of characters required to present
435languages, while block is more of an artifact of the Unicode character
eb0cc9e3 436numbering and separation into blocks of (mostly) 256 characters.
3aa957f9 437
438For example the Latin B<script> is spread over several B<blocks>, such
439as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
440C<Latin Extended-B>. On the other hand, the Latin script does not
441contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 442the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 443or the punctuation.
ad9cab37 444
3aa957f9 445For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 446
447For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
448
3aa957f9 449=head2 Matching Scripts and Blocks
450
eb0cc9e3 451Scripts are matched with the regular-expression construct
452C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
453while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
454any of the 256 code points in the Tibetan block).
10a6ecd2 455
b08cd201 456=head2 Code Point Arguments
457
92e830a9 458A I<code point argument> is either a decimal or a hexadecimal scalar
459designating a Unicode character, or C<U+> followed by hexadecimals
78bf21c2 460designating a Unicode character. Note that Unicode is B<not> limited
461to 16 bits (the number of Unicode characters is open-ended, in theory
462unlimited): you may have more than 4 hexdigits.
b08cd201 463
10a6ecd2 464=head2 charinrange
465
466In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
467can also test whether a code point is in the I<range> as returned by
468L</charblock> and L</charscript> or as the values of the hash returned
e618509d 469by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 470
55d7b906 471 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 472
473 $range = charscript('Hiragana');
e145285f 474 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 475
476=cut
477
b08cd201 478=head2 compexcl
479
55d7b906 480 use Unicode::UCD 'compexcl';
b08cd201 481
482 my $compexcl = compexcl("09dc");
483
484The compexcl() returns the composition exclusion (that is, if the
9046a8ae 485character should not be produced during a precomposition) of the
486character specified by a B<code point argument>.
b08cd201 487
488If there is a composition exclusion for the character, true is
489returned. Otherwise, false is returned.
490
491=cut
492
493my %COMPEXCL;
494
495sub _compexcl {
496 unless (%COMPEXCL) {
551b6b6f 497 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 498 local $_;
b08cd201 499 while (<$COMPEXCLFH>) {
822ebcc8 500 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201 501 my $code = hex($1);
502 $COMPEXCL{$code} = undef;
503 }
504 }
505 close($COMPEXCLFH);
506 }
507 }
508}
509
510sub compexcl {
511 my $arg = shift;
512 my $code = _getcode($arg);
74f8133e 513 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
514 unless defined $code;
b08cd201 515
516 _compexcl() unless %COMPEXCL;
517
518 return exists $COMPEXCL{$code};
519}
520
521=head2 casefold
522
55d7b906 523 use Unicode::UCD 'casefold';
b08cd201 524
11785058 525 my $casefold = casefold("09dc");
b08cd201 526
527The casefold() returns the locale-independent case folding of the
528character specified by a B<code point argument>.
529
530If there is a case folding for that character, a reference to a hash
531with the following fields is returned:
532
533 key
534
535 code code point with at least four hexdigits
536 status "C", "F", "S", or "I"
537 mapping one or more codes separated by spaces
538
539The meaning of the I<status> is as follows:
540
541 C common case folding, common mappings shared
542 by both simple and full mappings
543 F full case folding, mappings that cause strings
544 to grow in length. Multiple characters are separated
545 by spaces
546 S simple case folding, mappings to single characters
547 where different from F
548 I special case for dotted uppercase I and
549 dotless lowercase i
550 - If this mapping is included, the result is
551 case-insensitive, but dotless and dotted I's
552 are not distinguished
553 - If this mapping is excluded, the result is not
554 fully case-insensitive, but dotless and dotted
555 I's are distinguished
556
557If there is no case folding for that character, C<undef> is returned.
558
559For more information about case mappings see
560http://www.unicode.org/unicode/reports/tr21/
561
562=cut
563
564my %CASEFOLD;
565
566sub _casefold {
567 unless (%CASEFOLD) {
551b6b6f 568 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 569 local $_;
b08cd201 570 while (<$CASEFOLDFH>) {
571 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
572 my $code = hex($1);
573 $CASEFOLD{$code} = { code => $1,
574 status => $2,
575 mapping => $3 };
576 }
577 }
578 close($CASEFOLDFH);
579 }
580 }
581}
582
583sub casefold {
584 my $arg = shift;
585 my $code = _getcode($arg);
74f8133e 586 croak __PACKAGE__, "::casefold: unknown code '$arg'"
587 unless defined $code;
b08cd201 588
589 _casefold() unless %CASEFOLD;
590
591 return $CASEFOLD{$code};
592}
593
594=head2 casespec
595
55d7b906 596 use Unicode::UCD 'casespec';
b08cd201 597
11785058 598 my $casespec = casespec("09dc");
b08cd201 599
600The casespec() returns the potentially locale-dependent case mapping
601of the character specified by a B<code point argument>. The mapping
602may change the length of the string (which the basic Unicode case
603mappings as returned by charinfo() never do).
604
605If there is a case folding for that character, a reference to a hash
606with the following fields is returned:
607
608 key
609
610 code code point with at least four hexdigits
611 lower lowercase
612 title titlecase
613 upper uppercase
614 condition condition list (may be undef)
615
616The C<condition> is optional. Where present, it consists of one or
617more I<locales> or I<contexts>, separated by spaces (other than as
618used to separate elements, spaces are to be ignored). A condition
619list overrides the normal behavior if all of the listed conditions are
620true. Case distinctions in the condition list are not significant.
621Conditions preceded by "NON_" represent the negation of the condition
622
f499c386 623Note that when there are multiple case folding definitions for a
624single code point because of different locales, the value returned by
625casespec() is a hash reference which has the locales as the keys and
626hash references as described above as the values.
627
b08cd201 628A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 629followed by a "_" and a 2-letter ISO language code (possibly followed
630by a "_" and a variant code). You can find the lists of those codes,
631see L<Locale::Country> and L<Locale::Language>.
b08cd201 632
633A I<context> is one of the following choices:
634
635 FINAL The letter is not followed by a letter of
636 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
637 MODERN The mapping is only used for modern text
e618509d 638 AFTER_i The last base character was "i" (U+0069)
b08cd201 639
640For more information about case mappings see
641http://www.unicode.org/unicode/reports/tr21/
642
643=cut
644
645my %CASESPEC;
646
647sub _casespec {
648 unless (%CASESPEC) {
551b6b6f 649 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 650 local $_;
b08cd201 651 while (<$CASESPECFH>) {
652 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 653 my ($hexcode, $lower, $title, $upper, $condition) =
654 ($1, $2, $3, $4, $5);
655 my $code = hex($hexcode);
656 if (exists $CASESPEC{$code}) {
657 if (exists $CASESPEC{$code}->{code}) {
658 my ($oldlower,
659 $oldtitle,
660 $oldupper,
661 $oldcondition) =
662 @{$CASESPEC{$code}}{qw(lower
663 title
664 upper
665 condition)};
822ebcc8 666 if (defined $oldcondition) {
667 my ($oldlocale) =
f499c386 668 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386 669 delete $CASESPEC{$code};
670 $CASESPEC{$code}->{$oldlocale} =
671 { code => $hexcode,
672 lower => $oldlower,
673 title => $oldtitle,
674 upper => $oldupper,
675 condition => $oldcondition };
f499c386 676 }
677 }
678 my ($locale) =
679 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
680 $CASESPEC{$code}->{$locale} =
681 { code => $hexcode,
682 lower => $lower,
683 title => $title,
684 upper => $upper,
685 condition => $condition };
686 } else {
687 $CASESPEC{$code} =
688 { code => $hexcode,
689 lower => $lower,
690 title => $title,
691 upper => $upper,
692 condition => $condition };
693 }
b08cd201 694 }
695 }
696 close($CASESPECFH);
697 }
698 }
699}
700
701sub casespec {
702 my $arg = shift;
703 my $code = _getcode($arg);
74f8133e 704 croak __PACKAGE__, "::casespec: unknown code '$arg'"
705 unless defined $code;
b08cd201 706
707 _casespec() unless %CASESPEC;
708
709 return $CASESPEC{$code};
710}
711
55d7b906 712=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 713
55d7b906 714Unicode::UCD::UnicodeVersion() returns the version of the Unicode
715Character Database, in other words, the version of the Unicode
78bf21c2 716standard the database implements. The version is a string
717of numbers delimited by dots (C<'.'>).
10a6ecd2 718
719=cut
720
721my $UNICODEVERSION;
722
723sub UnicodeVersion {
724 unless (defined $UNICODEVERSION) {
725 openunicode(\$VERSIONFH, "version");
726 chomp($UNICODEVERSION = <$VERSIONFH>);
727 close($VERSIONFH);
728 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
729 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
730 }
731 return $UNICODEVERSION;
732}
3aa957f9 733
734=head2 Implementation Note
32c16050 735
ad9cab37 736The first use of charinfo() opens a read-only filehandle to the Unicode
737Character Database (the database is included in the Perl distribution).
78bf21c2 738The filehandle is then kept open for further queries. In other words,
739if you are wondering where one of your filehandles went, that's where.
32c16050 740
8b731da2 741=head1 BUGS
742
743Does not yet support EBCDIC platforms.
744
561c79ed 745=head1 AUTHOR
746
747Jarkko Hietaniemi
748
749=cut
750
7511;