lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     compexcl
  19                     casefold casespec);
  20
  21 use Carp;
  22
  23 =head1 NAME
  24
  25 Unicode::UCD - Unicode character database
  26
  27 =head1 SYNOPSIS
  28
  29     use Unicode::UCD 'charinfo';
  30     my $charinfo   = charinfo($codepoint);
  31
  32     use Unicode::UCD 'charblock';
  33     my $charblock  = charblock($codepoint);
  34
  35     use Unicode::UCD 'charscript';
  36     my $charscript = charscript($codepoint);
  37
  38     use Unicode::UCD 'charblocks';
  39     my $charblocks = charblocks();
  40
  41     use Unicode::UCD 'charscripts';
  42     my %charscripts = charscripts();
  43
  44     use Unicode::UCD qw(charscript charinrange);
  45     my $range = charscript($script);
  46     print "looks like $script\n" if charinrange($range, $codepoint);
  47
  48     use Unicode::UCD 'compexcl';
  49     my $compexcl = compexcl($codepoint);
  50
  51     my $unicode_version = Unicode::UCD::UnicodeVersion();
  52
  53 =head1 DESCRIPTION
  54
  55 The Unicode::UCD module offers a simple interface to the Unicode
  56 Character Database.
  57
  58 =cut
  59
  60 my $UNICODEFH;
  61 my $BLOCKSFH;
  62 my $SCRIPTSFH;
  63 my $VERSIONFH;
  64 my $COMPEXCLFH;
  65 my $CASEFOLDFH;
  66 my $CASESPECFH;
  67
  68 sub openunicode {
  69     my ($rfh, @path) = @_;
  70     my $f;
  71     unless (defined $$rfh) {
  72         for my $d (@INC) {
  73             use File::Spec;
  74             $f = File::Spec->catfile($d, "unicore", @path);
  75             last if open($$rfh, $f);
  76             undef $f;
  77         }
  78         croak __PACKAGE__, ": failed to find ",
  79               File::Spec->catfile(@path), " in @INC"
  80             unless defined $f;
  81     }
  82     return $f;
  83 }
  84
  85 =head2 charinfo
  86
  87     use Unicode::UCD 'charinfo';
  88
  89     my $charinfo = charinfo(0x41);
  90
  91 charinfo() returns a reference to a hash that has the following fields
  92 as defined by the Unicode standard:
  93
  94     key
  95
  96     code             code point with at least four hexdigits
  97     name             name of the character IN UPPER CASE
  98     category         general category of the character
  99     combining        classes used in the Canonical Ordering Algorithm
 100     bidi             bidirectional category
 101     decomposition    character decomposition mapping
 102     decimal          if decimal digit this is the integer numeric value
 103     digit            if digit this is the numeric value
 104     numeric          if numeric is the integer or rational numeric value
 105     mirrored         if mirrored in bidirectional text
 106     unicode10        Unicode 1.0 name if existed and different
 107     comment          ISO 10646 comment field
 108     upper            uppercase equivalent mapping
 109     lower            lowercase equivalent mapping
 110     title            titlecase equivalent mapping
 111
 112     block            block the character belongs to (used in \p{In...})
 113     script           script the character belongs to
 114
 115 If no match is found, a reference to an empty hash is returned.
 116
 117 The C<block> property is the same as returned by charinfo().  It is
 118 not defined in the Unicode Character Database proper (Chapter 4 of the
 119 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 120 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 121
 122 Note that you cannot do (de)composition and casing based solely on the
 123 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 124 you will need also the compexcl(), casefold(), and casespec() functions.
 125
 126 =cut
 127
 128 sub _getcode {
 129     my $arg = shift;
 130
 131     if ($arg =~ /^[1-9]\d*$/) {
 132         return $arg;
 133     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 134         return hex($1);
 135     }
 136
 137     return;
 138 }
 139
 140 # Lingua::KO::Hangul::Util not part of the standard distribution
 141 # but it will be used if available.
 142
 143 eval { require Lingua::KO::Hangul::Util };
 144 my $hasHangulUtil = ! $@;
 145 if ($hasHangulUtil) {
 146     Lingua::KO::Hangul::Util->import();
 147 }
 148
 149 sub hangul_decomp { # internal: called from charinfo
 150     if ($hasHangulUtil) {
 151         my @tmp = decomposeHangul(shift);
 152         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 153         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 154     }
 155     return;
 156 }
 157
 158 sub hangul_charname { # internal: called from charinfo
 159     return sprintf("HANGUL SYLLABLE-%04X", shift);
 160 }
 161
 162 sub han_charname { # internal: called from charinfo
 163     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 164 }
 165
 166 my @CharinfoRanges = (
 167 # block name
 168 # [ first, last, coderef to name, coderef to decompose ],
 169 # CJK Ideographs Extension A
 170   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 171 # CJK Ideographs
 172   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 173 # Hangul Syllables
 174   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 175 # Non-Private Use High Surrogates
 176   [ 0xD800,   0xDB7F,   undef,   undef  ],
 177 # Private Use High Surrogates
 178   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 179 # Low Surrogates
 180   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 181 # The Private Use Area
 182   [ 0xE000,   0xF8FF,   undef,   undef  ],
 183 # CJK Ideographs Extension B
 184   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 185 # Plane 15 Private Use Area
 186   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 187 # Plane 16 Private Use Area
 188   [ 0x100000, 0x10FFFD, undef,   undef  ],
 189 );
 190
 191 sub charinfo {
 192     my $arg  = shift;
 193     my $code = _getcode($arg);
 194     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 195         unless defined $code;
 196     my $hexk = sprintf("%06X", $code);
 197     my($rcode,$rname,$rdec);
 198     foreach my $range (@CharinfoRanges){
 199       if ($range->[0] <= $code && $code <= $range->[1]) {
 200         $rcode = $hexk;
 201         $rcode =~ s/^0+//;
 202         $rcode =  sprintf("%04X", hex($rcode));
 203         $rname = $range->[2] ? $range->[2]->($code) : '';
 204         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 205         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 206         last;
 207       }
 208     }
 209     openunicode(\$UNICODEFH, "UnicodeData.txt");
 210     if (defined $UNICODEFH) {
 211         use Search::Dict 1.02;
 212         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 213             my $line = <$UNICODEFH>;
 214             return unless defined $line;
 215             chomp $line;
 216             my %prop;
 217             @prop{qw(
 218                      code name category
 219                      combining bidi decomposition
 220                      decimal digit numeric
 221                      mirrored unicode10 comment
 222                      upper lower title
 223                     )} = split(/;/, $line, -1);
 224             $hexk =~ s/^0+//;
 225             $hexk =  sprintf("%04X", hex($hexk));
 226             if ($prop{code} eq $hexk) {
 227                 $prop{block}  = charblock($code);
 228                 $prop{script} = charscript($code);
 229                 if(defined $rname){
 230                     $prop{code} = $rcode;
 231                     $prop{name} = $rname;
 232                     $prop{decomposition} = $rdec;
 233                 }
 234                 return \%prop;
 235             }
 236         }
 237     }
 238     return;
 239 }
 240
 241 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 242     my ($table, $lo, $hi, $code) = @_;
 243
 244     return if $lo > $hi;
 245
 246     my $mid = int(($lo+$hi) / 2);
 247
 248     if ($table->[$mid]->[0] < $code) {
 249         if ($table->[$mid]->[1] >= $code) {
 250             return $table->[$mid]->[2];
 251         } else {
 252             _search($table, $mid + 1, $hi, $code);
 253         }
 254     } elsif ($table->[$mid]->[0] > $code) {
 255         _search($table, $lo, $mid - 1, $code);
 256     } else {
 257         return $table->[$mid]->[2];
 258     }
 259 }
 260
 261 sub charinrange {
 262     my ($range, $arg) = @_;
 263     my $code = _getcode($arg);
 264     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 265         unless defined $code;
 266     _search($range, 0, $#$range, $code);
 267 }
 268
 269 =head2 charblock
 270
 271     use Unicode::UCD 'charblock';
 272
 273     my $charblock = charblock(0x41);
 274     my $charblock = charblock(1234);
 275     my $charblock = charblock("0x263a");
 276     my $charblock = charblock("U+263a");
 277
 278     my $range     = charblock('Armenian');
 279
 280 With a B<code point argument> charblock() returns the I<block> the character
 281 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 282 positions within all blocks are defined.
 283
 284 See also L</Blocks versus Scripts>.
 285
 286 If supplied with an argument that can't be a code point, charblock() tries
 287 to do the opposite and interpret the argument as a character block. The
 288 return value is a I<range>: an anonymous list of lists that contain
 289 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 290 code point is in a range using the L</charinrange> function. If the
 291 argument is not a known charater block, C<undef> is returned.
 292
 293 =cut
 294
 295 my @BLOCKS;
 296 my %BLOCKS;
 297
 298 sub _charblocks {
 299     unless (@BLOCKS) {
 300         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 301             local $_;
 302             while (<$BLOCKSFH>) {
 303                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 304                     my ($lo, $hi) = (hex($1), hex($2));
 305                     my $subrange = [ $lo, $hi, $3 ];
 306                     push @BLOCKS, $subrange;
 307                     push @{$BLOCKS{$3}}, $subrange;
 308                 }
 309             }
 310             close($BLOCKSFH);
 311         }
 312     }
 313 }
 314
 315 sub charblock {
 316     my $arg = shift;
 317
 318     _charblocks() unless @BLOCKS;
 319
 320     my $code = _getcode($arg);
 321
 322     if (defined $code) {
 323         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 324     } else {
 325         if (exists $BLOCKS{$arg}) {
 326             return dclone $BLOCKS{$arg};
 327         } else {
 328             return;
 329         }
 330     }
 331 }
 332
 333 =head2 charscript
 334
 335     use Unicode::UCD 'charscript';
 336
 337     my $charscript = charscript(0x41);
 338     my $charscript = charscript(1234);
 339     my $charscript = charscript("U+263a");
 340
 341     my $range      = charscript('Thai');
 342
 343 With a B<code point argument> charscript() returns the I<script> the
 344 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 345
 346 See also L</Blocks versus Scripts>.
 347
 348 If supplied with an argument that can't be a code point, charscript() tries
 349 to do the opposite and interpret the argument as a character script. The
 350 return value is a I<range>: an anonymous list of lists that contain
 351 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 352 code point is in a range using the L</charinrange> function. If the
 353 argument is not a known charater script, C<undef> is returned.
 354
 355 =cut
 356
 357 my @SCRIPTS;
 358 my %SCRIPTS;
 359
 360 sub _charscripts {
 361     unless (@SCRIPTS) {
 362         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 363             local $_;
 364             while (<$SCRIPTSFH>) {
 365                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 366                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 367                     my $script = lc($3);
 368                     $script =~ s/\b(\w)/uc($1)/ge;
 369                     my $subrange = [ $lo, $hi, $script ];
 370                     push @SCRIPTS, $subrange;
 371                     push @{$SCRIPTS{$script}}, $subrange;
 372                 }
 373             }
 374             close($SCRIPTSFH);
 375             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 376         }
 377     }
 378 }
 379
 380 sub charscript {
 381     my $arg = shift;
 382
 383     _charscripts() unless @SCRIPTS;
 384
 385     my $code = _getcode($arg);
 386
 387     if (defined $code) {
 388         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 389     } else {
 390         if (exists $SCRIPTS{$arg}) {
 391             return dclone $SCRIPTS{$arg};
 392         } else {
 393             return;
 394         }
 395     }
 396 }
 397
 398 =head2 charblocks
 399
 400     use Unicode::UCD 'charblocks';
 401
 402     my $charblocks = charblocks();
 403
 404 charblocks() returns a reference to a hash with the known block names
 405 as the keys, and the code point ranges (see L</charblock>) as the values.
 406
 407 See also L</Blocks versus Scripts>.
 408
 409 =cut
 410
 411 sub charblocks {
 412     _charblocks() unless %BLOCKS;
 413     return dclone \%BLOCKS;
 414 }
 415
 416 =head2 charscripts
 417
 418     use Unicode::UCD 'charscripts';
 419
 420     my %charscripts = charscripts();
 421
 422 charscripts() returns a hash with the known script names as the keys,
 423 and the code point ranges (see L</charscript>) as the values.
 424
 425 See also L</Blocks versus Scripts>.
 426
 427 =cut
 428
 429 sub charscripts {
 430     _charscripts() unless %SCRIPTS;
 431     return dclone \%SCRIPTS;
 432 }
 433
 434 =head2 Blocks versus Scripts
 435
 436 The difference between a block and a script is that scripts are closer
 437 to the linguistic notion of a set of characters required to present
 438 languages, while block is more of an artifact of the Unicode character
 439 numbering and separation into blocks of (mostly) 256 characters.
 440
 441 For example the Latin B<script> is spread over several B<blocks>, such
 442 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 443 C<Latin Extended-B>.  On the other hand, the Latin script does not
 444 contain all the characters of the C<Basic Latin> block (also known as
 445 the ASCII): it includes only the letters, and not, for example, the digits
 446 or the punctuation.
 447
 448 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 449
 450 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 451
 452 =head2 Matching Scripts and Blocks
 453
 454 Scripts are matched with the regular-expression construct
 455 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
 456 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
 457 any of the 256 code points in the Tibetan block).
 458
 459 =head2 Code Point Arguments
 460
 461 A I<code point argument> is either a decimal or a hexadecimal scalar
 462 designating a Unicode character, or C<U+> followed by hexadecimals
 463 designating a Unicode character.  In other words, if you want a code
 464 point to be interpreted as a hexadecimal number, you must prefix it
 465 with either C<0x> or C<U+>, because a string like e.g. C<123> will
 466 be interpreted as a decimal code point.  Also note that Unicode is
 467 B<not> limited to 16 bits (the number of Unicode characters is
 468 open-ended, in theory unlimited): you may have more than 4 hexdigits.
 469
 470 =head2 charinrange
 471
 472 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 473 can also test whether a code point is in the I<range> as returned by
 474 L</charblock> and L</charscript> or as the values of the hash returned
 475 by L</charblocks> and L</charscripts> by using charinrange():
 476
 477     use Unicode::UCD qw(charscript charinrange);
 478
 479     $range = charscript('Hiragana');
 480     print "looks like hiragana\n" if charinrange($range, $codepoint);
 481
 482 =cut
 483
 484 =head2 compexcl
 485
 486     use Unicode::UCD 'compexcl';
 487
 488     my $compexcl = compexcl("09dc");
 489
 490 The compexcl() returns the composition exclusion (that is, if the
 491 character should not be produced during a precomposition) of the
 492 character specified by a B<code point argument>.
 493
 494 If there is a composition exclusion for the character, true is
 495 returned.  Otherwise, false is returned.
 496
 497 =cut
 498
 499 my %COMPEXCL;
 500
 501 sub _compexcl {
 502     unless (%COMPEXCL) {
 503         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 504             local $_;
 505             while (<$COMPEXCLFH>) {
 506                 if (/^([0-9A-F]+)\s+\#\s+/) {
 507                     my $code = hex($1);
 508                     $COMPEXCL{$code} = undef;
 509                 }
 510             }
 511             close($COMPEXCLFH);
 512         }
 513     }
 514 }
 515
 516 sub compexcl {
 517     my $arg  = shift;
 518     my $code = _getcode($arg);
 519     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 520         unless defined $code;
 521
 522     _compexcl() unless %COMPEXCL;
 523
 524     return exists $COMPEXCL{$code};
 525 }
 526
 527 =head2 casefold
 528
 529     use Unicode::UCD 'casefold';
 530
 531     my $casefold = casefold("00DF");
 532
 533 The casefold() returns the locale-independent case folding of the
 534 character specified by a B<code point argument>.
 535
 536 If there is a case folding for that character, a reference to a hash
 537 with the following fields is returned:
 538
 539     key
 540
 541     code             code point with at least four hexdigits
 542     status           "C", "F", "S", or "I"
 543     mapping          one or more codes separated by spaces
 544
 545 The meaning of the I<status> is as follows:
 546
 547    C                 common case folding, common mappings shared
 548                      by both simple and full mappings
 549    F                 full case folding, mappings that cause strings
 550                      to grow in length. Multiple characters are separated
 551                      by spaces
 552    S                 simple case folding, mappings to single characters
 553                      where different from F
 554    I                 special case for dotted uppercase I and
 555                      dotless lowercase i
 556                      - If this mapping is included, the result is
 557                        case-insensitive, but dotless and dotted I's
 558                        are not distinguished
 559                      - If this mapping is excluded, the result is not
 560                        fully case-insensitive, but dotless and dotted
 561                        I's are distinguished
 562
 563 If there is no case folding for that character, C<undef> is returned.
 564
 565 For more information about case mappings see
 566 http://www.unicode.org/unicode/reports/tr21/
 567
 568 =cut
 569
 570 my %CASEFOLD;
 571
 572 sub _casefold {
 573     unless (%CASEFOLD) {
 574         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 575             local $_;
 576             while (<$CASEFOLDFH>) {
 577                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 578                     my $code = hex($1);
 579                     $CASEFOLD{$code} = { code    => $1,
 580                                          status  => $2,
 581                                          mapping => $3 };
 582                 }
 583             }
 584             close($CASEFOLDFH);
 585         }
 586     }
 587 }
 588
 589 sub casefold {
 590     my $arg  = shift;
 591     my $code = _getcode($arg);
 592     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 593         unless defined $code;
 594
 595     _casefold() unless %CASEFOLD;
 596
 597     return $CASEFOLD{$code};
 598 }
 599
 600 =head2 casespec
 601
 602     use Unicode::UCD 'casespec';
 603
 604     my $casespec = casespec("FB00");
 605
 606 The casespec() returns the potentially locale-dependent case mapping
 607 of the character specified by a B<code point argument>.  The mapping
 608 may change the length of the string (which the basic Unicode case
 609 mappings as returned by charinfo() never do).
 610
 611 If there is a case folding for that character, a reference to a hash
 612 with the following fields is returned:
 613
 614     key
 615
 616     code             code point with at least four hexdigits
 617     lower            lowercase
 618     title            titlecase
 619     upper            uppercase
 620     condition        condition list (may be undef)
 621
 622 The C<condition> is optional.  Where present, it consists of one or
 623 more I<locales> or I<contexts>, separated by spaces (other than as
 624 used to separate elements, spaces are to be ignored).  A condition
 625 list overrides the normal behavior if all of the listed conditions are
 626 true.  Case distinctions in the condition list are not significant.
 627 Conditions preceded by "NON_" represent the negation of the condition.
 628
 629 Note that when there are multiple case folding definitions for a
 630 single code point because of different locales, the value returned by
 631 casespec() is a hash reference which has the locales as the keys and
 632 hash references as described above as the values.
 633
 634 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 635 followed by a "_" and a 2-letter ISO language code (possibly followed
 636 by a "_" and a variant code).  You can find the lists of those codes,
 637 see L<Locale::Country> and L<Locale::Language>.
 638
 639 A I<context> is one of the following choices:
 640
 641     FINAL            The letter is not followed by a letter of
 642                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 643     MODERN           The mapping is only used for modern text
 644     AFTER_i          The last base character was "i" (U+0069)
 645
 646 For more information about case mappings see
 647 http://www.unicode.org/unicode/reports/tr21/
 648
 649 =cut
 650
 651 my %CASESPEC;
 652
 653 sub _casespec {
 654     unless (%CASESPEC) {
 655         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
 656             local $_;
 657             while (<$CASESPECFH>) {
 658                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 659                     my ($hexcode, $lower, $title, $upper, $condition) =
 660                         ($1, $2, $3, $4, $5);
 661                     my $code = hex($hexcode);
 662                     if (exists $CASESPEC{$code}) {
 663                         if (exists $CASESPEC{$code}->{code}) {
 664                             my ($oldlower,
 665                                 $oldtitle,
 666                                 $oldupper,
 667                                 $oldcondition) =
 668                                     @{$CASESPEC{$code}}{qw(lower
 669                                                            title
 670                                                            upper
 671                                                            condition)};
 672                             if (defined $oldcondition) {
 673                                 my ($oldlocale) =
 674                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 675                                 delete $CASESPEC{$code};
 676                                 $CASESPEC{$code}->{$oldlocale} =
 677                                 { code      => $hexcode,
 678                                   lower     => $oldlower,
 679                                   title     => $oldtitle,
 680                                   upper     => $oldupper,
 681                                   condition => $oldcondition };
 682                             }
 683                         }
 684                         my ($locale) =
 685                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 686                         $CASESPEC{$code}->{$locale} =
 687                         { code      => $hexcode,
 688                           lower     => $lower,
 689                           title     => $title,
 690                           upper     => $upper,
 691                           condition => $condition };
 692                     } else {
 693                         $CASESPEC{$code} =
 694                         { code      => $hexcode,
 695                           lower     => $lower,
 696                           title     => $title,
 697                           upper     => $upper,
 698                           condition => $condition };
 699                     }
 700                 }
 701             }
 702             close($CASESPECFH);
 703         }
 704     }
 705 }
 706
 707 sub casespec {
 708     my $arg  = shift;
 709     my $code = _getcode($arg);
 710     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 711         unless defined $code;
 712
 713     _casespec() unless %CASESPEC;
 714
 715     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
 716 }
 717
 718 =head2 Unicode::UCD::UnicodeVersion
 719
 720 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 721 Character Database, in other words, the version of the Unicode
 722 standard the database implements.  The version is a string
 723 of numbers delimited by dots (C<'.'>).
 724
 725 =cut
 726
 727 my $UNICODEVERSION;
 728
 729 sub UnicodeVersion {
 730     unless (defined $UNICODEVERSION) {
 731         openunicode(\$VERSIONFH, "version");
 732         chomp($UNICODEVERSION = <$VERSIONFH>);
 733         close($VERSIONFH);
 734         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 735             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 736     }
 737     return $UNICODEVERSION;
 738 }
 739
 740 =head2 Implementation Note
 741
 742 The first use of charinfo() opens a read-only filehandle to the Unicode
 743 Character Database (the database is included in the Perl distribution).
 744 The filehandle is then kept open for further queries.  In other words,
 745 if you are wondering where one of your filehandles went, that's where.
 746
 747 =head1 BUGS
 748
 749 Does not yet support EBCDIC platforms.
 750
 751 =head1 AUTHOR
 752
 753 Jarkko Hietaniemi
 754
 755 =cut
 756
 757 1;