lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     compexcl
  19                     casefold casespec);
  20
  21 use Carp;
  22
  23 =head1 NAME
  24
  25 Unicode::UCD - Unicode character database
  26
  27 =head1 SYNOPSIS
  28
  29     use Unicode::UCD 'charinfo';
  30     my $charinfo   = charinfo($codepoint);
  31
  32     use Unicode::UCD 'charblock';
  33     my $charblock  = charblock($codepoint);
  34
  35     use Unicode::UCD 'charscript';
  36     my $charscript = charscript($codepoint);
  37
  38     use Unicode::UCD 'charblocks';
  39     my $charblocks = charblocks();
  40
  41     use Unicode::UCD 'charscripts';
  42     my %charscripts = charscripts();
  43
  44     use Unicode::UCD qw(charscript charinrange);
  45     my $range = charscript($script);
  46     print "looks like $script\n" if charinrange($range, $codepoint);
  47
  48     use Unicode::UCD 'compexcl';
  49     my $compexcl = compexcl($codepoint);
  50
  51     my $unicode_version = Unicode::UCD::UnicodeVersion();
  52
  53 =head1 DESCRIPTION
  54
  55 The Unicode::UCD module offers a simple interface to the Unicode
  56 Character Database.
  57
  58 =cut
  59
  60 my $UNICODEFH;
  61 my $BLOCKSFH;
  62 my $SCRIPTSFH;
  63 my $VERSIONFH;
  64 my $COMPEXCLFH;
  65 my $CASEFOLDFH;
  66 my $CASESPECFH;
  67
  68 sub openunicode {
  69     my ($rfh, @path) = @_;
  70     my $f;
  71     unless (defined $$rfh) {
  72         for my $d (@INC) {
  73             use File::Spec;
  74             $f = File::Spec->catfile($d, "unicore", @path);
  75             last if open($$rfh, $f);
  76             undef $f;
  77         }
  78         croak __PACKAGE__, ": failed to find ",
  79               File::Spec->catfile(@path), " in @INC"
  80             unless defined $f;
  81     }
  82     return $f;
  83 }
  84
  85 =head2 charinfo
  86
  87     use Unicode::UCD 'charinfo';
  88
  89     my $charinfo = charinfo(0x41);
  90
  91 charinfo() returns a reference to a hash that has the following fields
  92 as defined by the Unicode standard:
  93
  94     key
  95
  96     code             code point with at least four hexdigits
  97     name             name of the character IN UPPER CASE
  98     category         general category of the character
  99     combining        classes used in the Canonical Ordering Algorithm
 100     bidi             bidirectional category
 101     decomposition    character decomposition mapping
 102     decimal          if decimal digit this is the integer numeric value
 103     digit            if digit this is the numeric value
 104     numeric          if numeric is the integer or rational numeric value
 105     mirrored         if mirrored in bidirectional text
 106     unicode10        Unicode 1.0 name if existed and different
 107     comment          ISO 10646 comment field
 108     upper            uppercase equivalent mapping
 109     lower            lowercase equivalent mapping
 110     title            titlecase equivalent mapping
 111
 112     block            block the character belongs to (used in \p{In...})
 113     script           script the character belongs to
 114
 115 If no match is found, a reference to an empty hash is returned.
 116
 117 The C<block> property is the same as returned by charinfo().  It is
 118 not defined in the Unicode Character Database proper (Chapter 4 of the
 119 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 120 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 121
 122 Note that you cannot do (de)composition and casing based solely on the
 123 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 124 you will need also the compexcl(), casefold(), and casespec() functions.
 125
 126 =cut
 127
 128 sub _getcode {
 129     my $arg = shift;
 130
 131     if ($arg =~ /^[1-9]\d*$/) {
 132         return $arg;
 133     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 134         return hex($1);
 135     }
 136
 137     return;
 138 }
 139
 140 # Lingua::KO::Hangul::Util not part of the standard distribution
 141 # but it will be used if available.
 142
 143 eval { require Lingua::KO::Hangul::Util };
 144 my $hasHangulUtil = ! $@;
 145 if ($hasHangulUtil) {
 146     Lingua::KO::Hangul::Util->import();
 147 }
 148
 149 sub hangul_decomp { # internal: called from charinfo
 150     if ($hasHangulUtil) {
 151         my @tmp = decomposeHangul(shift);
 152         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 153         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 154     }
 155     return;
 156 }
 157
 158 sub hangul_charname { # internal: called from charinfo
 159     return sprintf("HANGUL SYLLABLE-%04X", shift);
 160 }
 161
 162 sub han_charname { # internal: called from charinfo
 163     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 164 }
 165
 166 my @CharinfoRanges = (
 167 # block name
 168 # [ first, last, coderef to name, coderef to decompose ],
 169 # CJK Ideographs Extension A
 170   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 171 # CJK Ideographs
 172   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 173 # Hangul Syllables
 174   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 175 # Non-Private Use High Surrogates
 176   [ 0xD800,   0xDB7F,   undef,   undef  ],
 177 # Private Use High Surrogates
 178   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 179 # Low Surrogates
 180   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 181 # The Private Use Area
 182   [ 0xE000,   0xF8FF,   undef,   undef  ],
 183 # CJK Ideographs Extension B
 184   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 185 # Plane 15 Private Use Area
 186   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 187 # Plane 16 Private Use Area
 188   [ 0x100000, 0x10FFFD, undef,   undef  ],
 189 );
 190
 191 sub charinfo {
 192     my $arg  = shift;
 193     my $code = _getcode($arg);
 194     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 195         unless defined $code;
 196     my $hexk = sprintf("%06X", $code);
 197     my($rcode,$rname,$rdec);
 198     foreach my $range (@CharinfoRanges){
 199       if ($range->[0] <= $code && $code <= $range->[1]) {
 200         $rcode = $hexk;
 201         $rcode =~ s/^0+//;
 202         $rcode =  sprintf("%04X", hex($rcode));
 203         $rname = $range->[2] ? $range->[2]->($code) : '';
 204         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 205         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 206         last;
 207       }
 208     }
 209     openunicode(\$UNICODEFH, "UnicodeData.txt");
 210     if (defined $UNICODEFH) {
 211         use Search::Dict 1.02;
 212         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 213             my $line = <$UNICODEFH>;
 214             chomp $line;
 215             my %prop;
 216             @prop{qw(
 217                      code name category
 218                      combining bidi decomposition
 219                      decimal digit numeric
 220                      mirrored unicode10 comment
 221                      upper lower title
 222                     )} = split(/;/, $line, -1);
 223             $hexk =~ s/^0+//;
 224             $hexk =  sprintf("%04X", hex($hexk));
 225             if ($prop{code} eq $hexk) {
 226                 $prop{block}  = charblock($code);
 227                 $prop{script} = charscript($code);
 228                 if(defined $rname){
 229                     $prop{code} = $rcode;
 230                     $prop{name} = $rname;
 231                     $prop{decomposition} = $rdec;
 232                 }
 233                 return \%prop;
 234             }
 235         }
 236     }
 237     return;
 238 }
 239
 240 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 241     my ($table, $lo, $hi, $code) = @_;
 242
 243     return if $lo > $hi;
 244
 245     my $mid = int(($lo+$hi) / 2);
 246
 247     if ($table->[$mid]->[0] < $code) {
 248         if ($table->[$mid]->[1] >= $code) {
 249             return $table->[$mid]->[2];
 250         } else {
 251             _search($table, $mid + 1, $hi, $code);
 252         }
 253     } elsif ($table->[$mid]->[0] > $code) {
 254         _search($table, $lo, $mid - 1, $code);
 255     } else {
 256         return $table->[$mid]->[2];
 257     }
 258 }
 259
 260 sub charinrange {
 261     my ($range, $arg) = @_;
 262     my $code = _getcode($arg);
 263     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 264         unless defined $code;
 265     _search($range, 0, $#$range, $code);
 266 }
 267
 268 =head2 charblock
 269
 270     use Unicode::UCD 'charblock';
 271
 272     my $charblock = charblock(0x41);
 273     my $charblock = charblock(1234);
 274     my $charblock = charblock("0x263a");
 275     my $charblock = charblock("U+263a");
 276
 277     my $range     = charblock('Armenian');
 278
 279 With a B<code point argument> charblock() returns the I<block> the character
 280 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 281 positions within all blocks are defined.
 282
 283 See also L</Blocks versus Scripts>.
 284
 285 If supplied with an argument that can't be a code point, charblock() tries
 286 to do the opposite and interpret the argument as a character block. The
 287 return value is a I<range>: an anonymous list of lists that contain
 288 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 289 code point is in a range using the L</charinrange> function. If the
 290 argument is not a known charater block, C<undef> is returned.
 291
 292 =cut
 293
 294 my @BLOCKS;
 295 my %BLOCKS;
 296
 297 sub _charblocks {
 298     unless (@BLOCKS) {
 299         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 300             local $_;
 301             while (<$BLOCKSFH>) {
 302                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 303                     my ($lo, $hi) = (hex($1), hex($2));
 304                     my $subrange = [ $lo, $hi, $3 ];
 305                     push @BLOCKS, $subrange;
 306                     push @{$BLOCKS{$3}}, $subrange;
 307                 }
 308             }
 309             close($BLOCKSFH);
 310         }
 311     }
 312 }
 313
 314 sub charblock {
 315     my $arg = shift;
 316
 317     _charblocks() unless @BLOCKS;
 318
 319     my $code = _getcode($arg);
 320
 321     if (defined $code) {
 322         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 323     } else {
 324         if (exists $BLOCKS{$arg}) {
 325             return dclone $BLOCKS{$arg};
 326         } else {
 327             return;
 328         }
 329     }
 330 }
 331
 332 =head2 charscript
 333
 334     use Unicode::UCD 'charscript';
 335
 336     my $charscript = charscript(0x41);
 337     my $charscript = charscript(1234);
 338     my $charscript = charscript("U+263a");
 339
 340     my $range      = charscript('Thai');
 341
 342 With a B<code point argument> charscript() returns the I<script> the
 343 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 344
 345 See also L</Blocks versus Scripts>.
 346
 347 If supplied with an argument that can't be a code point, charscript() tries
 348 to do the opposite and interpret the argument as a character script. The
 349 return value is a I<range>: an anonymous list of lists that contain
 350 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 351 code point is in a range using the L</charinrange> function. If the
 352 argument is not a known charater script, C<undef> is returned.
 353
 354 =cut
 355
 356 my @SCRIPTS;
 357 my %SCRIPTS;
 358
 359 sub _charscripts {
 360     unless (@SCRIPTS) {
 361         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 362             local $_;
 363             while (<$SCRIPTSFH>) {
 364                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 365                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 366                     my $script = lc($3);
 367                     $script =~ s/\b(\w)/uc($1)/ge;
 368                     my $subrange = [ $lo, $hi, $script ];
 369                     push @SCRIPTS, $subrange;
 370                     push @{$SCRIPTS{$script}}, $subrange;
 371                 }
 372             }
 373             close($SCRIPTSFH);
 374             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 375         }
 376     }
 377 }
 378
 379 sub charscript {
 380     my $arg = shift;
 381
 382     _charscripts() unless @SCRIPTS;
 383
 384     my $code = _getcode($arg);
 385
 386     if (defined $code) {
 387         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 388     } else {
 389         if (exists $SCRIPTS{$arg}) {
 390             return dclone $SCRIPTS{$arg};
 391         } else {
 392             return;
 393         }
 394     }
 395 }
 396
 397 =head2 charblocks
 398
 399     use Unicode::UCD 'charblocks';
 400
 401     my $charblocks = charblocks();
 402
 403 charblocks() returns a reference to a hash with the known block names
 404 as the keys, and the code point ranges (see L</charblock>) as the values.
 405
 406 See also L</Blocks versus Scripts>.
 407
 408 =cut
 409
 410 sub charblocks {
 411     _charblocks() unless %BLOCKS;
 412     return dclone \%BLOCKS;
 413 }
 414
 415 =head2 charscripts
 416
 417     use Unicode::UCD 'charscripts';
 418
 419     my %charscripts = charscripts();
 420
 421 charscripts() returns a hash with the known script names as the keys,
 422 and the code point ranges (see L</charscript>) as the values.
 423
 424 See also L</Blocks versus Scripts>.
 425
 426 =cut
 427
 428 sub charscripts {
 429     _charscripts() unless %SCRIPTS;
 430     return dclone \%SCRIPTS;
 431 }
 432
 433 =head2 Blocks versus Scripts
 434
 435 The difference between a block and a script is that scripts are closer
 436 to the linguistic notion of a set of characters required to present
 437 languages, while block is more of an artifact of the Unicode character
 438 numbering and separation into blocks of (mostly) 256 characters.
 439
 440 For example the Latin B<script> is spread over several B<blocks>, such
 441 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 442 C<Latin Extended-B>.  On the other hand, the Latin script does not
 443 contain all the characters of the C<Basic Latin> block (also known as
 444 the ASCII): it includes only the letters, and not, for example, the digits
 445 or the punctuation.
 446
 447 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 448
 449 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 450
 451 =head2 Matching Scripts and Blocks
 452
 453 Scripts are matched with the regular-expression construct
 454 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
 455 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
 456 any of the 256 code points in the Tibetan block).
 457
 458 =head2 Code Point Arguments
 459
 460 A I<code point argument> is either a decimal or a hexadecimal scalar
 461 designating a Unicode character, or C<U+> followed by hexadecimals
 462 designating a Unicode character.  In other words, if you want a code
 463 point to be interpreted as a hexadecimal number, you must prefix it
 464 with either C<0x> or C<U+>, because a string like e.g. C<123> will
 465 be interpreted as a decimal code point.  Also note that Unicode is
 466 B<not> limited to 16 bits (the number of Unicode characters is
 467 open-ended, in theory unlimited): you may have more than 4 hexdigits.
 468
 469 =head2 charinrange
 470
 471 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 472 can also test whether a code point is in the I<range> as returned by
 473 L</charblock> and L</charscript> or as the values of the hash returned
 474 by L</charblocks> and L</charscripts> by using charinrange():
 475
 476     use Unicode::UCD qw(charscript charinrange);
 477
 478     $range = charscript('Hiragana');
 479     print "looks like hiragana\n" if charinrange($range, $codepoint);
 480
 481 =cut
 482
 483 =head2 compexcl
 484
 485     use Unicode::UCD 'compexcl';
 486
 487     my $compexcl = compexcl("09dc");
 488
 489 The compexcl() returns the composition exclusion (that is, if the
 490 character should not be produced during a precomposition) of the
 491 character specified by a B<code point argument>.
 492
 493 If there is a composition exclusion for the character, true is
 494 returned.  Otherwise, false is returned.
 495
 496 =cut
 497
 498 my %COMPEXCL;
 499
 500 sub _compexcl {
 501     unless (%COMPEXCL) {
 502         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 503             local $_;
 504             while (<$COMPEXCLFH>) {
 505                 if (/^([0-9A-F]+)\s+\#\s+/) {
 506                     my $code = hex($1);
 507                     $COMPEXCL{$code} = undef;
 508                 }
 509             }
 510             close($COMPEXCLFH);
 511         }
 512     }
 513 }
 514
 515 sub compexcl {
 516     my $arg  = shift;
 517     my $code = _getcode($arg);
 518     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 519         unless defined $code;
 520
 521     _compexcl() unless %COMPEXCL;
 522
 523     return exists $COMPEXCL{$code};
 524 }
 525
 526 =head2 casefold
 527
 528     use Unicode::UCD 'casefold';
 529
 530     my $casefold = casefold("00DF");
 531
 532 The casefold() returns the locale-independent case folding of the
 533 character specified by a B<code point argument>.
 534
 535 If there is a case folding for that character, a reference to a hash
 536 with the following fields is returned:
 537
 538     key
 539
 540     code             code point with at least four hexdigits
 541     status           "C", "F", "S", or "I"
 542     mapping          one or more codes separated by spaces
 543
 544 The meaning of the I<status> is as follows:
 545
 546    C                 common case folding, common mappings shared
 547                      by both simple and full mappings
 548    F                 full case folding, mappings that cause strings
 549                      to grow in length. Multiple characters are separated
 550                      by spaces
 551    S                 simple case folding, mappings to single characters
 552                      where different from F
 553    I                 special case for dotted uppercase I and
 554                      dotless lowercase i
 555                      - If this mapping is included, the result is
 556                        case-insensitive, but dotless and dotted I's
 557                        are not distinguished
 558                      - If this mapping is excluded, the result is not
 559                        fully case-insensitive, but dotless and dotted
 560                        I's are distinguished
 561
 562 If there is no case folding for that character, C<undef> is returned.
 563
 564 For more information about case mappings see
 565 http://www.unicode.org/unicode/reports/tr21/
 566
 567 =cut
 568
 569 my %CASEFOLD;
 570
 571 sub _casefold {
 572     unless (%CASEFOLD) {
 573         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 574             local $_;
 575             while (<$CASEFOLDFH>) {
 576                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 577                     my $code = hex($1);
 578                     $CASEFOLD{$code} = { code    => $1,
 579                                          status  => $2,
 580                                          mapping => $3 };
 581                 }
 582             }
 583             close($CASEFOLDFH);
 584         }
 585     }
 586 }
 587
 588 sub casefold {
 589     my $arg  = shift;
 590     my $code = _getcode($arg);
 591     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 592         unless defined $code;
 593
 594     _casefold() unless %CASEFOLD;
 595
 596     return $CASEFOLD{$code};
 597 }
 598
 599 =head2 casespec
 600
 601     use Unicode::UCD 'casespec';
 602
 603     my $casespec = casespec("FB00");
 604
 605 The casespec() returns the potentially locale-dependent case mapping
 606 of the character specified by a B<code point argument>.  The mapping
 607 may change the length of the string (which the basic Unicode case
 608 mappings as returned by charinfo() never do).
 609
 610 If there is a case folding for that character, a reference to a hash
 611 with the following fields is returned:
 612
 613     key
 614
 615     code             code point with at least four hexdigits
 616     lower            lowercase
 617     title            titlecase
 618     upper            uppercase
 619     condition        condition list (may be undef)
 620
 621 The C<condition> is optional.  Where present, it consists of one or
 622 more I<locales> or I<contexts>, separated by spaces (other than as
 623 used to separate elements, spaces are to be ignored).  A condition
 624 list overrides the normal behavior if all of the listed conditions are
 625 true.  Case distinctions in the condition list are not significant.
 626 Conditions preceded by "NON_" represent the negation of the condition.
 627
 628 Note that when there are multiple case folding definitions for a
 629 single code point because of different locales, the value returned by
 630 casespec() is a hash reference which has the locales as the keys and
 631 hash references as described above as the values.
 632
 633 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 634 followed by a "_" and a 2-letter ISO language code (possibly followed
 635 by a "_" and a variant code).  You can find the lists of those codes,
 636 see L<Locale::Country> and L<Locale::Language>.
 637
 638 A I<context> is one of the following choices:
 639
 640     FINAL            The letter is not followed by a letter of
 641                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 642     MODERN           The mapping is only used for modern text
 643     AFTER_i          The last base character was "i" (U+0069)
 644
 645 For more information about case mappings see
 646 http://www.unicode.org/unicode/reports/tr21/
 647
 648 =cut
 649
 650 my %CASESPEC;
 651
 652 sub _casespec {
 653     unless (%CASESPEC) {
 654         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
 655             local $_;
 656             while (<$CASESPECFH>) {
 657                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 658                     my ($hexcode, $lower, $title, $upper, $condition) =
 659                         ($1, $2, $3, $4, $5);
 660                     my $code = hex($hexcode);
 661                     if (exists $CASESPEC{$code}) {
 662                         if (exists $CASESPEC{$code}->{code}) {
 663                             my ($oldlower,
 664                                 $oldtitle,
 665                                 $oldupper,
 666                                 $oldcondition) =
 667                                     @{$CASESPEC{$code}}{qw(lower
 668                                                            title
 669                                                            upper
 670                                                            condition)};
 671                             if (defined $oldcondition) {
 672                                 my ($oldlocale) =
 673                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 674                                 delete $CASESPEC{$code};
 675                                 $CASESPEC{$code}->{$oldlocale} =
 676                                 { code      => $hexcode,
 677                                   lower     => $oldlower,
 678                                   title     => $oldtitle,
 679                                   upper     => $oldupper,
 680                                   condition => $oldcondition };
 681                             }
 682                         }
 683                         my ($locale) =
 684                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 685                         $CASESPEC{$code}->{$locale} =
 686                         { code      => $hexcode,
 687                           lower     => $lower,
 688                           title     => $title,
 689                           upper     => $upper,
 690                           condition => $condition };
 691                     } else {
 692                         $CASESPEC{$code} =
 693                         { code      => $hexcode,
 694                           lower     => $lower,
 695                           title     => $title,
 696                           upper     => $upper,
 697                           condition => $condition };
 698                     }
 699                 }
 700             }
 701             close($CASESPECFH);
 702         }
 703     }
 704 }
 705
 706 sub casespec {
 707     my $arg  = shift;
 708     my $code = _getcode($arg);
 709     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 710         unless defined $code;
 711
 712     _casespec() unless %CASESPEC;
 713
 714     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
 715 }
 716
 717 =head2 Unicode::UCD::UnicodeVersion
 718
 719 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 720 Character Database, in other words, the version of the Unicode
 721 standard the database implements.  The version is a string
 722 of numbers delimited by dots (C<'.'>).
 723
 724 =cut
 725
 726 my $UNICODEVERSION;
 727
 728 sub UnicodeVersion {
 729     unless (defined $UNICODEVERSION) {
 730         openunicode(\$VERSIONFH, "version");
 731         chomp($UNICODEVERSION = <$VERSIONFH>);
 732         close($VERSIONFH);
 733         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 734             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 735     }
 736     return $UNICODEVERSION;
 737 }
 738
 739 =head2 Implementation Note
 740
 741 The first use of charinfo() opens a read-only filehandle to the Unicode
 742 Character Database (the database is included in the Perl distribution).
 743 The filehandle is then kept open for further queries.  In other words,
 744 if you are wondering where one of your filehandles went, that's where.
 745
 746 =head1 BUGS
 747
 748 Does not yet support EBCDIC platforms.
 749
 750 =head1 AUTHOR
 751
 752 Jarkko Hietaniemi
 753
 754 =cut
 755
 756 1;