lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.22';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     compexcl
  19                     casefold casespec);
  20
  21 use Carp;
  22
  23 =head1 NAME
  24
  25 Unicode::UCD - Unicode character database
  26
  27 =head1 SYNOPSIS
  28
  29     use Unicode::UCD 'charinfo';
  30     my $charinfo   = charinfo($codepoint);
  31
  32     use Unicode::UCD 'charblock';
  33     my $charblock  = charblock($codepoint);
  34
  35     use Unicode::UCD 'charscript';
  36     my $charscript = charscript($codepoint);
  37
  38     use Unicode::UCD 'charblocks';
  39     my $charblocks = charblocks();
  40
  41     use Unicode::UCD 'charscripts';
  42     my %charscripts = charscripts();
  43
  44     use Unicode::UCD qw(charscript charinrange);
  45     my $range = charscript($script);
  46     print "looks like $script\n" if charinrange($range, $codepoint);
  47
  48     use Unicode::UCD 'compexcl';
  49     my $compexcl = compexcl($codepoint);
  50
  51     my $unicode_version = Unicode::UCD::UnicodeVersion();
  52
  53 =head1 DESCRIPTION
  54
  55 The Unicode::UCD module offers a simple interface to the Unicode
  56 Character Database.
  57
  58 =cut
  59
  60 my $UNICODEFH;
  61 my $BLOCKSFH;
  62 my $SCRIPTSFH;
  63 my $VERSIONFH;
  64 my $COMPEXCLFH;
  65 my $CASEFOLDFH;
  66 my $CASESPECFH;
  67
  68 sub openunicode {
  69     my ($rfh, @path) = @_;
  70     my $f;
  71     unless (defined $$rfh) {
  72         for my $d (@INC) {
  73             use File::Spec;
  74             $f = File::Spec->catfile($d, "unicore", @path);
  75             last if open($$rfh, $f);
  76             undef $f;
  77         }
  78         croak __PACKAGE__, ": failed to find ",
  79               File::Spec->catfile(@path), " in @INC"
  80             unless defined $f;
  81     }
  82     return $f;
  83 }
  84
  85 =head2 charinfo
  86
  87     use Unicode::UCD 'charinfo';
  88
  89     my $charinfo = charinfo(0x41);
  90
  91 charinfo() returns a reference to a hash that has the following fields
  92 as defined by the Unicode standard:
  93
  94     key
  95
  96     code             code point with at least four hexdigits
  97     name             name of the character IN UPPER CASE
  98     category         general category of the character
  99     combining        classes used in the Canonical Ordering Algorithm
 100     bidi             bidirectional category
 101     decomposition    character decomposition mapping
 102     decimal          if decimal digit this is the integer numeric value
 103     digit            if digit this is the numeric value
 104     numeric          if numeric is the integer or rational numeric value
 105     mirrored         if mirrored in bidirectional text
 106     unicode10        Unicode 1.0 name if existed and different
 107     comment          ISO 10646 comment field
 108     upper            uppercase equivalent mapping
 109     lower            lowercase equivalent mapping
 110     title            titlecase equivalent mapping
 111
 112     block            block the character belongs to (used in \p{In...})
 113     script           script the character belongs to
 114
 115 If no match is found, a reference to an empty hash is returned.
 116
 117 The C<block> property is the same as returned by charinfo().  It is
 118 not defined in the Unicode Character Database proper (Chapter 4 of the
 119 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 120 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 121
 122 Note that you cannot do (de)composition and casing based solely on the
 123 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 124 you will need also the compexcl(), casefold(), and casespec() functions.
 125
 126 =cut
 127
 128 # NB: This function is duplicated in charnames.pm
 129 sub _getcode {
 130     my $arg = shift;
 131
 132     if ($arg =~ /^[1-9]\d*$/) {
 133         return $arg;
 134     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 135         return hex($1);
 136     }
 137
 138     return;
 139 }
 140
 141 # Lingua::KO::Hangul::Util not part of the standard distribution
 142 # but it will be used if available.
 143
 144 eval { require Lingua::KO::Hangul::Util };
 145 my $hasHangulUtil = ! $@;
 146 if ($hasHangulUtil) {
 147     Lingua::KO::Hangul::Util->import();
 148 }
 149
 150 sub hangul_decomp { # internal: called from charinfo
 151     if ($hasHangulUtil) {
 152         my @tmp = decomposeHangul(shift);
 153         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 154         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 155     }
 156     return;
 157 }
 158
 159 sub hangul_charname { # internal: called from charinfo
 160     return sprintf("HANGUL SYLLABLE-%04X", shift);
 161 }
 162
 163 sub han_charname { # internal: called from charinfo
 164     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 165 }
 166
 167 my @CharinfoRanges = (
 168 # block name
 169 # [ first, last, coderef to name, coderef to decompose ],
 170 # CJK Ideographs Extension A
 171   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 172 # CJK Ideographs
 173   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 174 # Hangul Syllables
 175   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 176 # Non-Private Use High Surrogates
 177   [ 0xD800,   0xDB7F,   undef,   undef  ],
 178 # Private Use High Surrogates
 179   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 180 # Low Surrogates
 181   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 182 # The Private Use Area
 183   [ 0xE000,   0xF8FF,   undef,   undef  ],
 184 # CJK Ideographs Extension B
 185   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 186 # Plane 15 Private Use Area
 187   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 188 # Plane 16 Private Use Area
 189   [ 0x100000, 0x10FFFD, undef,   undef  ],
 190 );
 191
 192 sub charinfo {
 193     my $arg  = shift;
 194     my $code = _getcode($arg);
 195     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 196         unless defined $code;
 197     my $hexk = sprintf("%06X", $code);
 198     my($rcode,$rname,$rdec);
 199     foreach my $range (@CharinfoRanges){
 200       if ($range->[0] <= $code && $code <= $range->[1]) {
 201         $rcode = $hexk;
 202         $rcode =~ s/^0+//;
 203         $rcode =  sprintf("%04X", hex($rcode));
 204         $rname = $range->[2] ? $range->[2]->($code) : '';
 205         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 206         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 207         last;
 208       }
 209     }
 210     openunicode(\$UNICODEFH, "UnicodeData.txt");
 211     if (defined $UNICODEFH) {
 212         use Search::Dict 1.02;
 213         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 214             my $line = <$UNICODEFH>;
 215             return unless defined $line;
 216             chomp $line;
 217             my %prop;
 218             @prop{qw(
 219                      code name category
 220                      combining bidi decomposition
 221                      decimal digit numeric
 222                      mirrored unicode10 comment
 223                      upper lower title
 224                     )} = split(/;/, $line, -1);
 225             $hexk =~ s/^0+//;
 226             $hexk =  sprintf("%04X", hex($hexk));
 227             if ($prop{code} eq $hexk) {
 228                 $prop{block}  = charblock($code);
 229                 $prop{script} = charscript($code);
 230                 if(defined $rname){
 231                     $prop{code} = $rcode;
 232                     $prop{name} = $rname;
 233                     $prop{decomposition} = $rdec;
 234                 }
 235                 return \%prop;
 236             }
 237         }
 238     }
 239     return;
 240 }
 241
 242 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 243     my ($table, $lo, $hi, $code) = @_;
 244
 245     return if $lo > $hi;
 246
 247     my $mid = int(($lo+$hi) / 2);
 248
 249     if ($table->[$mid]->[0] < $code) {
 250         if ($table->[$mid]->[1] >= $code) {
 251             return $table->[$mid]->[2];
 252         } else {
 253             _search($table, $mid + 1, $hi, $code);
 254         }
 255     } elsif ($table->[$mid]->[0] > $code) {
 256         _search($table, $lo, $mid - 1, $code);
 257     } else {
 258         return $table->[$mid]->[2];
 259     }
 260 }
 261
 262 sub charinrange {
 263     my ($range, $arg) = @_;
 264     my $code = _getcode($arg);
 265     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 266         unless defined $code;
 267     _search($range, 0, $#$range, $code);
 268 }
 269
 270 =head2 charblock
 271
 272     use Unicode::UCD 'charblock';
 273
 274     my $charblock = charblock(0x41);
 275     my $charblock = charblock(1234);
 276     my $charblock = charblock("0x263a");
 277     my $charblock = charblock("U+263a");
 278
 279     my $range     = charblock('Armenian');
 280
 281 With a B<code point argument> charblock() returns the I<block> the character
 282 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 283 positions within all blocks are defined.
 284
 285 See also L</Blocks versus Scripts>.
 286
 287 If supplied with an argument that can't be a code point, charblock() tries
 288 to do the opposite and interpret the argument as a character block. The
 289 return value is a I<range>: an anonymous list of lists that contain
 290 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 291 code point is in a range using the L</charinrange> function. If the
 292 argument is not a known charater block, C<undef> is returned.
 293
 294 =cut
 295
 296 my @BLOCKS;
 297 my %BLOCKS;
 298
 299 sub _charblocks {
 300     unless (@BLOCKS) {
 301         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 302             local $_;
 303             while (<$BLOCKSFH>) {
 304                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 305                     my ($lo, $hi) = (hex($1), hex($2));
 306                     my $subrange = [ $lo, $hi, $3 ];
 307                     push @BLOCKS, $subrange;
 308                     push @{$BLOCKS{$3}}, $subrange;
 309                 }
 310             }
 311             close($BLOCKSFH);
 312         }
 313     }
 314 }
 315
 316 sub charblock {
 317     my $arg = shift;
 318
 319     _charblocks() unless @BLOCKS;
 320
 321     my $code = _getcode($arg);
 322
 323     if (defined $code) {
 324         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 325     } else {
 326         if (exists $BLOCKS{$arg}) {
 327             return dclone $BLOCKS{$arg};
 328         } else {
 329             return;
 330         }
 331     }
 332 }
 333
 334 =head2 charscript
 335
 336     use Unicode::UCD 'charscript';
 337
 338     my $charscript = charscript(0x41);
 339     my $charscript = charscript(1234);
 340     my $charscript = charscript("U+263a");
 341
 342     my $range      = charscript('Thai');
 343
 344 With a B<code point argument> charscript() returns the I<script> the
 345 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 346
 347 See also L</Blocks versus Scripts>.
 348
 349 If supplied with an argument that can't be a code point, charscript() tries
 350 to do the opposite and interpret the argument as a character script. The
 351 return value is a I<range>: an anonymous list of lists that contain
 352 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 353 code point is in a range using the L</charinrange> function. If the
 354 argument is not a known charater script, C<undef> is returned.
 355
 356 =cut
 357
 358 my @SCRIPTS;
 359 my %SCRIPTS;
 360
 361 sub _charscripts {
 362     unless (@SCRIPTS) {
 363         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 364             local $_;
 365             while (<$SCRIPTSFH>) {
 366                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 367                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 368                     my $script = lc($3);
 369                     $script =~ s/\b(\w)/uc($1)/ge;
 370                     my $subrange = [ $lo, $hi, $script ];
 371                     push @SCRIPTS, $subrange;
 372                     push @{$SCRIPTS{$script}}, $subrange;
 373                 }
 374             }
 375             close($SCRIPTSFH);
 376             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 377         }
 378     }
 379 }
 380
 381 sub charscript {
 382     my $arg = shift;
 383
 384     _charscripts() unless @SCRIPTS;
 385
 386     my $code = _getcode($arg);
 387
 388     if (defined $code) {
 389         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 390     } else {
 391         if (exists $SCRIPTS{$arg}) {
 392             return dclone $SCRIPTS{$arg};
 393         } else {
 394             return;
 395         }
 396     }
 397 }
 398
 399 =head2 charblocks
 400
 401     use Unicode::UCD 'charblocks';
 402
 403     my $charblocks = charblocks();
 404
 405 charblocks() returns a reference to a hash with the known block names
 406 as the keys, and the code point ranges (see L</charblock>) as the values.
 407
 408 See also L</Blocks versus Scripts>.
 409
 410 =cut
 411
 412 sub charblocks {
 413     _charblocks() unless %BLOCKS;
 414     return dclone \%BLOCKS;
 415 }
 416
 417 =head2 charscripts
 418
 419     use Unicode::UCD 'charscripts';
 420
 421     my %charscripts = charscripts();
 422
 423 charscripts() returns a hash with the known script names as the keys,
 424 and the code point ranges (see L</charscript>) as the values.
 425
 426 See also L</Blocks versus Scripts>.
 427
 428 =cut
 429
 430 sub charscripts {
 431     _charscripts() unless %SCRIPTS;
 432     return dclone \%SCRIPTS;
 433 }
 434
 435 =head2 Blocks versus Scripts
 436
 437 The difference between a block and a script is that scripts are closer
 438 to the linguistic notion of a set of characters required to present
 439 languages, while block is more of an artifact of the Unicode character
 440 numbering and separation into blocks of (mostly) 256 characters.
 441
 442 For example the Latin B<script> is spread over several B<blocks>, such
 443 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 444 C<Latin Extended-B>.  On the other hand, the Latin script does not
 445 contain all the characters of the C<Basic Latin> block (also known as
 446 the ASCII): it includes only the letters, and not, for example, the digits
 447 or the punctuation.
 448
 449 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 450
 451 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 452
 453 =head2 Matching Scripts and Blocks
 454
 455 Scripts are matched with the regular-expression construct
 456 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
 457 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
 458 any of the 256 code points in the Tibetan block).
 459
 460 =head2 Code Point Arguments
 461
 462 A I<code point argument> is either a decimal or a hexadecimal scalar
 463 designating a Unicode character, or C<U+> followed by hexadecimals
 464 designating a Unicode character.  In other words, if you want a code
 465 point to be interpreted as a hexadecimal number, you must prefix it
 466 with either C<0x> or C<U+>, because a string like e.g. C<123> will
 467 be interpreted as a decimal code point.  Also note that Unicode is
 468 B<not> limited to 16 bits (the number of Unicode characters is
 469 open-ended, in theory unlimited): you may have more than 4 hexdigits.
 470
 471 =head2 charinrange
 472
 473 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 474 can also test whether a code point is in the I<range> as returned by
 475 L</charblock> and L</charscript> or as the values of the hash returned
 476 by L</charblocks> and L</charscripts> by using charinrange():
 477
 478     use Unicode::UCD qw(charscript charinrange);
 479
 480     $range = charscript('Hiragana');
 481     print "looks like hiragana\n" if charinrange($range, $codepoint);
 482
 483 =cut
 484
 485 =head2 compexcl
 486
 487     use Unicode::UCD 'compexcl';
 488
 489     my $compexcl = compexcl("09dc");
 490
 491 The compexcl() returns the composition exclusion (that is, if the
 492 character should not be produced during a precomposition) of the
 493 character specified by a B<code point argument>.
 494
 495 If there is a composition exclusion for the character, true is
 496 returned.  Otherwise, false is returned.
 497
 498 =cut
 499
 500 my %COMPEXCL;
 501
 502 sub _compexcl {
 503     unless (%COMPEXCL) {
 504         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 505             local $_;
 506             while (<$COMPEXCLFH>) {
 507                 if (/^([0-9A-F]+)\s+\#\s+/) {
 508                     my $code = hex($1);
 509                     $COMPEXCL{$code} = undef;
 510                 }
 511             }
 512             close($COMPEXCLFH);
 513         }
 514     }
 515 }
 516
 517 sub compexcl {
 518     my $arg  = shift;
 519     my $code = _getcode($arg);
 520     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 521         unless defined $code;
 522
 523     _compexcl() unless %COMPEXCL;
 524
 525     return exists $COMPEXCL{$code};
 526 }
 527
 528 =head2 casefold
 529
 530     use Unicode::UCD 'casefold';
 531
 532     my $casefold = casefold("00DF");
 533
 534 The casefold() returns the locale-independent case folding of the
 535 character specified by a B<code point argument>.
 536
 537 If there is a case folding for that character, a reference to a hash
 538 with the following fields is returned:
 539
 540     key
 541
 542     code             code point with at least four hexdigits
 543     status           "C", "F", "S", or "I"
 544     mapping          one or more codes separated by spaces
 545
 546 The meaning of the I<status> is as follows:
 547
 548    C                 common case folding, common mappings shared
 549                      by both simple and full mappings
 550    F                 full case folding, mappings that cause strings
 551                      to grow in length. Multiple characters are separated
 552                      by spaces
 553    S                 simple case folding, mappings to single characters
 554                      where different from F
 555    I                 special case for dotted uppercase I and
 556                      dotless lowercase i
 557                      - If this mapping is included, the result is
 558                        case-insensitive, but dotless and dotted I's
 559                        are not distinguished
 560                      - If this mapping is excluded, the result is not
 561                        fully case-insensitive, but dotless and dotted
 562                        I's are distinguished
 563
 564 If there is no case folding for that character, C<undef> is returned.
 565
 566 For more information about case mappings see
 567 http://www.unicode.org/unicode/reports/tr21/
 568
 569 =cut
 570
 571 my %CASEFOLD;
 572
 573 sub _casefold {
 574     unless (%CASEFOLD) {
 575         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 576             local $_;
 577             while (<$CASEFOLDFH>) {
 578                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 579                     my $code = hex($1);
 580                     $CASEFOLD{$code} = { code    => $1,
 581                                          status  => $2,
 582                                          mapping => $3 };
 583                 }
 584             }
 585             close($CASEFOLDFH);
 586         }
 587     }
 588 }
 589
 590 sub casefold {
 591     my $arg  = shift;
 592     my $code = _getcode($arg);
 593     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 594         unless defined $code;
 595
 596     _casefold() unless %CASEFOLD;
 597
 598     return $CASEFOLD{$code};
 599 }
 600
 601 =head2 casespec
 602
 603     use Unicode::UCD 'casespec';
 604
 605     my $casespec = casespec("FB00");
 606
 607 The casespec() returns the potentially locale-dependent case mapping
 608 of the character specified by a B<code point argument>.  The mapping
 609 may change the length of the string (which the basic Unicode case
 610 mappings as returned by charinfo() never do).
 611
 612 If there is a case folding for that character, a reference to a hash
 613 with the following fields is returned:
 614
 615     key
 616
 617     code             code point with at least four hexdigits
 618     lower            lowercase
 619     title            titlecase
 620     upper            uppercase
 621     condition        condition list (may be undef)
 622
 623 The C<condition> is optional.  Where present, it consists of one or
 624 more I<locales> or I<contexts>, separated by spaces (other than as
 625 used to separate elements, spaces are to be ignored).  A condition
 626 list overrides the normal behavior if all of the listed conditions are
 627 true.  Case distinctions in the condition list are not significant.
 628 Conditions preceded by "NON_" represent the negation of the condition.
 629
 630 Note that when there are multiple case folding definitions for a
 631 single code point because of different locales, the value returned by
 632 casespec() is a hash reference which has the locales as the keys and
 633 hash references as described above as the values.
 634
 635 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 636 followed by a "_" and a 2-letter ISO language code (possibly followed
 637 by a "_" and a variant code).  You can find the lists of those codes,
 638 see L<Locale::Country> and L<Locale::Language>.
 639
 640 A I<context> is one of the following choices:
 641
 642     FINAL            The letter is not followed by a letter of
 643                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 644     MODERN           The mapping is only used for modern text
 645     AFTER_i          The last base character was "i" (U+0069)
 646
 647 For more information about case mappings see
 648 http://www.unicode.org/unicode/reports/tr21/
 649
 650 =cut
 651
 652 my %CASESPEC;
 653
 654 sub _casespec {
 655     unless (%CASESPEC) {
 656         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
 657             local $_;
 658             while (<$CASESPECFH>) {
 659                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 660                     my ($hexcode, $lower, $title, $upper, $condition) =
 661                         ($1, $2, $3, $4, $5);
 662                     my $code = hex($hexcode);
 663                     if (exists $CASESPEC{$code}) {
 664                         if (exists $CASESPEC{$code}->{code}) {
 665                             my ($oldlower,
 666                                 $oldtitle,
 667                                 $oldupper,
 668                                 $oldcondition) =
 669                                     @{$CASESPEC{$code}}{qw(lower
 670                                                            title
 671                                                            upper
 672                                                            condition)};
 673                             if (defined $oldcondition) {
 674                                 my ($oldlocale) =
 675                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 676                                 delete $CASESPEC{$code};
 677                                 $CASESPEC{$code}->{$oldlocale} =
 678                                 { code      => $hexcode,
 679                                   lower     => $oldlower,
 680                                   title     => $oldtitle,
 681                                   upper     => $oldupper,
 682                                   condition => $oldcondition };
 683                             }
 684                         }
 685                         my ($locale) =
 686                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 687                         $CASESPEC{$code}->{$locale} =
 688                         { code      => $hexcode,
 689                           lower     => $lower,
 690                           title     => $title,
 691                           upper     => $upper,
 692                           condition => $condition };
 693                     } else {
 694                         $CASESPEC{$code} =
 695                         { code      => $hexcode,
 696                           lower     => $lower,
 697                           title     => $title,
 698                           upper     => $upper,
 699                           condition => $condition };
 700                     }
 701                 }
 702             }
 703             close($CASESPECFH);
 704         }
 705     }
 706 }
 707
 708 sub casespec {
 709     my $arg  = shift;
 710     my $code = _getcode($arg);
 711     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 712         unless defined $code;
 713
 714     _casespec() unless %CASESPEC;
 715
 716     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
 717 }
 718
 719 =head2 Unicode::UCD::UnicodeVersion
 720
 721 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 722 Character Database, in other words, the version of the Unicode
 723 standard the database implements.  The version is a string
 724 of numbers delimited by dots (C<'.'>).
 725
 726 =cut
 727
 728 my $UNICODEVERSION;
 729
 730 sub UnicodeVersion {
 731     unless (defined $UNICODEVERSION) {
 732         openunicode(\$VERSIONFH, "version");
 733         chomp($UNICODEVERSION = <$VERSIONFH>);
 734         close($VERSIONFH);
 735         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 736             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 737     }
 738     return $UNICODEVERSION;
 739 }
 740
 741 =head2 Implementation Note
 742
 743 The first use of charinfo() opens a read-only filehandle to the Unicode
 744 Character Database (the database is included in the Perl distribution).
 745 The filehandle is then kept open for further queries.  In other words,
 746 if you are wondering where one of your filehandles went, that's where.
 747
 748 =head1 BUGS
 749
 750 Does not yet support EBCDIC platforms.
 751
 752 =head1 AUTHOR
 753
 754 Jarkko Hietaniemi
 755
 756 =cut
 757
 758 1;