lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11
  12 our @EXPORT_OK = qw(charinfo
  13                     charblock charscript
  14                     charblocks charscripts
  15                     charinrange
  16                     compexcl
  17                     casefold casespec);
  18
  19 use Carp;
  20
  21 =head1 NAME
  22
  23 Unicode::UCD - Unicode character database
  24
  25 =head1 SYNOPSIS
  26
  27     use Unicode::UCD 'charinfo';
  28     my $charinfo   = charinfo($codepoint);
  29
  30     use Unicode::UCD 'charblock';
  31     my $charblock  = charblock($codepoint);
  32
  33     use Unicode::UCD 'charscript';
  34     my $charscript = charblock($codepoint);
  35
  36     use Unicode::UCD 'charblocks';
  37     my $charblocks = charblocks();
  38
  39     use Unicode::UCD 'charscripts';
  40     my %charscripts = charscripts();
  41
  42     use Unicode::UCD qw(charscript charinrange);
  43     my $range = charscript($script);
  44     print "looks like $script\n" if charinrange($range, $codepoint);
  45
  46     use Unicode::UCD 'compexcl';
  47     my $compexcl = compexcl($codepoint);
  48
  49     my $unicode_version = Unicode::UCD::UnicodeVersion();
  50
  51 =head1 DESCRIPTION
  52
  53 The Unicode::UCD module offers a simple interface to the Unicode
  54 Character Database.
  55
  56 =cut
  57
  58 my $UNICODEFH;
  59 my $BLOCKSFH;
  60 my $SCRIPTSFH;
  61 my $VERSIONFH;
  62 my $COMPEXCLFH;
  63 my $CASEFOLDFH;
  64 my $CASESPECFH;
  65
  66 sub openunicode {
  67     my ($rfh, @path) = @_;
  68     my $f;
  69     unless (defined $$rfh) {
  70         for my $d (@INC) {
  71             use File::Spec;
  72             $f = File::Spec->catfile($d, "unicore", @path);
  73             last if open($$rfh, $f);
  74             undef $f;
  75         }
  76         croak __PACKAGE__, ": failed to find ",
  77               File::Spec->catfile(@path), " in @INC"
  78             unless defined $f;
  79     }
  80     return $f;
  81 }
  82
  83 =head2 charinfo
  84
  85     use Unicode::UCD 'charinfo';
  86
  87     my $charinfo = charinfo(0x41);
  88
  89 charinfo() returns a reference to a hash that has the following fields
  90 as defined by the Unicode standard:
  91
  92     key
  93
  94     code             code point with at least four hexdigits
  95     name             name of the character IN UPPER CASE
  96     category         general category of the character
  97     combining        classes used in the Canonical Ordering Algorithm
  98     bidi             bidirectional category
  99     decomposition    character decomposition mapping
 100     decimal          if decimal digit this is the integer numeric value
 101     digit            if digit this is the numeric value
 102     numeric          if numeric is the integer or rational numeric value
 103     mirrored         if mirrored in bidirectional text
 104     unicode10        Unicode 1.0 name if existed and different
 105     comment          ISO 10646 comment field
 106     upper            uppercase equivalent mapping
 107     lower            lowercase equivalent mapping
 108     title            titlecase equivalent mapping
 109
 110     block            block the character belongs to (used in \p{In...})
 111     script           script the character belongs to
 112
 113 If no match is found, a reference to an empty hash is returned.
 114
 115 The C<block> property is the same as as returned by charinfo().  It is
 116 not defined in the Unicode Character Database proper (Chapter 4 of the
 117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 118 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 119
 120 Note that you cannot do (de)composition and casing based solely on the
 121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 122 you will need also the compexcl(), casefold(), and casespec() functions.
 123
 124 =cut
 125
 126 sub _getcode {
 127     my $arg = shift;
 128
 129     if ($arg =~ /^\d+$/) {
 130         return $arg;
 131     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 132         return hex($1);
 133     }
 134
 135     return;
 136 }
 137
 138 # Lingua::KO::Hangul::Util not part of the standard distribution
 139 # but it will be used if available.
 140
 141 eval { require Lingua::KO::Hangul::Util };
 142 my $hasHangulUtil = ! $@;
 143 if ($hasHangulUtil) {
 144     Lingua::KO::Hangul::Util->import();
 145 }
 146
 147 sub hangul_decomp { # internal: called from charinfo
 148     if ($hasHangulUtil) {
 149         my @tmp = decomposeHangul(shift);
 150         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 151         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 152     }
 153     return;
 154 }
 155
 156 sub hangul_charname { # internal: called from charinfo
 157     return sprintf("HANGUL SYLLABLE-%04X", shift);
 158 }
 159
 160 sub han_charname { # internal: called from charinfo
 161     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 162 }
 163
 164 my @CharinfoRanges = (
 165 # block name
 166 # [ first, last, coderef to name, coderef to decompose ],
 167 # CJK Ideographs Extension A
 168   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 169 # CJK Ideographs
 170   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 171 # Hangul Syllables
 172   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 173 # Non-Private Use High Surrogates
 174   [ 0xD800,   0xDB7F,   undef,   undef  ],
 175 # Private Use High Surrogates
 176   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 177 # Low Surrogates
 178   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 179 # The Private Use Area
 180   [ 0xE000,   0xF8FF,   undef,   undef  ],
 181 # CJK Ideographs Extension B
 182   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 183 # Plane 15 Private Use Area
 184   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 185 # Plane 16 Private Use Area
 186   [ 0x100000, 0x10FFFD, undef,   undef  ],
 187 );
 188
 189 sub charinfo {
 190     my $arg  = shift;
 191     my $code = _getcode($arg);
 192     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 193         unless defined $code;
 194     my $hexk = sprintf("%06X", $code);
 195     my($rcode,$rname,$rdec);
 196     foreach my $range (@CharinfoRanges){
 197       if ($range->[0] <= $code && $code <= $range->[1]) {
 198         $rcode = $hexk;
 199         $rcode =~ s/^0+//;
 200         $rcode =  sprintf("%04X", hex($rcode));
 201         $rname = $range->[2] ? $range->[2]->($code) : '';
 202         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 203         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 204         last;
 205       }
 206     }
 207     openunicode(\$UNICODEFH, "Unicode.txt");
 208     if (defined $UNICODEFH) {
 209         use Search::Dict 1.02;
 210         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 211             my $line = <$UNICODEFH>;
 212             chomp $line;
 213             my %prop;
 214             @prop{qw(
 215                      code name category
 216                      combining bidi decomposition
 217                      decimal digit numeric
 218                      mirrored unicode10 comment
 219                      upper lower title
 220                     )} = split(/;/, $line, -1);
 221             $hexk =~ s/^0+//;
 222             $hexk =  sprintf("%04X", hex($hexk));
 223             if ($prop{code} eq $hexk) {
 224                 $prop{block}  = charblock($code);
 225                 $prop{script} = charscript($code);
 226                 if(defined $rname){
 227                     $prop{code} = $rcode;
 228                     $prop{name} = $rname;
 229                     $prop{decomposition} = $rdec;
 230                 }
 231                 return \%prop;
 232             }
 233         }
 234     }
 235     return;
 236 }
 237
 238 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 239     my ($table, $lo, $hi, $code) = @_;
 240
 241     return if $lo > $hi;
 242
 243     my $mid = int(($lo+$hi) / 2);
 244
 245     if ($table->[$mid]->[0] < $code) {
 246         if ($table->[$mid]->[1] >= $code) {
 247             return $table->[$mid]->[2];
 248         } else {
 249             _search($table, $mid + 1, $hi, $code);
 250         }
 251     } elsif ($table->[$mid]->[0] > $code) {
 252         _search($table, $lo, $mid - 1, $code);
 253     } else {
 254         return $table->[$mid]->[2];
 255     }
 256 }
 257
 258 sub charinrange {
 259     my ($range, $arg) = @_;
 260     my $code = _getcode($arg);
 261     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 262         unless defined $code;
 263     _search($range, 0, $#$range, $code);
 264 }
 265
 266 =head2 charblock
 267
 268     use Unicode::UCD 'charblock';
 269
 270     my $charblock = charblock(0x41);
 271     my $charblock = charblock(1234);
 272     my $charblock = charblock("0x263a");
 273     my $charblock = charblock("U+263a");
 274
 275     my $range     = charblock('Armenian');
 276
 277 With a B<code point argument> charblock() returns the I<block> the character
 278 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 279 positions within all blocks are defined.
 280
 281 See also L</Blocks versus Scripts>.
 282
 283 If supplied with an argument that can't be a code point, charblock()
 284 tries to do the opposite and interpret the argument as a character
 285 block.  The return value is a I<range>: an anonymous list that
 286 contains anonymous lists, which in turn contain I<start-of-range>,
 287 I<end-of-range> code point pairs.  You can test whether a code point
 288 is in a range using the L</charinrange> function.  If the argument is
 289 not a known charater block, C<undef> is returned.
 290
 291 =cut
 292
 293 my @BLOCKS;
 294 my %BLOCKS;
 295
 296 sub _charblocks {
 297     unless (@BLOCKS) {
 298         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 299             while (<$BLOCKSFH>) {
 300                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 301                     my ($lo, $hi) = (hex($1), hex($2));
 302                     my $subrange = [ $lo, $hi, $3 ];
 303                     push @BLOCKS, $subrange;
 304                     push @{$BLOCKS{$3}}, $subrange;
 305                 }
 306             }
 307             close($BLOCKSFH);
 308         }
 309     }
 310 }
 311
 312 sub charblock {
 313     my $arg = shift;
 314
 315     _charblocks() unless @BLOCKS;
 316
 317     my $code = _getcode($arg);
 318
 319     if (defined $code) {
 320         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 321     } else {
 322         if (exists $BLOCKS{$arg}) {
 323             return $BLOCKS{$arg};
 324         } else {
 325             return;
 326         }
 327     }
 328 }
 329
 330 =head2 charscript
 331
 332     use Unicode::UCD 'charscript';
 333
 334     my $charscript = charscript(0x41);
 335     my $charscript = charscript(1234);
 336     my $charscript = charscript("U+263a");
 337
 338     my $range      = charscript('Thai');
 339
 340 With a B<code point argument> charscript() returns the I<script> the
 341 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 342
 343 See also L</Blocks versus Scripts>.
 344
 345 If supplied with an argument that can't be a code point, charscript()
 346 tries to do the opposite and interpret the argument as a character
 347 script.  The return value is a I<range>: an anonymous list that
 348 contains anonymous lists, which in turn contain I<start-of-range>,
 349 I<end-of-range> code point pairs.  You can test whether a code point
 350 is in a range using the L</charinrange> function.  If the argument is
 351 not a known charater script, C<undef> is returned.
 352
 353 =cut
 354
 355 my @SCRIPTS;
 356 my %SCRIPTS;
 357
 358 sub _charscripts {
 359     unless (@SCRIPTS) {
 360         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 361             while (<$SCRIPTSFH>) {
 362                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 363                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 364                     my $script = lc($3);
 365                     $script =~ s/\b(\w)/uc($1)/ge;
 366                     my $subrange = [ $lo, $hi, $script ];
 367                     push @SCRIPTS, $subrange;
 368                     push @{$SCRIPTS{$script}}, $subrange;
 369                 }
 370             }
 371             close($SCRIPTSFH);
 372             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 373         }
 374     }
 375 }
 376
 377 sub charscript {
 378     my $arg = shift;
 379
 380     _charscripts() unless @SCRIPTS;
 381
 382     my $code = _getcode($arg);
 383
 384     if (defined $code) {
 385         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 386     } else {
 387         if (exists $SCRIPTS{$arg}) {
 388             return $SCRIPTS{$arg};
 389         } else {
 390             return;
 391         }
 392     }
 393 }
 394
 395 =head2 charblocks
 396
 397     use Unicode::UCD 'charblocks';
 398
 399     my $charblocks = charblocks();
 400
 401 charblocks() returns a reference to a hash with the known block names
 402 as the keys, and the code point ranges (see L</charblock>) as the values.
 403
 404 See also L</Blocks versus Scripts>.
 405
 406 =cut
 407
 408 sub charblocks {
 409     _charblocks() unless %BLOCKS;
 410     return \%BLOCKS;
 411 }
 412
 413 =head2 charscripts
 414
 415     use Unicode::UCD 'charscripts';
 416
 417     my %charscripts = charscripts();
 418
 419 charscripts() returns a hash with the known script names as the keys,
 420 and the code point ranges (see L</charscript>) as the values.
 421
 422 See also L</Blocks versus Scripts>.
 423
 424 =cut
 425
 426 sub charscripts {
 427     _charscripts() unless %SCRIPTS;
 428     return \%SCRIPTS;
 429 }
 430
 431 =head2 Blocks versus Scripts
 432
 433 The difference between a block and a script is that scripts are closer
 434 to the linguistic notion of a set of characters required to present
 435 languages, while block is more of an artifact of the Unicode character
 436 numbering and separation into blocks of 256 characters.
 437
 438 For example the Latin B<script> is spread over several B<blocks>, such
 439 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 440 C<Latin Extended-B>.  On the other hand, the Latin script does not
 441 contain all the characters of the C<Basic Latin> block (also known as
 442 the ASCII): it includes only the letters, not for example the digits
 443 or the punctuation.
 444
 445 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 446
 447 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 448
 449 =head2 Matching Scripts and Blocks
 450
 451 Both scripts and blocks can be matched using the regular expression
 452 construct C<\p{In...}> and its negation C<\P{In...}>.
 453
 454 The name of the script or the block comes after the C<In>, for example
 455 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 456 removed from the names for the C<\p{In...}>, for example
 457 C<LatinExtendedA> instead of C<Latin Extended-A>.
 458
 459 There are a few cases where there is both a script and a block by the
 460 same name, in these cases the block version has C<Block> appended to
 461 its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
 462 the block.
 463
 464 =head2 Code Point Arguments
 465
 466 A <code point argument> is either a decimal or a hexadecimal scalar
 467 designating a Unicode character, or "U+" followed by hexadecimals
 468 designating a Unicode character.  Note that Unicode is B<not> limited
 469 to 16 bits (the number of Unicode characters is open-ended, in theory
 470 unlimited): you may have more than 4 hexdigits.
 471
 472 =head2 charinrange
 473
 474 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 475 can also test whether a code point is in the I<range> as returned by
 476 L</charblock> and L</charscript> or as the values of the hash returned
 477 by L</charblocks> and L</charscripts> by using charinrange():
 478
 479     use Unicode::UCD qw(charscript charinrange);
 480
 481     $range = charscript('Hiragana');
 482     print "looks like hiragana\n" if charinrange($range, $codepoint);
 483
 484 =cut
 485
 486 =head2 compexcl
 487
 488     use Unicode::UCD 'compexcl';
 489
 490     my $compexcl = compexcl("09dc");
 491
 492 The compexcl() returns the composition exclusion (that is, if the
 493 character should not be produced during a precomposition) of the
 494 character specified by a B<code point argument>.
 495
 496 If there is a composition exclusion for the character, true is
 497 returned.  Otherwise, false is returned.
 498
 499 =cut
 500
 501 my %COMPEXCL;
 502
 503 sub _compexcl {
 504     unless (%COMPEXCL) {
 505         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 506             while (<$COMPEXCLFH>) {
 507                 if (/^([0-9A-F]+) \# /) {
 508                     my $code = hex($1);
 509                     $COMPEXCL{$code} = undef;
 510                 }
 511             }
 512             close($COMPEXCLFH);
 513         }
 514     }
 515 }
 516
 517 sub compexcl {
 518     my $arg  = shift;
 519     my $code = _getcode($arg);
 520     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 521         unless defined $code;
 522
 523     _compexcl() unless %COMPEXCL;
 524
 525     return exists $COMPEXCL{$code};
 526 }
 527
 528 =head2 casefold
 529
 530     use Unicode::UCD 'casefold';
 531
 532     my %casefold = casefold("09dc");
 533
 534 The casefold() returns the locale-independent case folding of the
 535 character specified by a B<code point argument>.
 536
 537 If there is a case folding for that character, a reference to a hash
 538 with the following fields is returned:
 539
 540     key
 541
 542     code             code point with at least four hexdigits
 543     status           "C", "F", "S", or "I"
 544     mapping          one or more codes separated by spaces
 545
 546 The meaning of the I<status> is as follows:
 547
 548    C                 common case folding, common mappings shared
 549                      by both simple and full mappings
 550    F                 full case folding, mappings that cause strings
 551                      to grow in length. Multiple characters are separated
 552                      by spaces
 553    S                 simple case folding, mappings to single characters
 554                      where different from F
 555    I                 special case for dotted uppercase I and
 556                      dotless lowercase i
 557                      - If this mapping is included, the result is
 558                        case-insensitive, but dotless and dotted I's
 559                        are not distinguished
 560                      - If this mapping is excluded, the result is not
 561                        fully case-insensitive, but dotless and dotted
 562                        I's are distinguished
 563
 564 If there is no case folding for that character, C<undef> is returned.
 565
 566 For more information about case mappings see
 567 http://www.unicode.org/unicode/reports/tr21/
 568
 569 =cut
 570
 571 my %CASEFOLD;
 572
 573 sub _casefold {
 574     unless (%CASEFOLD) {
 575         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 576             while (<$CASEFOLDFH>) {
 577                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 578                     my $code = hex($1);
 579                     $CASEFOLD{$code} = { code    => $1,
 580                                          status  => $2,
 581                                          mapping => $3 };
 582                 }
 583             }
 584             close($CASEFOLDFH);
 585         }
 586     }
 587 }
 588
 589 sub casefold {
 590     my $arg  = shift;
 591     my $code = _getcode($arg);
 592     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 593         unless defined $code;
 594
 595     _casefold() unless %CASEFOLD;
 596
 597     return $CASEFOLD{$code};
 598 }
 599
 600 =head2 casespec
 601
 602     use Unicode::UCD 'casespec';
 603
 604     my %casespec = casespec("09dc");
 605
 606 The casespec() returns the potentially locale-dependent case mapping
 607 of the character specified by a B<code point argument>.  The mapping
 608 may change the length of the string (which the basic Unicode case
 609 mappings as returned by charinfo() never do).
 610
 611 If there is a case folding for that character, a reference to a hash
 612 with the following fields is returned:
 613
 614     key
 615
 616     code             code point with at least four hexdigits
 617     lower            lowercase
 618     title            titlecase
 619     upper            uppercase
 620     condition        condition list (may be undef)
 621
 622 The C<condition> is optional.  Where present, it consists of one or
 623 more I<locales> or I<contexts>, separated by spaces (other than as
 624 used to separate elements, spaces are to be ignored).  A condition
 625 list overrides the normal behavior if all of the listed conditions are
 626 true.  Case distinctions in the condition list are not significant.
 627 Conditions preceded by "NON_" represent the negation of the condition
 628
 629 Note that when there are multiple case folding definitions for a
 630 single code point because of different locales, the value returned by
 631 casespec() is a hash reference which has the locales as the keys and
 632 hash references as described above as the values.
 633
 634 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 635 followed by a "_" and a 2-letter ISO language code (possibly followed
 636 by a "_" and a variant code).  You can find the lists of those codes,
 637 see L<Locale::Country> and L<Locale::Language>.
 638
 639 A I<context> is one of the following choices:
 640
 641     FINAL            The letter is not followed by a letter of
 642                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 643     MODERN           The mapping is only used for modern text
 644     AFTER_i          The last base character was "i" (U+0069)
 645
 646 For more information about case mappings see
 647 http://www.unicode.org/unicode/reports/tr21/
 648
 649 =cut
 650
 651 my %CASESPEC;
 652
 653 sub _casespec {
 654     unless (%CASESPEC) {
 655         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 656             while (<$CASESPECFH>) {
 657                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 658                     my ($hexcode, $lower, $title, $upper, $condition) =
 659                         ($1, $2, $3, $4, $5);
 660                     my $code = hex($hexcode);
 661                     if (exists $CASESPEC{$code}) {
 662                         if (exists $CASESPEC{$code}->{code}) {
 663                             my ($oldlower,
 664                                 $oldtitle,
 665                                 $oldupper,
 666                                 $oldcondition) =
 667                                     @{$CASESPEC{$code}}{qw(lower
 668                                                            title
 669                                                            upper
 670                                                            condition)};
 671                             my ($oldlocale) =
 672                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 673                             if (defined $oldlocale) {
 674                                 delete $CASESPEC{$code};
 675                                 $CASESPEC{$code}->{$oldlocale} =
 676                                 { code      => $hexcode,
 677                                   lower     => $oldlower,
 678                                   title     => $oldtitle,
 679                                   upper     => $oldupper,
 680                                   condition => $oldcondition };
 681                             } else {
 682                                 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
 683                             }
 684                         }
 685                         my ($locale) =
 686                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 687                         $CASESPEC{$code}->{$locale} =
 688                         { code      => $hexcode,
 689                           lower     => $lower,
 690                           title     => $title,
 691                           upper     => $upper,
 692                           condition => $condition };
 693                     } else {
 694                         $CASESPEC{$code} =
 695                         { code      => $hexcode,
 696                           lower     => $lower,
 697                           title     => $title,
 698                           upper     => $upper,
 699                           condition => $condition };
 700                     }
 701                 }
 702             }
 703             close($CASESPECFH);
 704         }
 705     }
 706 }
 707
 708 sub casespec {
 709     my $arg  = shift;
 710     my $code = _getcode($arg);
 711     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 712         unless defined $code;
 713
 714     _casespec() unless %CASESPEC;
 715
 716     return $CASESPEC{$code};
 717 }
 718
 719 =head2 Unicode::UCD::UnicodeVersion
 720
 721 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 722 Character Database, in other words, the version of the Unicode
 723 standard the database implements.  The version is a string
 724 of numbers delimited by dots (C<'.'>).
 725
 726 =cut
 727
 728 my $UNICODEVERSION;
 729
 730 sub UnicodeVersion {
 731     unless (defined $UNICODEVERSION) {
 732         openunicode(\$VERSIONFH, "version");
 733         chomp($UNICODEVERSION = <$VERSIONFH>);
 734         close($VERSIONFH);
 735         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 736             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 737     }
 738     return $UNICODEVERSION;
 739 }
 740
 741 =head2 Implementation Note
 742
 743 The first use of charinfo() opens a read-only filehandle to the Unicode
 744 Character Database (the database is included in the Perl distribution).
 745 The filehandle is then kept open for further queries.  In other words,
 746 if you are wondering where one of your filehandles went, that's where.
 747
 748 =head1 BUGS
 749
 750 Does not yet support EBCDIC platforms.
 751
 752 =head1 AUTHOR
 753
 754 Jarkko Hietaniemi
 755
 756 =cut
 757
 758 1;