lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11
  12 our @EXPORT_OK = qw(charinfo
  13                     charblock charscript
  14                     charblocks charscripts
  15                     charinrange
  16                     compexcl
  17                     casefold casespec);
  18
  19 use Carp;
  20
  21 =head1 NAME
  22
  23 Unicode::UCD - Unicode character database
  24
  25 =head1 SYNOPSIS
  26
  27     use Unicode::UCD 'charinfo';
  28     my $charinfo   = charinfo($codepoint);
  29
  30     use Unicode::UCD 'charblock';
  31     my $charblock  = charblock($codepoint);
  32
  33     use Unicode::UCD 'charscript';
  34     my $charscript = charblock($codepoint);
  35
  36     use Unicode::UCD 'charblocks';
  37     my $charblocks = charblocks();
  38
  39     use Unicode::UCD 'charscripts';
  40     my %charscripts = charscripts();
  41
  42     use Unicode::UCD qw(charscript charinrange);
  43     my $range = charscript($script);
  44     print "looks like $script\n" if charinrange($range, $codepoint);
  45
  46     use Unicode::UCD 'compexcl';
  47     my $compexcl = compexcl($codepoint);
  48
  49     my $unicode_version = Unicode::UCD::UnicodeVersion();
  50
  51 =head1 DESCRIPTION
  52
  53 The Unicode::UCD module offers a simple interface to the Unicode
  54 Character Database.
  55
  56 =cut
  57
  58 my $UNICODEFH;
  59 my $BLOCKSFH;
  60 my $SCRIPTSFH;
  61 my $VERSIONFH;
  62 my $COMPEXCLFH;
  63 my $CASEFOLDFH;
  64 my $CASESPECFH;
  65
  66 sub openunicode {
  67     my ($rfh, @path) = @_;
  68     my $f;
  69     unless (defined $$rfh) {
  70         for my $d (@INC) {
  71             use File::Spec;
  72             $f = File::Spec->catfile($d, "unicore", @path);
  73             last if open($$rfh, $f);
  74             undef $f;
  75         }
  76         croak __PACKAGE__, ": failed to find ",
  77               File::Spec->catfile(@path), " in @INC"
  78             unless defined $f;
  79     }
  80     return $f;
  81 }
  82
  83 =head2 charinfo
  84
  85     use Unicode::UCD 'charinfo';
  86
  87     my $charinfo = charinfo(0x41);
  88
  89 charinfo() returns a reference to a hash that has the following fields
  90 as defined by the Unicode standard:
  91
  92     key
  93
  94     code             code point with at least four hexdigits
  95     name             name of the character IN UPPER CASE
  96     category         general category of the character
  97     combining        classes used in the Canonical Ordering Algorithm
  98     bidi             bidirectional category
  99     decomposition    character decomposition mapping
 100     decimal          if decimal digit this is the integer numeric value
 101     digit            if digit this is the numeric value
 102     numeric          if numeric is the integer or rational numeric value
 103     mirrored         if mirrored in bidirectional text
 104     unicode10        Unicode 1.0 name if existed and different
 105     comment          ISO 10646 comment field
 106     upper            uppercase equivalent mapping
 107     lower            lowercase equivalent mapping
 108     title            titlecase equivalent mapping
 109
 110     block            block the character belongs to (used in \p{In...})
 111     script           script the character belongs to
 112
 113 If no match is found, a reference to an empty hash is returned.
 114
 115 The C<block> property is the same as as returned by charinfo().  It is
 116 not defined in the Unicode Character Database proper (Chapter 4 of the
 117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 118 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 119
 120 Note that you cannot do (de)composition and casing based solely on the
 121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 122 you will need also the compexcl(), casefold(), and casespec() functions.
 123
 124 =cut
 125
 126 sub _getcode {
 127     my $arg = shift;
 128
 129     if ($arg =~ /^\d+$/) {
 130         return $arg;
 131     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 132         return hex($1);
 133     }
 134
 135     return;
 136 }
 137
 138 use Lingua::KO::Hangul::Util;
 139
 140 sub hangul_decomp { # internal: called from charinfo
 141   my @tmp = decomposeHangul(shift);
 142   return
 143     @tmp == 2 ? sprintf("%04X %04X",      @tmp) :
 144     @tmp == 3 ? sprintf("%04X %04X %04X", @tmp) :
 145       undef;
 146 }
 147
 148 sub han_charname { # internal: called from charinfo
 149     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 150 }
 151
 152 my @CharinfoRanges = (
 153 # block name
 154 # [ first, last, coderef to name, coderef to decompose ],
 155 # CJK Ideographs Extension A
 156   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 157 # CJK Ideographs
 158   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 159 # Hangul Syllables
 160   [ 0xAC00,   0xD7A3,   \&getHangulName,  \&hangul_decomp ],
 161 # Non-Private Use High Surrogates
 162   [ 0xD800,   0xDB7F,   undef,   undef  ],
 163 # Private Use High Surrogates
 164   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 165 # Low Surrogates
 166   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 167 # The Private Use Area
 168   [ 0xE000,   0xF8FF,   undef,   undef  ],
 169 # CJK Ideographs Extension B
 170   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 171 # Plane 15 Private Use Area
 172   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 173 # Plane 16 Private Use Area
 174   [ 0x100000, 0x10FFFD, undef,   undef  ],
 175 );
 176
 177 sub charinfo {
 178     my $arg  = shift;
 179     my $code = _getcode($arg);
 180     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 181         unless defined $code;
 182     my $hexk = sprintf("%06X", $code);
 183     my($rcode,$rname,$rdec);
 184     foreach my $range (@CharinfoRanges){
 185       if ($range->[0] <= $code && $code <= $range->[1]) {
 186         $rcode = $hexk;
 187         $rcode =~ s/^0+//;
 188         $rcode =  sprintf("%04X", hex($rcode));
 189         $rname = $range->[2] ? $range->[2]->($code) : '';
 190         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 191         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 192         last;
 193       }
 194     }
 195     openunicode(\$UNICODEFH, "Unicode.txt");
 196     if (defined $UNICODEFH) {
 197         use Search::Dict 1.02;
 198         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 199             my $line = <$UNICODEFH>;
 200             chomp $line;
 201             my %prop;
 202             @prop{qw(
 203                      code name category
 204                      combining bidi decomposition
 205                      decimal digit numeric
 206                      mirrored unicode10 comment
 207                      upper lower title
 208                     )} = split(/;/, $line, -1);
 209             $hexk =~ s/^0+//;
 210             $hexk =  sprintf("%04X", hex($hexk));
 211             if ($prop{code} eq $hexk) {
 212                 $prop{block}  = charblock($code);
 213                 $prop{script} = charscript($code);
 214                 if(defined $rname){
 215                     $prop{code} = $rcode;
 216                     $prop{name} = $rname;
 217                     $prop{decomposition} = $rdec;
 218                 }
 219                 return \%prop;
 220             }
 221         }
 222     }
 223     return;
 224 }
 225
 226 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 227     my ($table, $lo, $hi, $code) = @_;
 228
 229     return if $lo > $hi;
 230
 231     my $mid = int(($lo+$hi) / 2);
 232
 233     if ($table->[$mid]->[0] < $code) {
 234         if ($table->[$mid]->[1] >= $code) {
 235             return $table->[$mid]->[2];
 236         } else {
 237             _search($table, $mid + 1, $hi, $code);
 238         }
 239     } elsif ($table->[$mid]->[0] > $code) {
 240         _search($table, $lo, $mid - 1, $code);
 241     } else {
 242         return $table->[$mid]->[2];
 243     }
 244 }
 245
 246 sub charinrange {
 247     my ($range, $arg) = @_;
 248     my $code = _getcode($arg);
 249     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 250         unless defined $code;
 251     _search($range, 0, $#$range, $code);
 252 }
 253
 254 =head2 charblock
 255
 256     use Unicode::UCD 'charblock';
 257
 258     my $charblock = charblock(0x41);
 259     my $charblock = charblock(1234);
 260     my $charblock = charblock("0x263a");
 261     my $charblock = charblock("U+263a");
 262
 263     my $range     = charblock('Armenian');
 264
 265 With a B<code point argument> charblock() returns the I<block> the character
 266 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 267 positions within all blocks are defined.
 268
 269 See also L</Blocks versus Scripts>.
 270
 271 If supplied with an argument that can't be a code point, charblock()
 272 tries to do the opposite and interpret the argument as a character
 273 block.  The return value is a I<range>: an anonymous list that
 274 contains anonymous lists, which in turn contain I<start-of-range>,
 275 I<end-of-range> code point pairs.  You can test whether a code point
 276 is in a range using the L</charinrange> function.  If the argument is
 277 not a known charater block, C<undef> is returned.
 278
 279 =cut
 280
 281 my @BLOCKS;
 282 my %BLOCKS;
 283
 284 sub _charblocks {
 285     unless (@BLOCKS) {
 286         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 287             while (<$BLOCKSFH>) {
 288                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 289                     my ($lo, $hi) = (hex($1), hex($2));
 290                     my $subrange = [ $lo, $hi, $3 ];
 291                     push @BLOCKS, $subrange;
 292                     push @{$BLOCKS{$3}}, $subrange;
 293                 }
 294             }
 295             close($BLOCKSFH);
 296         }
 297     }
 298 }
 299
 300 sub charblock {
 301     my $arg = shift;
 302
 303     _charblocks() unless @BLOCKS;
 304
 305     my $code = _getcode($arg);
 306
 307     if (defined $code) {
 308         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 309     } else {
 310         if (exists $BLOCKS{$arg}) {
 311             return $BLOCKS{$arg};
 312         } else {
 313             return;
 314         }
 315     }
 316 }
 317
 318 =head2 charscript
 319
 320     use Unicode::UCD 'charscript';
 321
 322     my $charscript = charscript(0x41);
 323     my $charscript = charscript(1234);
 324     my $charscript = charscript("U+263a");
 325
 326     my $range      = charscript('Thai');
 327
 328 With a B<code point argument> charscript() returns the I<script> the
 329 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 330
 331 See also L</Blocks versus Scripts>.
 332
 333 If supplied with an argument that can't be a code point, charscript()
 334 tries to do the opposite and interpret the argument as a character
 335 script.  The return value is a I<range>: an anonymous list that
 336 contains anonymous lists, which in turn contain I<start-of-range>,
 337 I<end-of-range> code point pairs.  You can test whether a code point
 338 is in a range using the L</charinrange> function.  If the argument is
 339 not a known charater script, C<undef> is returned.
 340
 341 =cut
 342
 343 my @SCRIPTS;
 344 my %SCRIPTS;
 345
 346 sub _charscripts {
 347     unless (@SCRIPTS) {
 348         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 349             while (<$SCRIPTSFH>) {
 350                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 351                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 352                     my $script = lc($3);
 353                     $script =~ s/\b(\w)/uc($1)/ge;
 354                     my $subrange = [ $lo, $hi, $script ];
 355                     push @SCRIPTS, $subrange;
 356                     push @{$SCRIPTS{$script}}, $subrange;
 357                 }
 358             }
 359             close($SCRIPTSFH);
 360             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 361         }
 362     }
 363 }
 364
 365 sub charscript {
 366     my $arg = shift;
 367
 368     _charscripts() unless @SCRIPTS;
 369
 370     my $code = _getcode($arg);
 371
 372     if (defined $code) {
 373         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 374     } else {
 375         if (exists $SCRIPTS{$arg}) {
 376             return $SCRIPTS{$arg};
 377         } else {
 378             return;
 379         }
 380     }
 381 }
 382
 383 =head2 charblocks
 384
 385     use Unicode::UCD 'charblocks';
 386
 387     my $charblocks = charblocks();
 388
 389 charblocks() returns a reference to a hash with the known block names
 390 as the keys, and the code point ranges (see L</charblock>) as the values.
 391
 392 See also L</Blocks versus Scripts>.
 393
 394 =cut
 395
 396 sub charblocks {
 397     _charblocks() unless %BLOCKS;
 398     return \%BLOCKS;
 399 }
 400
 401 =head2 charscripts
 402
 403     use Unicode::UCD 'charscripts';
 404
 405     my %charscripts = charscripts();
 406
 407 charscripts() returns a hash with the known script names as the keys,
 408 and the code point ranges (see L</charscript>) as the values.
 409
 410 See also L</Blocks versus Scripts>.
 411
 412 =cut
 413
 414 sub charscripts {
 415     _charscripts() unless %SCRIPTS;
 416     return \%SCRIPTS;
 417 }
 418
 419 =head2 Blocks versus Scripts
 420
 421 The difference between a block and a script is that scripts are closer
 422 to the linguistic notion of a set of characters required to present
 423 languages, while block is more of an artifact of the Unicode character
 424 numbering and separation into blocks of 256 characters.
 425
 426 For example the Latin B<script> is spread over several B<blocks>, such
 427 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 428 C<Latin Extended-B>.  On the other hand, the Latin script does not
 429 contain all the characters of the C<Basic Latin> block (also known as
 430 the ASCII): it includes only the letters, not for example the digits
 431 or the punctuation.
 432
 433 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 434
 435 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 436
 437 =head2 Matching Scripts and Blocks
 438
 439 Both scripts and blocks can be matched using the regular expression
 440 construct C<\p{In...}> and its negation C<\P{In...}>.
 441
 442 The name of the script or the block comes after the C<In>, for example
 443 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 444 removed from the names for the C<\p{In...}>, for example
 445 C<LatinExtendedA> instead of C<Latin Extended-A>.
 446
 447 There are a few cases where there is both a script and a block by the
 448 same name, in these cases the block version has C<Block> appended to
 449 its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
 450 the block.
 451
 452 =head2 Code Point Arguments
 453
 454 A <code point argument> is either a decimal or a hexadecimal scalar
 455 designating a Unicode character, or "U+" followed by hexadecimals
 456 designating a Unicode character.  Note that Unicode is B<not> limited
 457 to 16 bits (the number of Unicode characters is open-ended, in theory
 458 unlimited): you may have more than 4 hexdigits.
 459
 460 =head2 charinrange
 461
 462 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 463 can also test whether a code point is in the I<range> as returned by
 464 L</charblock> and L</charscript> or as the values of the hash returned
 465 by L</charblocks> and L</charscripts> by using charinrange():
 466
 467     use Unicode::UCD qw(charscript charinrange);
 468
 469     $range = charscript('Hiragana');
 470     print "looks like hiragana\n" if charinrange($range, $codepoint);
 471
 472 =cut
 473
 474 =head2 compexcl
 475
 476     use Unicode::UCD 'compexcl';
 477
 478     my $compexcl = compexcl("09dc");
 479
 480 The compexcl() returns the composition exclusion (that is, if the
 481 character should not be produced during a precomposition) of the
 482 character specified by a B<code point argument>.
 483
 484 If there is a composition exclusion for the character, true is
 485 returned.  Otherwise, false is returned.
 486
 487 =cut
 488
 489 my %COMPEXCL;
 490
 491 sub _compexcl {
 492     unless (%COMPEXCL) {
 493         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 494             while (<$COMPEXCLFH>) {
 495                 if (/^([0-9A-F]+) \# /) {
 496                     my $code = hex($1);
 497                     $COMPEXCL{$code} = undef;
 498                 }
 499             }
 500             close($COMPEXCLFH);
 501         }
 502     }
 503 }
 504
 505 sub compexcl {
 506     my $arg  = shift;
 507     my $code = _getcode($arg);
 508     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 509         unless defined $code;
 510
 511     _compexcl() unless %COMPEXCL;
 512
 513     return exists $COMPEXCL{$code};
 514 }
 515
 516 =head2 casefold
 517
 518     use Unicode::UCD 'casefold';
 519
 520     my %casefold = casefold("09dc");
 521
 522 The casefold() returns the locale-independent case folding of the
 523 character specified by a B<code point argument>.
 524
 525 If there is a case folding for that character, a reference to a hash
 526 with the following fields is returned:
 527
 528     key
 529
 530     code             code point with at least four hexdigits
 531     status           "C", "F", "S", or "I"
 532     mapping          one or more codes separated by spaces
 533
 534 The meaning of the I<status> is as follows:
 535
 536    C                 common case folding, common mappings shared
 537                      by both simple and full mappings
 538    F                 full case folding, mappings that cause strings
 539                      to grow in length. Multiple characters are separated
 540                      by spaces
 541    S                 simple case folding, mappings to single characters
 542                      where different from F
 543    I                 special case for dotted uppercase I and
 544                      dotless lowercase i
 545                      - If this mapping is included, the result is
 546                        case-insensitive, but dotless and dotted I's
 547                        are not distinguished
 548                      - If this mapping is excluded, the result is not
 549                        fully case-insensitive, but dotless and dotted
 550                        I's are distinguished
 551
 552 If there is no case folding for that character, C<undef> is returned.
 553
 554 For more information about case mappings see
 555 http://www.unicode.org/unicode/reports/tr21/
 556
 557 =cut
 558
 559 my %CASEFOLD;
 560
 561 sub _casefold {
 562     unless (%CASEFOLD) {
 563         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 564             while (<$CASEFOLDFH>) {
 565                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 566                     my $code = hex($1);
 567                     $CASEFOLD{$code} = { code    => $1,
 568                                          status  => $2,
 569                                          mapping => $3 };
 570                 }
 571             }
 572             close($CASEFOLDFH);
 573         }
 574     }
 575 }
 576
 577 sub casefold {
 578     my $arg  = shift;
 579     my $code = _getcode($arg);
 580     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 581         unless defined $code;
 582
 583     _casefold() unless %CASEFOLD;
 584
 585     return $CASEFOLD{$code};
 586 }
 587
 588 =head2 casespec
 589
 590     use Unicode::UCD 'casespec';
 591
 592     my %casespec = casespec("09dc");
 593
 594 The casespec() returns the potentially locale-dependent case mapping
 595 of the character specified by a B<code point argument>.  The mapping
 596 may change the length of the string (which the basic Unicode case
 597 mappings as returned by charinfo() never do).
 598
 599 If there is a case folding for that character, a reference to a hash
 600 with the following fields is returned:
 601
 602     key
 603
 604     code             code point with at least four hexdigits
 605     lower            lowercase
 606     title            titlecase
 607     upper            uppercase
 608     condition        condition list (may be undef)
 609
 610 The C<condition> is optional.  Where present, it consists of one or
 611 more I<locales> or I<contexts>, separated by spaces (other than as
 612 used to separate elements, spaces are to be ignored).  A condition
 613 list overrides the normal behavior if all of the listed conditions are
 614 true.  Case distinctions in the condition list are not significant.
 615 Conditions preceded by "NON_" represent the negation of the condition
 616
 617 Note that when there are multiple case folding definitions for a
 618 single code point because of different locales, the value returned by
 619 casespec() is a hash reference which has the locales as the keys and
 620 hash references as described above as the values.
 621
 622 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 623 followed by a "_" and a 2-letter ISO language code (possibly followed
 624 by a "_" and a variant code).  You can find the lists of those codes,
 625 see L<Locale::Country> and L<Locale::Language>.
 626
 627 A I<context> is one of the following choices:
 628
 629     FINAL            The letter is not followed by a letter of
 630                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 631     MODERN           The mapping is only used for modern text
 632     AFTER_i          The last base character was "i" (U+0069)
 633
 634 For more information about case mappings see
 635 http://www.unicode.org/unicode/reports/tr21/
 636
 637 =cut
 638
 639 my %CASESPEC;
 640
 641 sub _casespec {
 642     unless (%CASESPEC) {
 643         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 644             while (<$CASESPECFH>) {
 645                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 646                     my ($hexcode, $lower, $title, $upper, $condition) =
 647                         ($1, $2, $3, $4, $5);
 648                     my $code = hex($hexcode);
 649                     if (exists $CASESPEC{$code}) {
 650                         if (exists $CASESPEC{$code}->{code}) {
 651                             my ($oldlower,
 652                                 $oldtitle,
 653                                 $oldupper,
 654                                 $oldcondition) =
 655                                     @{$CASESPEC{$code}}{qw(lower
 656                                                            title
 657                                                            upper
 658                                                            condition)};
 659                             my ($oldlocale) =
 660                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 661                             if (defined $oldlocale) {
 662                                 delete $CASESPEC{$code};
 663                                 $CASESPEC{$code}->{$oldlocale} =
 664                                 { code      => $hexcode,
 665                                   lower     => $oldlower,
 666                                   title     => $oldtitle,
 667                                   upper     => $oldupper,
 668                                   condition => $oldcondition };
 669                             } else {
 670                                 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
 671                             }
 672                         }
 673                         my ($locale) =
 674                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 675                         $CASESPEC{$code}->{$locale} =
 676                         { code      => $hexcode,
 677                           lower     => $lower,
 678                           title     => $title,
 679                           upper     => $upper,
 680                           condition => $condition };
 681                     } else {
 682                         $CASESPEC{$code} =
 683                         { code      => $hexcode,
 684                           lower     => $lower,
 685                           title     => $title,
 686                           upper     => $upper,
 687                           condition => $condition };
 688                     }
 689                 }
 690             }
 691             close($CASESPECFH);
 692         }
 693     }
 694 }
 695
 696 sub casespec {
 697     my $arg  = shift;
 698     my $code = _getcode($arg);
 699     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 700         unless defined $code;
 701
 702     _casespec() unless %CASESPEC;
 703
 704     return $CASESPEC{$code};
 705 }
 706
 707 =head2 Unicode::UCD::UnicodeVersion
 708
 709 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 710 Character Database, in other words, the version of the Unicode
 711 standard the database implements.  The version is a string
 712 of numbers delimited by dots (C<'.'>).
 713
 714 =cut
 715
 716 my $UNICODEVERSION;
 717
 718 sub UnicodeVersion {
 719     unless (defined $UNICODEVERSION) {
 720         openunicode(\$VERSIONFH, "version");
 721         chomp($UNICODEVERSION = <$VERSIONFH>);
 722         close($VERSIONFH);
 723         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 724             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 725     }
 726     return $UNICODEVERSION;
 727 }
 728
 729 =head2 Implementation Note
 730
 731 The first use of charinfo() opens a read-only filehandle to the Unicode
 732 Character Database (the database is included in the Perl distribution).
 733 The filehandle is then kept open for further queries.  In other words,
 734 if you are wondering where one of your filehandles went, that's where.
 735
 736 =head1 BUGS
 737
 738 Does not yet support EBCDIC platforms.
 739
 740 =head1 AUTHOR
 741
 742 Jarkko Hietaniemi
 743
 744 =cut
 745
 746 1;