lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11
  12 our @EXPORT_OK = qw(charinfo
  13                     charblock charscript
  14                     charblocks charscripts
  15                     charinrange
  16                     compexcl
  17                     casefold casespec);
  18
  19 use Carp;
  20
  21 =head1 NAME
  22
  23 Unicode::UCD - Unicode character database
  24
  25 =head1 SYNOPSIS
  26
  27     use Unicode::UCD 'charinfo';
  28     my $charinfo   = charinfo($codepoint);
  29
  30     use Unicode::UCD 'charblock';
  31     my $charblock  = charblock($codepoint);
  32
  33     use Unicode::UCD 'charscript';
  34     my $charscript = charblock($codepoint);
  35
  36     use Unicode::UCD 'charblocks';
  37     my $charblocks = charblocks();
  38
  39     use Unicode::UCD 'charscripts';
  40     my %charscripts = charscripts();
  41
  42     use Unicode::UCD qw(charscript charinrange);
  43     my $range = charscript($script);
  44     print "looks like $script\n" if charinrange($range, $codepoint);
  45
  46     use Unicode::UCD 'compexcl';
  47     my $compexcl = compexcl($codepoint);
  48
  49     my $unicode_version = Unicode::UCD::UnicodeVersion();
  50
  51 =head1 DESCRIPTION
  52
  53 The Unicode::UCD module offers a simple interface to the Unicode
  54 Character Database.
  55
  56 =cut
  57
  58 my $UNICODEFH;
  59 my $BLOCKSFH;
  60 my $SCRIPTSFH;
  61 my $VERSIONFH;
  62 my $COMPEXCLFH;
  63 my $CASEFOLDFH;
  64 my $CASESPECFH;
  65
  66 sub openunicode {
  67     my ($rfh, @path) = @_;
  68     my $f;
  69     unless (defined $$rfh) {
  70         for my $d (@INC) {
  71             use File::Spec;
  72             $f = File::Spec->catfile($d, "unicore", @path);
  73             last if open($$rfh, $f);
  74             undef $f;
  75         }
  76         croak __PACKAGE__, ": failed to find ",
  77               File::Spec->catfile(@path), " in @INC"
  78             unless defined $f;
  79     }
  80     return $f;
  81 }
  82
  83 =head2 charinfo
  84
  85     use Unicode::UCD 'charinfo';
  86
  87     my $charinfo = charinfo(0x41);
  88
  89 charinfo() returns a reference to a hash that has the following fields
  90 as defined by the Unicode standard:
  91
  92     key
  93
  94     code             code point with at least four hexdigits
  95     name             name of the character IN UPPER CASE
  96     category         general category of the character
  97     combining        classes used in the Canonical Ordering Algorithm
  98     bidi             bidirectional category
  99     decomposition    character decomposition mapping
 100     decimal          if decimal digit this is the integer numeric value
 101     digit            if digit this is the numeric value
 102     numeric          if numeric is the integer or rational numeric value
 103     mirrored         if mirrored in bidirectional text
 104     unicode10        Unicode 1.0 name if existed and different
 105     comment          ISO 10646 comment field
 106     upper            uppercase equivalent mapping
 107     lower            lowercase equivalent mapping
 108     title            titlecase equivalent mapping
 109
 110     block            block the character belongs to (used in \p{In...})
 111     script           script the character belongs to
 112
 113 If no match is found, a reference to an empty hash is returned.
 114
 115 The C<block> property is the same as returned by charinfo().  It is
 116 not defined in the Unicode Character Database proper (Chapter 4 of the
 117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 118 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 119
 120 Note that you cannot do (de)composition and casing based solely on the
 121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 122 you will need also the compexcl(), casefold(), and casespec() functions.
 123
 124 =cut
 125
 126 sub _getcode {
 127     my $arg = shift;
 128
 129     if ($arg =~ /^\d+$/) {
 130         return $arg;
 131     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 132         return hex($1);
 133     }
 134
 135     return;
 136 }
 137
 138 # Lingua::KO::Hangul::Util not part of the standard distribution
 139 # but it will be used if available.
 140
 141 eval { require Lingua::KO::Hangul::Util };
 142 my $hasHangulUtil = ! $@;
 143 if ($hasHangulUtil) {
 144     Lingua::KO::Hangul::Util->import();
 145 }
 146
 147 sub hangul_decomp { # internal: called from charinfo
 148     if ($hasHangulUtil) {
 149         my @tmp = decomposeHangul(shift);
 150         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 151         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 152     }
 153     return;
 154 }
 155
 156 sub hangul_charname { # internal: called from charinfo
 157     return sprintf("HANGUL SYLLABLE-%04X", shift);
 158 }
 159
 160 sub han_charname { # internal: called from charinfo
 161     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 162 }
 163
 164 my @CharinfoRanges = (
 165 # block name
 166 # [ first, last, coderef to name, coderef to decompose ],
 167 # CJK Ideographs Extension A
 168   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 169 # CJK Ideographs
 170   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 171 # Hangul Syllables
 172   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 173 # Non-Private Use High Surrogates
 174   [ 0xD800,   0xDB7F,   undef,   undef  ],
 175 # Private Use High Surrogates
 176   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 177 # Low Surrogates
 178   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 179 # The Private Use Area
 180   [ 0xE000,   0xF8FF,   undef,   undef  ],
 181 # CJK Ideographs Extension B
 182   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 183 # Plane 15 Private Use Area
 184   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 185 # Plane 16 Private Use Area
 186   [ 0x100000, 0x10FFFD, undef,   undef  ],
 187 );
 188
 189 sub charinfo {
 190     my $arg  = shift;
 191     my $code = _getcode($arg);
 192     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 193         unless defined $code;
 194     my $hexk = sprintf("%06X", $code);
 195     my($rcode,$rname,$rdec);
 196     foreach my $range (@CharinfoRanges){
 197       if ($range->[0] <= $code && $code <= $range->[1]) {
 198         $rcode = $hexk;
 199         $rcode =~ s/^0+//;
 200         $rcode =  sprintf("%04X", hex($rcode));
 201         $rname = $range->[2] ? $range->[2]->($code) : '';
 202         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 203         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 204         last;
 205       }
 206     }
 207     openunicode(\$UNICODEFH, "Unicode.txt");
 208     if (defined $UNICODEFH) {
 209         use Search::Dict 1.02;
 210         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 211             my $line = <$UNICODEFH>;
 212             chomp $line;
 213             my %prop;
 214             @prop{qw(
 215                      code name category
 216                      combining bidi decomposition
 217                      decimal digit numeric
 218                      mirrored unicode10 comment
 219                      upper lower title
 220                     )} = split(/;/, $line, -1);
 221             $hexk =~ s/^0+//;
 222             $hexk =  sprintf("%04X", hex($hexk));
 223             if ($prop{code} eq $hexk) {
 224                 $prop{block}  = charblock($code);
 225                 $prop{script} = charscript($code);
 226                 if(defined $rname){
 227                     $prop{code} = $rcode;
 228                     $prop{name} = $rname;
 229                     $prop{decomposition} = $rdec;
 230                 }
 231                 return \%prop;
 232             }
 233         }
 234     }
 235     return;
 236 }
 237
 238 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 239     my ($table, $lo, $hi, $code) = @_;
 240
 241     return if $lo > $hi;
 242
 243     my $mid = int(($lo+$hi) / 2);
 244
 245     if ($table->[$mid]->[0] < $code) {
 246         if ($table->[$mid]->[1] >= $code) {
 247             return $table->[$mid]->[2];
 248         } else {
 249             _search($table, $mid + 1, $hi, $code);
 250         }
 251     } elsif ($table->[$mid]->[0] > $code) {
 252         _search($table, $lo, $mid - 1, $code);
 253     } else {
 254         return $table->[$mid]->[2];
 255     }
 256 }
 257
 258 sub charinrange {
 259     my ($range, $arg) = @_;
 260     my $code = _getcode($arg);
 261     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 262         unless defined $code;
 263     _search($range, 0, $#$range, $code);
 264 }
 265
 266 =head2 charblock
 267
 268     use Unicode::UCD 'charblock';
 269
 270     my $charblock = charblock(0x41);
 271     my $charblock = charblock(1234);
 272     my $charblock = charblock("0x263a");
 273     my $charblock = charblock("U+263a");
 274
 275     my $range     = charblock('Armenian');
 276
 277 With a B<code point argument> charblock() returns the I<block> the character
 278 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 279 positions within all blocks are defined.
 280
 281 See also L</Blocks versus Scripts>.
 282
 283 If supplied with an argument that can't be a code point, charblock() tries
 284 to do the opposite and interpret the argument as a character block. The
 285 return value is a I<range>: an anonymous list of lists that contain
 286 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 287 code point is in a range using the L</charinrange> function. If the
 288 argument is not a known charater block, C<undef> is returned.
 289
 290 =cut
 291
 292 my @BLOCKS;
 293 my %BLOCKS;
 294
 295 sub _charblocks {
 296     unless (@BLOCKS) {
 297         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 298             while (<$BLOCKSFH>) {
 299                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 300                     my ($lo, $hi) = (hex($1), hex($2));
 301                     my $subrange = [ $lo, $hi, $3 ];
 302                     push @BLOCKS, $subrange;
 303                     push @{$BLOCKS{$3}}, $subrange;
 304                 }
 305             }
 306             close($BLOCKSFH);
 307         }
 308     }
 309 }
 310
 311 sub charblock {
 312     my $arg = shift;
 313
 314     _charblocks() unless @BLOCKS;
 315
 316     my $code = _getcode($arg);
 317
 318     if (defined $code) {
 319         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 320     } else {
 321         if (exists $BLOCKS{$arg}) {
 322             return $BLOCKS{$arg};
 323         } else {
 324             return;
 325         }
 326     }
 327 }
 328
 329 =head2 charscript
 330
 331     use Unicode::UCD 'charscript';
 332
 333     my $charscript = charscript(0x41);
 334     my $charscript = charscript(1234);
 335     my $charscript = charscript("U+263a");
 336
 337     my $range      = charscript('Thai');
 338
 339 With a B<code point argument> charscript() returns the I<script> the
 340 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 341
 342 See also L</Blocks versus Scripts>.
 343
 344 If supplied with an argument that can't be a code point, charscript() tries
 345 to do the opposite and interpret the argument as a character script. The
 346 return value is a I<range>: an anonymous list of lists that contain
 347 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 348 code point is in a range using the L</charinrange> function. If the
 349 argument is not a known charater script, C<undef> is returned.
 350
 351 =cut
 352
 353 my @SCRIPTS;
 354 my %SCRIPTS;
 355
 356 sub _charscripts {
 357     unless (@SCRIPTS) {
 358         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 359             while (<$SCRIPTSFH>) {
 360                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 361                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 362                     my $script = lc($3);
 363                     $script =~ s/\b(\w)/uc($1)/ge;
 364                     my $subrange = [ $lo, $hi, $script ];
 365                     push @SCRIPTS, $subrange;
 366                     push @{$SCRIPTS{$script}}, $subrange;
 367                 }
 368             }
 369             close($SCRIPTSFH);
 370             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 371         }
 372     }
 373 }
 374
 375 sub charscript {
 376     my $arg = shift;
 377
 378     _charscripts() unless @SCRIPTS;
 379
 380     my $code = _getcode($arg);
 381
 382     if (defined $code) {
 383         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 384     } else {
 385         if (exists $SCRIPTS{$arg}) {
 386             return $SCRIPTS{$arg};
 387         } else {
 388             return;
 389         }
 390     }
 391 }
 392
 393 =head2 charblocks
 394
 395     use Unicode::UCD 'charblocks';
 396
 397     my $charblocks = charblocks();
 398
 399 charblocks() returns a reference to a hash with the known block names
 400 as the keys, and the code point ranges (see L</charblock>) as the values.
 401
 402 See also L</Blocks versus Scripts>.
 403
 404 =cut
 405
 406 sub charblocks {
 407     _charblocks() unless %BLOCKS;
 408     return \%BLOCKS;
 409 }
 410
 411 =head2 charscripts
 412
 413     use Unicode::UCD 'charscripts';
 414
 415     my %charscripts = charscripts();
 416
 417 charscripts() returns a hash with the known script names as the keys,
 418 and the code point ranges (see L</charscript>) as the values.
 419
 420 See also L</Blocks versus Scripts>.
 421
 422 =cut
 423
 424 sub charscripts {
 425     _charscripts() unless %SCRIPTS;
 426     return \%SCRIPTS;
 427 }
 428
 429 =head2 Blocks versus Scripts
 430
 431 The difference between a block and a script is that scripts are closer
 432 to the linguistic notion of a set of characters required to present
 433 languages, while block is more of an artifact of the Unicode character
 434 numbering and separation into blocks of (mostly) 256 characters.
 435
 436 For example the Latin B<script> is spread over several B<blocks>, such
 437 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 438 C<Latin Extended-B>.  On the other hand, the Latin script does not
 439 contain all the characters of the C<Basic Latin> block (also known as
 440 the ASCII): it includes only the letters, and not, for example, the digits
 441 or the punctuation.
 442
 443 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 444
 445 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 446
 447 =head2 Matching Scripts and Blocks
 448
 449 Scripts are matched with the regular-expression construct
 450 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
 451 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
 452 any of the 256 code points in the Tibetan block).
 453
 454 =head2 Code Point Arguments
 455
 456 A <code point argument> is either a decimal or a hexadecimal scalar
 457 designating a Unicode character, or "U+" followed by hexadecimals
 458 designating a Unicode character.  Note that Unicode is B<not> limited
 459 to 16 bits (the number of Unicode characters is open-ended, in theory
 460 unlimited): you may have more than 4 hexdigits.
 461
 462 =head2 charinrange
 463
 464 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 465 can also test whether a code point is in the I<range> as returned by
 466 L</charblock> and L</charscript> or as the values of the hash returned
 467 by L</charblocks> and L</charscripts> by using charinrange():
 468
 469     use Unicode::UCD qw(charscript charinrange);
 470
 471     $range = charscript('Hiragana');
 472     print "looks like hiragana\n" if charinrange($range, $codepoint);
 473
 474 =cut
 475
 476 =head2 compexcl
 477
 478     use Unicode::UCD 'compexcl';
 479
 480     my $compexcl = compexcl("09dc");
 481
 482 The compexcl() returns the composition exclusion (that is, if the
 483 character should not be produced during a precomposition) of the
 484 character specified by a B<code point argument>.
 485
 486 If there is a composition exclusion for the character, true is
 487 returned.  Otherwise, false is returned.
 488
 489 =cut
 490
 491 my %COMPEXCL;
 492
 493 sub _compexcl {
 494     unless (%COMPEXCL) {
 495         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 496             while (<$COMPEXCLFH>) {
 497                 if (/^([0-9A-F]+) \# /) {
 498                     my $code = hex($1);
 499                     $COMPEXCL{$code} = undef;
 500                 }
 501             }
 502             close($COMPEXCLFH);
 503         }
 504     }
 505 }
 506
 507 sub compexcl {
 508     my $arg  = shift;
 509     my $code = _getcode($arg);
 510     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 511         unless defined $code;
 512
 513     _compexcl() unless %COMPEXCL;
 514
 515     return exists $COMPEXCL{$code};
 516 }
 517
 518 =head2 casefold
 519
 520     use Unicode::UCD 'casefold';
 521
 522     my %casefold = casefold("09dc");
 523
 524 The casefold() returns the locale-independent case folding of the
 525 character specified by a B<code point argument>.
 526
 527 If there is a case folding for that character, a reference to a hash
 528 with the following fields is returned:
 529
 530     key
 531
 532     code             code point with at least four hexdigits
 533     status           "C", "F", "S", or "I"
 534     mapping          one or more codes separated by spaces
 535
 536 The meaning of the I<status> is as follows:
 537
 538    C                 common case folding, common mappings shared
 539                      by both simple and full mappings
 540    F                 full case folding, mappings that cause strings
 541                      to grow in length. Multiple characters are separated
 542                      by spaces
 543    S                 simple case folding, mappings to single characters
 544                      where different from F
 545    I                 special case for dotted uppercase I and
 546                      dotless lowercase i
 547                      - If this mapping is included, the result is
 548                        case-insensitive, but dotless and dotted I's
 549                        are not distinguished
 550                      - If this mapping is excluded, the result is not
 551                        fully case-insensitive, but dotless and dotted
 552                        I's are distinguished
 553
 554 If there is no case folding for that character, C<undef> is returned.
 555
 556 For more information about case mappings see
 557 http://www.unicode.org/unicode/reports/tr21/
 558
 559 =cut
 560
 561 my %CASEFOLD;
 562
 563 sub _casefold {
 564     unless (%CASEFOLD) {
 565         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 566             while (<$CASEFOLDFH>) {
 567                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 568                     my $code = hex($1);
 569                     $CASEFOLD{$code} = { code    => $1,
 570                                          status  => $2,
 571                                          mapping => $3 };
 572                 }
 573             }
 574             close($CASEFOLDFH);
 575         }
 576     }
 577 }
 578
 579 sub casefold {
 580     my $arg  = shift;
 581     my $code = _getcode($arg);
 582     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 583         unless defined $code;
 584
 585     _casefold() unless %CASEFOLD;
 586
 587     return $CASEFOLD{$code};
 588 }
 589
 590 =head2 casespec
 591
 592     use Unicode::UCD 'casespec';
 593
 594     my %casespec = casespec("09dc");
 595
 596 The casespec() returns the potentially locale-dependent case mapping
 597 of the character specified by a B<code point argument>.  The mapping
 598 may change the length of the string (which the basic Unicode case
 599 mappings as returned by charinfo() never do).
 600
 601 If there is a case folding for that character, a reference to a hash
 602 with the following fields is returned:
 603
 604     key
 605
 606     code             code point with at least four hexdigits
 607     lower            lowercase
 608     title            titlecase
 609     upper            uppercase
 610     condition        condition list (may be undef)
 611
 612 The C<condition> is optional.  Where present, it consists of one or
 613 more I<locales> or I<contexts>, separated by spaces (other than as
 614 used to separate elements, spaces are to be ignored).  A condition
 615 list overrides the normal behavior if all of the listed conditions are
 616 true.  Case distinctions in the condition list are not significant.
 617 Conditions preceded by "NON_" represent the negation of the condition
 618
 619 Note that when there are multiple case folding definitions for a
 620 single code point because of different locales, the value returned by
 621 casespec() is a hash reference which has the locales as the keys and
 622 hash references as described above as the values.
 623
 624 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 625 followed by a "_" and a 2-letter ISO language code (possibly followed
 626 by a "_" and a variant code).  You can find the lists of those codes,
 627 see L<Locale::Country> and L<Locale::Language>.
 628
 629 A I<context> is one of the following choices:
 630
 631     FINAL            The letter is not followed by a letter of
 632                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 633     MODERN           The mapping is only used for modern text
 634     AFTER_i          The last base character was "i" (U+0069)
 635
 636 For more information about case mappings see
 637 http://www.unicode.org/unicode/reports/tr21/
 638
 639 =cut
 640
 641 my %CASESPEC;
 642
 643 sub _casespec {
 644     unless (%CASESPEC) {
 645         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 646             while (<$CASESPECFH>) {
 647                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 648                     my ($hexcode, $lower, $title, $upper, $condition) =
 649                         ($1, $2, $3, $4, $5);
 650                     my $code = hex($hexcode);
 651                     if (exists $CASESPEC{$code}) {
 652                         if (exists $CASESPEC{$code}->{code}) {
 653                             my ($oldlower,
 654                                 $oldtitle,
 655                                 $oldupper,
 656                                 $oldcondition) =
 657                                     @{$CASESPEC{$code}}{qw(lower
 658                                                            title
 659                                                            upper
 660                                                            condition)};
 661                             my ($oldlocale) =
 662                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 663                             if (defined $oldlocale) {
 664                                 delete $CASESPEC{$code};
 665                                 $CASESPEC{$code}->{$oldlocale} =
 666                                 { code      => $hexcode,
 667                                   lower     => $oldlower,
 668                                   title     => $oldtitle,
 669                                   upper     => $oldupper,
 670                                   condition => $oldcondition };
 671                             } else {
 672                                 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
 673                             }
 674                         }
 675                         my ($locale) =
 676                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 677                         $CASESPEC{$code}->{$locale} =
 678                         { code      => $hexcode,
 679                           lower     => $lower,
 680                           title     => $title,
 681                           upper     => $upper,
 682                           condition => $condition };
 683                     } else {
 684                         $CASESPEC{$code} =
 685                         { code      => $hexcode,
 686                           lower     => $lower,
 687                           title     => $title,
 688                           upper     => $upper,
 689                           condition => $condition };
 690                     }
 691                 }
 692             }
 693             close($CASESPECFH);
 694         }
 695     }
 696 }
 697
 698 sub casespec {
 699     my $arg  = shift;
 700     my $code = _getcode($arg);
 701     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 702         unless defined $code;
 703
 704     _casespec() unless %CASESPEC;
 705
 706     return $CASESPEC{$code};
 707 }
 708
 709 =head2 Unicode::UCD::UnicodeVersion
 710
 711 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 712 Character Database, in other words, the version of the Unicode
 713 standard the database implements.  The version is a string
 714 of numbers delimited by dots (C<'.'>).
 715
 716 =cut
 717
 718 my $UNICODEVERSION;
 719
 720 sub UnicodeVersion {
 721     unless (defined $UNICODEVERSION) {
 722         openunicode(\$VERSIONFH, "version");
 723         chomp($UNICODEVERSION = <$VERSIONFH>);
 724         close($VERSIONFH);
 725         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 726             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 727     }
 728     return $UNICODEVERSION;
 729 }
 730
 731 =head2 Implementation Note
 732
 733 The first use of charinfo() opens a read-only filehandle to the Unicode
 734 Character Database (the database is included in the Perl distribution).
 735 The filehandle is then kept open for further queries.  In other words,
 736 if you are wondering where one of your filehandles went, that's where.
 737
 738 =head1 BUGS
 739
 740 Does not yet support EBCDIC platforms.
 741
 742 =head1 AUTHOR
 743
 744 Jarkko Hietaniemi
 745
 746 =cut
 747
 748 1;