lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.23';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     compexcl
  19                     casefold casespec
  20                     namedseq);
  21
  22 use Carp;
  23
  24 =head1 NAME
  25
  26 Unicode::UCD - Unicode character database
  27
  28 =head1 SYNOPSIS
  29
  30     use Unicode::UCD 'charinfo';
  31     my $charinfo   = charinfo($codepoint);
  32
  33     use Unicode::UCD 'charblock';
  34     my $charblock  = charblock($codepoint);
  35
  36     use Unicode::UCD 'charscript';
  37     my $charscript = charscript($codepoint);
  38
  39     use Unicode::UCD 'charblocks';
  40     my $charblocks = charblocks();
  41
  42     use Unicode::UCD 'charscripts';
  43     my %charscripts = charscripts();
  44
  45     use Unicode::UCD qw(charscript charinrange);
  46     my $range = charscript($script);
  47     print "looks like $script\n" if charinrange($range, $codepoint);
  48
  49     use Unicode::UCD 'compexcl';
  50     my $compexcl = compexcl($codepoint);
  51
  52     use Unicode::UCD 'namedseq';
  53     my $namedseq = namedseq($named_sequence_name);
  54
  55     my $unicode_version = Unicode::UCD::UnicodeVersion();
  56
  57 =head1 DESCRIPTION
  58
  59 The Unicode::UCD module offers a simple interface to the Unicode
  60 Character Database.
  61
  62 =cut
  63
  64 my $UNICODEFH;
  65 my $BLOCKSFH;
  66 my $SCRIPTSFH;
  67 my $VERSIONFH;
  68 my $COMPEXCLFH;
  69 my $CASEFOLDFH;
  70 my $CASESPECFH;
  71 my $NAMEDSEQFH;
  72
  73 sub openunicode {
  74     my ($rfh, @path) = @_;
  75     my $f;
  76     unless (defined $$rfh) {
  77         for my $d (@INC) {
  78             use File::Spec;
  79             $f = File::Spec->catfile($d, "unicore", @path);
  80             last if open($$rfh, $f);
  81             undef $f;
  82         }
  83         croak __PACKAGE__, ": failed to find ",
  84               File::Spec->catfile(@path), " in @INC"
  85             unless defined $f;
  86     }
  87     return $f;
  88 }
  89
  90 =head2 charinfo
  91
  92     use Unicode::UCD 'charinfo';
  93
  94     my $charinfo = charinfo(0x41);
  95
  96 charinfo() returns a reference to a hash that has the following fields
  97 as defined by the Unicode standard:
  98
  99     key
 100
 101     code             code point with at least four hexdigits
 102     name             name of the character IN UPPER CASE
 103     category         general category of the character
 104     combining        classes used in the Canonical Ordering Algorithm
 105     bidi             bidirectional category
 106     decomposition    character decomposition mapping
 107     decimal          if decimal digit this is the integer numeric value
 108     digit            if digit this is the numeric value
 109     numeric          if numeric is the integer or rational numeric value
 110     mirrored         if mirrored in bidirectional text
 111     unicode10        Unicode 1.0 name if existed and different
 112     comment          ISO 10646 comment field
 113     upper            uppercase equivalent mapping
 114     lower            lowercase equivalent mapping
 115     title            titlecase equivalent mapping
 116
 117     block            block the character belongs to (used in \p{In...})
 118     script           script the character belongs to
 119
 120 If no match is found, a reference to an empty hash is returned.
 121
 122 The C<block> property is the same as returned by charinfo().  It is
 123 not defined in the Unicode Character Database proper (Chapter 4 of the
 124 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 125 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 126
 127 Note that you cannot do (de)composition and casing based solely on the
 128 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 129 you will need also the compexcl(), casefold(), and casespec() functions.
 130
 131 =cut
 132
 133 # NB: This function is duplicated in charnames.pm
 134 sub _getcode {
 135     my $arg = shift;
 136
 137     if ($arg =~ /^[1-9]\d*$/) {
 138         return $arg;
 139     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 140         return hex($1);
 141     }
 142
 143     return;
 144 }
 145
 146 # Lingua::KO::Hangul::Util not part of the standard distribution
 147 # but it will be used if available.
 148
 149 eval { require Lingua::KO::Hangul::Util };
 150 my $hasHangulUtil = ! $@;
 151 if ($hasHangulUtil) {
 152     Lingua::KO::Hangul::Util->import();
 153 }
 154
 155 sub hangul_decomp { # internal: called from charinfo
 156     if ($hasHangulUtil) {
 157         my @tmp = decomposeHangul(shift);
 158         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 159         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 160     }
 161     return;
 162 }
 163
 164 sub hangul_charname { # internal: called from charinfo
 165     return sprintf("HANGUL SYLLABLE-%04X", shift);
 166 }
 167
 168 sub han_charname { # internal: called from charinfo
 169     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 170 }
 171
 172 my @CharinfoRanges = (
 173 # block name
 174 # [ first, last, coderef to name, coderef to decompose ],
 175 # CJK Ideographs Extension A
 176   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 177 # CJK Ideographs
 178   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 179 # Hangul Syllables
 180   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 181 # Non-Private Use High Surrogates
 182   [ 0xD800,   0xDB7F,   undef,   undef  ],
 183 # Private Use High Surrogates
 184   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 185 # Low Surrogates
 186   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 187 # The Private Use Area
 188   [ 0xE000,   0xF8FF,   undef,   undef  ],
 189 # CJK Ideographs Extension B
 190   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 191 # Plane 15 Private Use Area
 192   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 193 # Plane 16 Private Use Area
 194   [ 0x100000, 0x10FFFD, undef,   undef  ],
 195 );
 196
 197 sub charinfo {
 198     my $arg  = shift;
 199     my $code = _getcode($arg);
 200     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 201         unless defined $code;
 202     my $hexk = sprintf("%06X", $code);
 203     my($rcode,$rname,$rdec);
 204     foreach my $range (@CharinfoRanges){
 205       if ($range->[0] <= $code && $code <= $range->[1]) {
 206         $rcode = $hexk;
 207         $rcode =~ s/^0+//;
 208         $rcode =  sprintf("%04X", hex($rcode));
 209         $rname = $range->[2] ? $range->[2]->($code) : '';
 210         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 211         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 212         last;
 213       }
 214     }
 215     openunicode(\$UNICODEFH, "UnicodeData.txt");
 216     if (defined $UNICODEFH) {
 217         use Search::Dict 1.02;
 218         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 219             my $line = <$UNICODEFH>;
 220             return unless defined $line;
 221             chomp $line;
 222             my %prop;
 223             @prop{qw(
 224                      code name category
 225                      combining bidi decomposition
 226                      decimal digit numeric
 227                      mirrored unicode10 comment
 228                      upper lower title
 229                     )} = split(/;/, $line, -1);
 230             $hexk =~ s/^0+//;
 231             $hexk =  sprintf("%04X", hex($hexk));
 232             if ($prop{code} eq $hexk) {
 233                 $prop{block}  = charblock($code);
 234                 $prop{script} = charscript($code);
 235                 if(defined $rname){
 236                     $prop{code} = $rcode;
 237                     $prop{name} = $rname;
 238                     $prop{decomposition} = $rdec;
 239                 }
 240                 return \%prop;
 241             }
 242         }
 243     }
 244     return;
 245 }
 246
 247 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 248     my ($table, $lo, $hi, $code) = @_;
 249
 250     return if $lo > $hi;
 251
 252     my $mid = int(($lo+$hi) / 2);
 253
 254     if ($table->[$mid]->[0] < $code) {
 255         if ($table->[$mid]->[1] >= $code) {
 256             return $table->[$mid]->[2];
 257         } else {
 258             _search($table, $mid + 1, $hi, $code);
 259         }
 260     } elsif ($table->[$mid]->[0] > $code) {
 261         _search($table, $lo, $mid - 1, $code);
 262     } else {
 263         return $table->[$mid]->[2];
 264     }
 265 }
 266
 267 sub charinrange {
 268     my ($range, $arg) = @_;
 269     my $code = _getcode($arg);
 270     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 271         unless defined $code;
 272     _search($range, 0, $#$range, $code);
 273 }
 274
 275 =head2 charblock
 276
 277     use Unicode::UCD 'charblock';
 278
 279     my $charblock = charblock(0x41);
 280     my $charblock = charblock(1234);
 281     my $charblock = charblock("0x263a");
 282     my $charblock = charblock("U+263a");
 283
 284     my $range     = charblock('Armenian');
 285
 286 With a B<code point argument> charblock() returns the I<block> the character
 287 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 288 positions within all blocks are defined.
 289
 290 See also L</Blocks versus Scripts>.
 291
 292 If supplied with an argument that can't be a code point, charblock() tries
 293 to do the opposite and interpret the argument as a character block. The
 294 return value is a I<range>: an anonymous list of lists that contain
 295 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 296 a code point is in a range using the L</charinrange> function. If the
 297 argument is not a known character block, C<undef> is returned.
 298
 299 =cut
 300
 301 my @BLOCKS;
 302 my %BLOCKS;
 303
 304 sub _charblocks {
 305     unless (@BLOCKS) {
 306         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 307             local $_;
 308             while (<$BLOCKSFH>) {
 309                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 310                     my ($lo, $hi) = (hex($1), hex($2));
 311                     my $subrange = [ $lo, $hi, $3 ];
 312                     push @BLOCKS, $subrange;
 313                     push @{$BLOCKS{$3}}, $subrange;
 314                 }
 315             }
 316             close($BLOCKSFH);
 317         }
 318     }
 319 }
 320
 321 sub charblock {
 322     my $arg = shift;
 323
 324     _charblocks() unless @BLOCKS;
 325
 326     my $code = _getcode($arg);
 327
 328     if (defined $code) {
 329         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 330     } else {
 331         if (exists $BLOCKS{$arg}) {
 332             return dclone $BLOCKS{$arg};
 333         } else {
 334             return;
 335         }
 336     }
 337 }
 338
 339 =head2 charscript
 340
 341     use Unicode::UCD 'charscript';
 342
 343     my $charscript = charscript(0x41);
 344     my $charscript = charscript(1234);
 345     my $charscript = charscript("U+263a");
 346
 347     my $range      = charscript('Thai');
 348
 349 With a B<code point argument> charscript() returns the I<script> the
 350 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 351
 352 See also L</Blocks versus Scripts>.
 353
 354 If supplied with an argument that can't be a code point, charscript() tries
 355 to do the opposite and interpret the argument as a character script. The
 356 return value is a I<range>: an anonymous list of lists that contain
 357 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 358 code point is in a range using the L</charinrange> function. If the
 359 argument is not a known character script, C<undef> is returned.
 360
 361 =cut
 362
 363 my @SCRIPTS;
 364 my %SCRIPTS;
 365
 366 sub _charscripts {
 367     unless (@SCRIPTS) {
 368         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 369             local $_;
 370             while (<$SCRIPTSFH>) {
 371                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 372                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 373                     my $script = lc($3);
 374                     $script =~ s/\b(\w)/uc($1)/ge;
 375                     my $subrange = [ $lo, $hi, $script ];
 376                     push @SCRIPTS, $subrange;
 377                     push @{$SCRIPTS{$script}}, $subrange;
 378                 }
 379             }
 380             close($SCRIPTSFH);
 381             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 382         }
 383     }
 384 }
 385
 386 sub charscript {
 387     my $arg = shift;
 388
 389     _charscripts() unless @SCRIPTS;
 390
 391     my $code = _getcode($arg);
 392
 393     if (defined $code) {
 394         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 395     } else {
 396         if (exists $SCRIPTS{$arg}) {
 397             return dclone $SCRIPTS{$arg};
 398         } else {
 399             return;
 400         }
 401     }
 402 }
 403
 404 =head2 charblocks
 405
 406     use Unicode::UCD 'charblocks';
 407
 408     my $charblocks = charblocks();
 409
 410 charblocks() returns a reference to a hash with the known block names
 411 as the keys, and the code point ranges (see L</charblock>) as the values.
 412
 413 See also L</Blocks versus Scripts>.
 414
 415 =cut
 416
 417 sub charblocks {
 418     _charblocks() unless %BLOCKS;
 419     return dclone \%BLOCKS;
 420 }
 421
 422 =head2 charscripts
 423
 424     use Unicode::UCD 'charscripts';
 425
 426     my %charscripts = charscripts();
 427
 428 charscripts() returns a hash with the known script names as the keys,
 429 and the code point ranges (see L</charscript>) as the values.
 430
 431 See also L</Blocks versus Scripts>.
 432
 433 =cut
 434
 435 sub charscripts {
 436     _charscripts() unless %SCRIPTS;
 437     return dclone \%SCRIPTS;
 438 }
 439
 440 =head2 Blocks versus Scripts
 441
 442 The difference between a block and a script is that scripts are closer
 443 to the linguistic notion of a set of characters required to present
 444 languages, while block is more of an artifact of the Unicode character
 445 numbering and separation into blocks of (mostly) 256 characters.
 446
 447 For example the Latin B<script> is spread over several B<blocks>, such
 448 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 449 C<Latin Extended-B>.  On the other hand, the Latin script does not
 450 contain all the characters of the C<Basic Latin> block (also known as
 451 the ASCII): it includes only the letters, and not, for example, the digits
 452 or the punctuation.
 453
 454 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 455
 456 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 457
 458 =head2 Matching Scripts and Blocks
 459
 460 Scripts are matched with the regular-expression construct
 461 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
 462 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
 463 any of the 256 code points in the Tibetan block).
 464
 465 =head2 Code Point Arguments
 466
 467 A I<code point argument> is either a decimal or a hexadecimal scalar
 468 designating a Unicode character, or C<U+> followed by hexadecimals
 469 designating a Unicode character.  In other words, if you want a code
 470 point to be interpreted as a hexadecimal number, you must prefix it
 471 with either C<0x> or C<U+>, because a string like e.g. C<123> will
 472 be interpreted as a decimal code point.  Also note that Unicode is
 473 B<not> limited to 16 bits (the number of Unicode characters is
 474 open-ended, in theory unlimited): you may have more than 4 hexdigits.
 475
 476 =head2 charinrange
 477
 478 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 479 can also test whether a code point is in the I<range> as returned by
 480 L</charblock> and L</charscript> or as the values of the hash returned
 481 by L</charblocks> and L</charscripts> by using charinrange():
 482
 483     use Unicode::UCD qw(charscript charinrange);
 484
 485     $range = charscript('Hiragana');
 486     print "looks like hiragana\n" if charinrange($range, $codepoint);
 487
 488 =cut
 489
 490 =head2 compexcl
 491
 492     use Unicode::UCD 'compexcl';
 493
 494     my $compexcl = compexcl("09dc");
 495
 496 The compexcl() returns the composition exclusion (that is, if the
 497 character should not be produced during a precomposition) of the
 498 character specified by a B<code point argument>.
 499
 500 If there is a composition exclusion for the character, true is
 501 returned.  Otherwise, false is returned.
 502
 503 =cut
 504
 505 my %COMPEXCL;
 506
 507 sub _compexcl {
 508     unless (%COMPEXCL) {
 509         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 510             local $_;
 511             while (<$COMPEXCLFH>) {
 512                 if (/^([0-9A-F]+)\s+\#\s+/) {
 513                     my $code = hex($1);
 514                     $COMPEXCL{$code} = undef;
 515                 }
 516             }
 517             close($COMPEXCLFH);
 518         }
 519     }
 520 }
 521
 522 sub compexcl {
 523     my $arg  = shift;
 524     my $code = _getcode($arg);
 525     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 526         unless defined $code;
 527
 528     _compexcl() unless %COMPEXCL;
 529
 530     return exists $COMPEXCL{$code};
 531 }
 532
 533 =head2 casefold
 534
 535     use Unicode::UCD 'casefold';
 536
 537     my $casefold = casefold("00DF");
 538
 539 The casefold() returns the locale-independent case folding of the
 540 character specified by a B<code point argument>.
 541
 542 If there is a case folding for that character, a reference to a hash
 543 with the following fields is returned:
 544
 545     key
 546
 547     code             code point with at least four hexdigits
 548     status           "C", "F", "S", or "I"
 549     mapping          one or more codes separated by spaces
 550
 551 The meaning of the I<status> is as follows:
 552
 553    C                 common case folding, common mappings shared
 554                      by both simple and full mappings
 555    F                 full case folding, mappings that cause strings
 556                      to grow in length. Multiple characters are separated
 557                      by spaces
 558    S                 simple case folding, mappings to single characters
 559                      where different from F
 560    I                 special case for dotted uppercase I and
 561                      dotless lowercase i
 562                      - If this mapping is included, the result is
 563                        case-insensitive, but dotless and dotted I's
 564                        are not distinguished
 565                      - If this mapping is excluded, the result is not
 566                        fully case-insensitive, but dotless and dotted
 567                        I's are distinguished
 568
 569 If there is no case folding for that character, C<undef> is returned.
 570
 571 For more information about case mappings see
 572 http://www.unicode.org/unicode/reports/tr21/
 573
 574 =cut
 575
 576 my %CASEFOLD;
 577
 578 sub _casefold {
 579     unless (%CASEFOLD) {
 580         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 581             local $_;
 582             while (<$CASEFOLDFH>) {
 583                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 584                     my $code = hex($1);
 585                     $CASEFOLD{$code} = { code    => $1,
 586                                          status  => $2,
 587                                          mapping => $3 };
 588                 }
 589             }
 590             close($CASEFOLDFH);
 591         }
 592     }
 593 }
 594
 595 sub casefold {
 596     my $arg  = shift;
 597     my $code = _getcode($arg);
 598     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 599         unless defined $code;
 600
 601     _casefold() unless %CASEFOLD;
 602
 603     return $CASEFOLD{$code};
 604 }
 605
 606 =head2 casespec
 607
 608     use Unicode::UCD 'casespec';
 609
 610     my $casespec = casespec("FB00");
 611
 612 The casespec() returns the potentially locale-dependent case mapping
 613 of the character specified by a B<code point argument>.  The mapping
 614 may change the length of the string (which the basic Unicode case
 615 mappings as returned by charinfo() never do).
 616
 617 If there is a case folding for that character, a reference to a hash
 618 with the following fields is returned:
 619
 620     key
 621
 622     code             code point with at least four hexdigits
 623     lower            lowercase
 624     title            titlecase
 625     upper            uppercase
 626     condition        condition list (may be undef)
 627
 628 The C<condition> is optional.  Where present, it consists of one or
 629 more I<locales> or I<contexts>, separated by spaces (other than as
 630 used to separate elements, spaces are to be ignored).  A condition
 631 list overrides the normal behavior if all of the listed conditions are
 632 true.  Case distinctions in the condition list are not significant.
 633 Conditions preceded by "NON_" represent the negation of the condition.
 634
 635 Note that when there are multiple case folding definitions for a
 636 single code point because of different locales, the value returned by
 637 casespec() is a hash reference which has the locales as the keys and
 638 hash references as described above as the values.
 639
 640 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 641 followed by a "_" and a 2-letter ISO language code (possibly followed
 642 by a "_" and a variant code).  You can find the lists of those codes,
 643 see L<Locale::Country> and L<Locale::Language>.
 644
 645 A I<context> is one of the following choices:
 646
 647     FINAL            The letter is not followed by a letter of
 648                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 649     MODERN           The mapping is only used for modern text
 650     AFTER_i          The last base character was "i" (U+0069)
 651
 652 For more information about case mappings see
 653 http://www.unicode.org/unicode/reports/tr21/
 654
 655 =cut
 656
 657 my %CASESPEC;
 658
 659 sub _casespec {
 660     unless (%CASESPEC) {
 661         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
 662             local $_;
 663             while (<$CASESPECFH>) {
 664                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 665                     my ($hexcode, $lower, $title, $upper, $condition) =
 666                         ($1, $2, $3, $4, $5);
 667                     my $code = hex($hexcode);
 668                     if (exists $CASESPEC{$code}) {
 669                         if (exists $CASESPEC{$code}->{code}) {
 670                             my ($oldlower,
 671                                 $oldtitle,
 672                                 $oldupper,
 673                                 $oldcondition) =
 674                                     @{$CASESPEC{$code}}{qw(lower
 675                                                            title
 676                                                            upper
 677                                                            condition)};
 678                             if (defined $oldcondition) {
 679                                 my ($oldlocale) =
 680                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 681                                 delete $CASESPEC{$code};
 682                                 $CASESPEC{$code}->{$oldlocale} =
 683                                 { code      => $hexcode,
 684                                   lower     => $oldlower,
 685                                   title     => $oldtitle,
 686                                   upper     => $oldupper,
 687                                   condition => $oldcondition };
 688                             }
 689                         }
 690                         my ($locale) =
 691                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 692                         $CASESPEC{$code}->{$locale} =
 693                         { code      => $hexcode,
 694                           lower     => $lower,
 695                           title     => $title,
 696                           upper     => $upper,
 697                           condition => $condition };
 698                     } else {
 699                         $CASESPEC{$code} =
 700                         { code      => $hexcode,
 701                           lower     => $lower,
 702                           title     => $title,
 703                           upper     => $upper,
 704                           condition => $condition };
 705                     }
 706                 }
 707             }
 708             close($CASESPECFH);
 709         }
 710     }
 711 }
 712
 713 sub casespec {
 714     my $arg  = shift;
 715     my $code = _getcode($arg);
 716     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 717         unless defined $code;
 718
 719     _casespec() unless %CASESPEC;
 720
 721     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
 722 }
 723
 724 =head2 namedseq()
 725
 726     use Unicode::UCD 'namedseq';
 727
 728     my $namedseq = namedseq("KATAKANA LETTER AINU P");
 729     my @namedseq = namedseq("KATAKANA LETTER AINU P");
 730     my %namedseq = namedseq();
 731
 732 If used with a single argument in a scalar context, returns the string
 733 consisting of the code points of the named sequence, or C<undef> if no
 734 named sequence by that name exists.  If used with a single argument in
 735 a list context, returns list of the code points.  If used with no
 736 arguments in a list context, returns a hash with the names of the
 737 named sequences as the keys and the named sequences as strings as
 738 the values.  Otherwise, returns C<undef> or empty list depending
 739 on the context.
 740
 741 (New from Unicode 4.1.0)
 742
 743 =cut
 744
 745 my %NAMEDSEQ;
 746
 747 sub _namedseq {
 748     unless (%NAMEDSEQ) {
 749         if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
 750             local $_;
 751             while (<$NAMEDSEQFH>) {
 752                 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
 753                     my ($n, $s) = ($1, $2);
 754                     my @s = map { chr(hex($_)) } split(' ', $s);
 755                     $NAMEDSEQ{$n} = join("", @s);
 756                 }
 757             }
 758             close($NAMEDSEQFH);
 759         }
 760     }
 761 }
 762
 763 sub namedseq {
 764     _namedseq() unless %NAMEDSEQ;
 765     my $wantarray = wantarray();
 766     if (defined $wantarray) {
 767         if ($wantarray) {
 768             if (@_ == 0) {
 769                 return %NAMEDSEQ;
 770             } elsif (@_ == 1) {
 771                 my $s = $NAMEDSEQ{ $_[0] };
 772                 return defined $s ? map { ord($_) } split('', $s) : ();
 773             }
 774         } elsif (@_ == 1) {
 775             return $NAMEDSEQ{ $_[0] };
 776         }
 777     }
 778     return;
 779 }
 780
 781 =head2 Unicode::UCD::UnicodeVersion
 782
 783 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 784 Character Database, in other words, the version of the Unicode
 785 standard the database implements.  The version is a string
 786 of numbers delimited by dots (C<'.'>).
 787
 788 =cut
 789
 790 my $UNICODEVERSION;
 791
 792 sub UnicodeVersion {
 793     unless (defined $UNICODEVERSION) {
 794         openunicode(\$VERSIONFH, "version");
 795         chomp($UNICODEVERSION = <$VERSIONFH>);
 796         close($VERSIONFH);
 797         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 798             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 799     }
 800     return $UNICODEVERSION;
 801 }
 802
 803 =head2 Implementation Note
 804
 805 The first use of charinfo() opens a read-only filehandle to the Unicode
 806 Character Database (the database is included in the Perl distribution).
 807 The filehandle is then kept open for further queries.  In other words,
 808 if you are wondering where one of your filehandles went, that's where.
 809
 810 =head1 BUGS
 811
 812 Does not yet support EBCDIC platforms.
 813
 814 =head1 AUTHOR
 815
 816 Jarkko Hietaniemi
 817
 818 =cut
 819
 820 1;