lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11
  12 our @EXPORT_OK = qw(charinfo
  13                     charblock charscript
  14                     charblocks charscripts
  15                     charinrange
  16                     compexcl
  17                     casefold casespec);
  18
  19 use Carp;
  20
  21 =head1 NAME
  22
  23 Unicode::UCD - Unicode character database
  24
  25 =head1 SYNOPSIS
  26
  27     use Unicode::UCD 'charinfo';
  28     my $charinfo   = charinfo($codepoint);
  29
  30     use Unicode::UCD 'charblock';
  31     my $charblock  = charblock($codepoint);
  32
  33     use Unicode::UCD 'charscript';
  34     my $charscript = charblock($codepoint);
  35
  36     use Unicode::UCD 'charblocks';
  37     my $charblocks = charblocks();
  38
  39     use Unicode::UCD 'charscripts';
  40     my %charscripts = charscripts();
  41
  42     use Unicode::UCD qw(charscript charinrange);
  43     my $range = charscript($script);
  44     print "looks like $script\n" if charinrange($range, $codepoint);
  45
  46     use Unicode::UCD 'compexcl';
  47     my $compexcl = compexcl($codepoint);
  48
  49     my $unicode_version = Unicode::UCD::UnicodeVersion();
  50
  51 =head1 DESCRIPTION
  52
  53 The Unicode::UCD module offers a simple interface to the Unicode Character
  54 Database.
  55
  56 =cut
  57
  58 my $UNICODEFH;
  59 my $BLOCKSFH;
  60 my $SCRIPTSFH;
  61 my $VERSIONFH;
  62 my $COMPEXCLFH;
  63 my $CASEFOLDFH;
  64 my $CASESPECFH;
  65
  66 sub openunicode {
  67     my ($rfh, @path) = @_;
  68     my $f;
  69     unless (defined $$rfh) {
  70         for my $d (@INC) {
  71             use File::Spec;
  72             $f = File::Spec->catfile($d, "unicore", @path);
  73             last if open($$rfh, $f);
  74             undef $f;
  75         }
  76         croak __PACKAGE__, ": failed to find ",
  77               File::Spec->catfile(@path), " in @INC"
  78             unless defined $f;
  79     }
  80     return $f;
  81 }
  82
  83 =head2 charinfo
  84
  85     use Unicode::UCD 'charinfo';
  86
  87     my $charinfo = charinfo(0x41);
  88
  89 charinfo() returns a reference to a hash that has the following fields
  90 as defined by the Unicode standard:
  91
  92     key
  93
  94     code             code point with at least four hexdigits
  95     name             name of the character IN UPPER CASE
  96     category         general category of the character
  97     combining        classes used in the Canonical Ordering Algorithm
  98     bidi             bidirectional category
  99     decomposition    character decomposition mapping
 100     decimal          if decimal digit this is the integer numeric value
 101     digit            if digit this is the numeric value
 102     numeric          if numeric is the integer or rational numeric value
 103     mirrored         if mirrored in bidirectional text
 104     unicode10        Unicode 1.0 name if existed and different
 105     comment          ISO 10646 comment field
 106     upper            uppercase equivalent mapping
 107     lower            lowercase equivalent mapping
 108     title            titlecase equivalent mapping
 109
 110     block            block the character belongs to (used in \p{In...})
 111     script           script the character belongs to
 112
 113 If no match is found, a reference to an empty hash is returned.
 114
 115 The C<block> property is the same as as returned by charinfo().  It is
 116 not defined in the Unicode Character Database proper (Chapter 4 of the
 117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 118 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 119
 120 Note that you cannot do (de)composition and casing based solely on the
 121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 122 you will need also the compexcl(), casefold(), and casespec() functions.
 123
 124 =cut
 125
 126 sub _getcode {
 127     my $arg = shift;
 128
 129     if ($arg =~ /^\d+$/) {
 130         return $arg;
 131     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 132         return hex($1);
 133     }
 134
 135     return;
 136 }
 137
 138 sub han_charname {
 139     my $arg  = shift;
 140     my $code = _getcode($arg);
 141     croak __PACKAGE__, "::han_charname: unknown code '$arg'"
 142         unless defined $code;
 143     croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
 144         unless 0x3400  <= $code && $code <= 0x4DB5
 145             || 0x4E00  <= $code && $code <= 0x9FA5
 146             || 0x20000 <= $code && $code <= 0x2A6D6;
 147     sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
 148 }
 149
 150 my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
 151     "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
 152     "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
 153   );
 154
 155 my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
 156     "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
 157     "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
 158     "YU", "EU", "YI", "I",
 159   );
 160
 161 my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
 162     "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
 163     "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
 164     "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
 165   );
 166
 167 my %HangulConst = (
 168    SBase  => 0xAC00,
 169    LBase  => 0x1100,
 170    VBase  => 0x1161,
 171    TBase  => 0x11A7,
 172    LCount => 19,     # scalar @JamoL
 173    VCount => 21,     # scalar @JamoV
 174    TCount => 28,     # scalar @JamoT
 175    NCount => 588,    # VCount * TCount
 176    SCount => 11172,  # LCount * NCount
 177    Final  => 0xD7A3, # SBase -1 + SCount
 178   );
 179
 180 sub hangul_charname {
 181     my $arg  = shift;
 182     my $code = _getcode($arg);
 183     croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
 184         unless defined $code;
 185     croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
 186         unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
 187     my $SIndex = $code - $HangulConst{SBase};
 188     my $LIndex = int( $SIndex / $HangulConst{NCount});
 189     my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
 190     my $TIndex =      $SIndex % $HangulConst{TCount};
 191     return join('',
 192         "HANGUL SYLLABLE ",
 193         $JamoL[$LIndex],
 194         $JamoV[$VIndex],
 195         $JamoT[$TIndex],
 196       );
 197 }
 198
 199 sub hangul_decomp {
 200     my $arg  = shift;
 201     my $code = _getcode($arg);
 202     croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
 203         unless defined $code;
 204     croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
 205         unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
 206     my $SIndex = $code - $HangulConst{SBase};
 207     my $LIndex = int( $SIndex / $HangulConst{NCount});
 208     my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
 209     my $TIndex =      $SIndex % $HangulConst{TCount};
 210
 211     return join(" ",
 212         sprintf("%04X", $HangulConst{LBase} + $LIndex),
 213         sprintf("%04X", $HangulConst{VBase} + $VIndex),
 214       $TIndex ?
 215         sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
 216     );
 217 }
 218
 219 my @CharinfoRanges = (
 220 # block name
 221 # [ first, last, coderef to name, coderef to decompose ],
 222 # CJK Ideographs Extension A
 223   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 224 # CJK Ideographs
 225   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 226 # Hangul Syllables
 227   [ 0xAC00,   0xD7A3,   \&hangul_charname, \&hangul_decomp  ],
 228 # Non-Private Use High Surrogates
 229   [ 0xD800,   0xDB7F,   undef,   undef  ],
 230 # Private Use High Surrogates
 231   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 232 # Low Surrogates
 233   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 234 # The Private Use Area
 235   [ 0xE000,   0xF8FF,   undef,   undef  ],
 236 # CJK Ideographs Extension B
 237   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 238 # Plane 15 Private Use Area
 239   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 240 # Plane 16 Private Use Area
 241   [ 0x100000, 0x10FFFD, undef,   undef  ],
 242 );
 243
 244 sub charinfo {
 245     my $arg  = shift;
 246     my $code = _getcode($arg);
 247     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 248         unless defined $code;
 249     my $hexk = sprintf("%06X", $code);
 250     my($rcode,$rname,$rdec);
 251     foreach my $range (@CharinfoRanges){
 252       if ($range->[0] <= $code && $code <= $range->[1]) {
 253         $rcode = $hexk;
 254         $rcode =~ s/^0+//;
 255         $rcode =  sprintf("%04X", hex($rcode));
 256         $rname = $range->[2] ? $range->[2]->($code) : '';
 257         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 258         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 259         last;
 260       }
 261     }
 262     openunicode(\$UNICODEFH, "Unicode.txt");
 263     if (defined $UNICODEFH) {
 264         use Search::Dict 1.02;
 265         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 266             my $line = <$UNICODEFH>;
 267             chomp $line;
 268             my %prop;
 269             @prop{qw(
 270                      code name category
 271                      combining bidi decomposition
 272                      decimal digit numeric
 273                      mirrored unicode10 comment
 274                      upper lower title
 275                     )} = split(/;/, $line, -1);
 276             $hexk =~ s/^0+//;
 277             $hexk =  sprintf("%04X", hex($hexk));
 278             if ($prop{code} eq $hexk) {
 279                 $prop{block}  = charblock($code);
 280                 $prop{script} = charscript($code);
 281                 if(defined $rname){
 282                     $prop{code} = $rcode;
 283                     $prop{name} = $rname;
 284                     $prop{decomposition} = $rdec;
 285                 }
 286                 return \%prop;
 287             }
 288         }
 289     }
 290     return;
 291 }
 292
 293 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 294     my ($table, $lo, $hi, $code) = @_;
 295
 296     return if $lo > $hi;
 297
 298     my $mid = int(($lo+$hi) / 2);
 299
 300     if ($table->[$mid]->[0] < $code) {
 301         if ($table->[$mid]->[1] >= $code) {
 302             return $table->[$mid]->[2];
 303         } else {
 304             _search($table, $mid + 1, $hi, $code);
 305         }
 306     } elsif ($table->[$mid]->[0] > $code) {
 307         _search($table, $lo, $mid - 1, $code);
 308     } else {
 309         return $table->[$mid]->[2];
 310     }
 311 }
 312
 313 sub charinrange {
 314     my ($range, $arg) = @_;
 315     my $code = _getcode($arg);
 316     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 317         unless defined $code;
 318     _search($range, 0, $#$range, $code);
 319 }
 320
 321 =head2 charblock
 322
 323     use Unicode::UCD 'charblock';
 324
 325     my $charblock = charblock(0x41);
 326     my $charblock = charblock(1234);
 327     my $charblock = charblock("0x263a");
 328     my $charblock = charblock("U+263a");
 329
 330     my $range     = charblock('Armenian');
 331
 332 With a B<code point argument> charblock() returns the I<block> the character
 333 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 334 positions within all blocks are defined.
 335
 336 See also L</Blocks versus Scripts>.
 337
 338 If supplied with an argument that can't be a code point, charblock()
 339 tries to do the opposite and interpret the argument as a character
 340 block.  The return value is a I<range>: an anonymous list that
 341 contains anonymous lists, which in turn contain I<start-of-range>,
 342 I<end-of-range> code point pairs.  You can test whether a code point
 343 is in a range using the L</charinrange> function.  If the argument is
 344 not a known charater block, C<undef> is returned.
 345
 346 =cut
 347
 348 my @BLOCKS;
 349 my %BLOCKS;
 350
 351 sub _charblocks {
 352     unless (@BLOCKS) {
 353         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 354             while (<$BLOCKSFH>) {
 355                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 356                     my ($lo, $hi) = (hex($1), hex($2));
 357                     my $subrange = [ $lo, $hi, $3 ];
 358                     push @BLOCKS, $subrange;
 359                     push @{$BLOCKS{$3}}, $subrange;
 360                 }
 361             }
 362             close($BLOCKSFH);
 363         }
 364     }
 365 }
 366
 367 sub charblock {
 368     my $arg = shift;
 369
 370     _charblocks() unless @BLOCKS;
 371
 372     my $code = _getcode($arg);
 373
 374     if (defined $code) {
 375         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 376     } else {
 377         if (exists $BLOCKS{$arg}) {
 378             return $BLOCKS{$arg};
 379         } else {
 380             return;
 381         }
 382     }
 383 }
 384
 385 =head2 charscript
 386
 387     use Unicode::UCD 'charscript';
 388
 389     my $charscript = charscript(0x41);
 390     my $charscript = charscript(1234);
 391     my $charscript = charscript("U+263a");
 392
 393     my $range      = charscript('Thai');
 394
 395 With a B<code point argument> charscript() returns the I<script> the
 396 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 397
 398 See also L</Blocks versus Scripts>.
 399
 400 If supplied with an argument that can't be a code point, charscript()
 401 tries to do the opposite and interpret the argument as a character
 402 script.  The return value is a I<range>: an anonymous list that
 403 contains anonymous lists, which in turn contain I<start-of-range>,
 404 I<end-of-range> code point pairs.  You can test whether a code point
 405 is in a range using the L</charinrange> function.  If the argument is
 406 not a known charater script, C<undef> is returned.
 407
 408 =cut
 409
 410 my @SCRIPTS;
 411 my %SCRIPTS;
 412
 413 sub _charscripts {
 414     unless (@SCRIPTS) {
 415         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 416             while (<$SCRIPTSFH>) {
 417                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 418                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 419                     my $script = lc($3);
 420                     $script =~ s/\b(\w)/uc($1)/ge;
 421                     my $subrange = [ $lo, $hi, $script ];
 422                     push @SCRIPTS, $subrange;
 423                     push @{$SCRIPTS{$script}}, $subrange;
 424                 }
 425             }
 426             close($SCRIPTSFH);
 427             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 428         }
 429     }
 430 }
 431
 432 sub charscript {
 433     my $arg = shift;
 434
 435     _charscripts() unless @SCRIPTS;
 436
 437     my $code = _getcode($arg);
 438
 439     if (defined $code) {
 440         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 441     } else {
 442         if (exists $SCRIPTS{$arg}) {
 443             return $SCRIPTS{$arg};
 444         } else {
 445             return;
 446         }
 447     }
 448 }
 449
 450 =head2 charblocks
 451
 452     use Unicode::UCD 'charblocks';
 453
 454     my $charblocks = charblocks();
 455
 456 charblocks() returns a reference to a hash with the known block names
 457 as the keys, and the code point ranges (see L</charblock>) as the values.
 458
 459 See also L</Blocks versus Scripts>.
 460
 461 =cut
 462
 463 sub charblocks {
 464     _charblocks() unless %BLOCKS;
 465     return \%BLOCKS;
 466 }
 467
 468 =head2 charscripts
 469
 470     use Unicode::UCD 'charscripts';
 471
 472     my %charscripts = charscripts();
 473
 474 charscripts() returns a hash with the known script names as the keys,
 475 and the code point ranges (see L</charscript>) as the values.
 476
 477 See also L</Blocks versus Scripts>.
 478
 479 =cut
 480
 481 sub charscripts {
 482     _charscripts() unless %SCRIPTS;
 483     return \%SCRIPTS;
 484 }
 485
 486 =head2 Blocks versus Scripts
 487
 488 The difference between a block and a script is that scripts are closer
 489 to the linguistic notion of a set of characters required to present
 490 languages, while block is more of an artifact of the Unicode character
 491 numbering and separation into blocks of 256 characters.
 492
 493 For example the Latin B<script> is spread over several B<blocks>, such
 494 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 495 C<Latin Extended-B>.  On the other hand, the Latin script does not
 496 contain all the characters of the C<Basic Latin> block (also known as
 497 the ASCII): it includes only the letters, not for example the digits
 498 or the punctuation.
 499
 500 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 501
 502 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 503
 504 =head2 Matching Scripts and Blocks
 505
 506 Both scripts and blocks can be matched using the regular expression
 507 construct C<\p{In...}> and its negation C<\P{In...}>.
 508
 509 The name of the script or the block comes after the C<In>, for example
 510 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 511 removed from the names for the C<\p{In...}>, for example
 512 C<LatinExtendedA> instead of C<Latin Extended-A>.
 513
 514 There are a few cases where there is both a script and a block by the
 515 same name, in these cases the block version has C<Block> appended to
 516 its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
 517 the block.
 518
 519 =head2 Code Point Arguments
 520
 521 A <code point argument> is either a decimal or a hexadecimal scalar
 522 designating a Unicode character, or "U+" followed by hexadecimals
 523 designating a Unicode character.  Note that Unicode is B<not> limited
 524 to 16 bits (the number of Unicode characters is open-ended, in theory
 525 unlimited): you may have more than 4 hexdigits.
 526
 527 =head2 charinrange
 528
 529 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 530 can also test whether a code point is in the I<range> as returned by
 531 L</charblock> and L</charscript> or as the values of the hash returned
 532 by L</charblocks> and L</charscripts> by using charinrange():
 533
 534     use Unicode::UCD qw(charscript charinrange);
 535
 536     $range = charscript('Hiragana');
 537     print "looks like hiragana\n" if charinrange($range, $codepoint);
 538
 539 =cut
 540
 541 =head2 compexcl
 542
 543     use Unicode::UCD 'compexcl';
 544
 545     my $compexcl = compexcl("09dc");
 546
 547 The compexcl() returns the composition exclusion (that is, if the
 548 character should not be produced during a precomposition) of the
 549 character specified by a B<code point argument>.
 550
 551 If there is a composition exclusion for the character, true is
 552 returned.  Otherwise, false is returned.
 553
 554 =cut
 555
 556 my %COMPEXCL;
 557
 558 sub _compexcl {
 559     unless (%COMPEXCL) {
 560         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 561             while (<$COMPEXCLFH>) {
 562                 if (/^([0-9A-F]+) \# /) {
 563                     my $code = hex($1);
 564                     $COMPEXCL{$code} = undef;
 565                 }
 566             }
 567             close($COMPEXCLFH);
 568         }
 569     }
 570 }
 571
 572 sub compexcl {
 573     my $arg  = shift;
 574     my $code = _getcode($arg);
 575     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 576         unless defined $code;
 577
 578     _compexcl() unless %COMPEXCL;
 579
 580     return exists $COMPEXCL{$code};
 581 }
 582
 583 =head2 casefold
 584
 585     use Unicode::UCD 'casefold';
 586
 587     my %casefold = casefold("09dc");
 588
 589 The casefold() returns the locale-independent case folding of the
 590 character specified by a B<code point argument>.
 591
 592 If there is a case folding for that character, a reference to a hash
 593 with the following fields is returned:
 594
 595     key
 596
 597     code             code point with at least four hexdigits
 598     status           "C", "F", "S", or "I"
 599     mapping          one or more codes separated by spaces
 600
 601 The meaning of the I<status> is as follows:
 602
 603    C                 common case folding, common mappings shared
 604                      by both simple and full mappings
 605    F                 full case folding, mappings that cause strings
 606                      to grow in length. Multiple characters are separated
 607                      by spaces
 608    S                 simple case folding, mappings to single characters
 609                      where different from F
 610    I                 special case for dotted uppercase I and
 611                      dotless lowercase i
 612                      - If this mapping is included, the result is
 613                        case-insensitive, but dotless and dotted I's
 614                        are not distinguished
 615                      - If this mapping is excluded, the result is not
 616                        fully case-insensitive, but dotless and dotted
 617                        I's are distinguished
 618
 619 If there is no case folding for that character, C<undef> is returned.
 620
 621 For more information about case mappings see
 622 http://www.unicode.org/unicode/reports/tr21/
 623
 624 =cut
 625
 626 my %CASEFOLD;
 627
 628 sub _casefold {
 629     unless (%CASEFOLD) {
 630         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 631             while (<$CASEFOLDFH>) {
 632                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 633                     my $code = hex($1);
 634                     $CASEFOLD{$code} = { code    => $1,
 635                                          status  => $2,
 636                                          mapping => $3 };
 637                 }
 638             }
 639             close($CASEFOLDFH);
 640         }
 641     }
 642 }
 643
 644 sub casefold {
 645     my $arg  = shift;
 646     my $code = _getcode($arg);
 647     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 648         unless defined $code;
 649
 650     _casefold() unless %CASEFOLD;
 651
 652     return $CASEFOLD{$code};
 653 }
 654
 655 =head2 casespec
 656
 657     use Unicode::UCD 'casespec';
 658
 659     my %casespec = casespec("09dc");
 660
 661 The casespec() returns the potentially locale-dependent case mapping
 662 of the character specified by a B<code point argument>.  The mapping
 663 may change the length of the string (which the basic Unicode case
 664 mappings as returned by charinfo() never do).
 665
 666 If there is a case folding for that character, a reference to a hash
 667 with the following fields is returned:
 668
 669     key
 670
 671     code             code point with at least four hexdigits
 672     lower            lowercase
 673     title            titlecase
 674     upper            uppercase
 675     condition        condition list (may be undef)
 676
 677 The C<condition> is optional.  Where present, it consists of one or
 678 more I<locales> or I<contexts>, separated by spaces (other than as
 679 used to separate elements, spaces are to be ignored).  A condition
 680 list overrides the normal behavior if all of the listed conditions are
 681 true.  Case distinctions in the condition list are not significant.
 682 Conditions preceded by "NON_" represent the negation of the condition
 683
 684 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 685 followed by a "_" and a 2-letter ISO language code (possibly followed
 686 by a "_" and a variant code).  You can find the lists of those codes,
 687 see L<Locale::Country> and L<Locale::Language>.
 688
 689 A I<context> is one of the following choices:
 690
 691     FINAL            The letter is not followed by a letter of
 692                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 693     MODERN           The mapping is only used for modern text
 694     AFTER_i          The last base character was "i" (U+0069)
 695
 696 For more information about case mappings see
 697 http://www.unicode.org/unicode/reports/tr21/
 698
 699 =cut
 700
 701 my %CASESPEC;
 702
 703 sub _casespec {
 704     unless (%CASESPEC) {
 705         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 706             while (<$CASESPECFH>) {
 707                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 708                     my $code = hex($1);
 709                     $CASESPEC{$code} = { code      => $1,
 710                                          lower     => $2,
 711                                          title     => $3,
 712                                          upper     => $4,
 713                                          condition => $5 };
 714                 }
 715             }
 716             close($CASESPECFH);
 717         }
 718     }
 719 }
 720
 721 sub casespec {
 722     my $arg  = shift;
 723     my $code = _getcode($arg);
 724     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 725         unless defined $code;
 726
 727     _casespec() unless %CASESPEC;
 728
 729     return $CASESPEC{$code};
 730 }
 731
 732 =head2 Unicode::UCD::UnicodeVersion
 733
 734 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 735 Character Database, in other words, the version of the Unicode
 736 standard the database implements.  The version is a string
 737 of numbers delimited by dots (C<'.'>).
 738
 739 =cut
 740
 741 my $UNICODEVERSION;
 742
 743 sub UnicodeVersion {
 744     unless (defined $UNICODEVERSION) {
 745         openunicode(\$VERSIONFH, "version");
 746         chomp($UNICODEVERSION = <$VERSIONFH>);
 747         close($VERSIONFH);
 748         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 749             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 750     }
 751     return $UNICODEVERSION;
 752 }
 753
 754 =head2 Implementation Note
 755
 756 The first use of charinfo() opens a read-only filehandle to the Unicode
 757 Character Database (the database is included in the Perl distribution).
 758 The filehandle is then kept open for further queries.  In other words,
 759 if you are wondering where one of your filehandles went, that's where.
 760
 761 =head1 AUTHOR
 762
 763 Jarkko Hietaniemi
 764
 765 =cut
 766
 767 1;