lib/UnicodeCD.pm

   1 package UnicodeCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11
  12 our @EXPORT_OK = qw(charinfo
  13                     charblock charscript
  14                     charblocks charscripts
  15                     charinrange
  16                     compexcl
  17                     casefold casespec);
  18
  19 use Carp;
  20
  21 =head1 NAME
  22
  23 UnicodeCD - Unicode character database
  24
  25 =head1 SYNOPSIS
  26
  27     use UnicodeCD 'charinfo';
  28     my $charinfo   = charinfo($codepoint);
  29
  30     use UnicodeCD 'charblock';
  31     my $charblock  = charblock($codepoint);
  32
  33     use UnicodeCD 'charscript';
  34     my $charscript = charblock($codepoint);
  35
  36     use UnicodeCD 'charblocks';
  37     my $charblocks = charblocks();
  38
  39     use UnicodeCD 'charscripts';
  40     my %charscripts = charscripts();
  41
  42     use UnicodeCD qw(charscript charinrange);
  43     my $range = charscript($script);
  44     print "looks like $script\n" if charinrange($range, $codepoint);
  45
  46     use UnicodeCD 'compexcl';
  47     my $compexcl = compexcl($codepoint);
  48
  49     my $unicode_version = UnicodeCD::UnicodeVersion();
  50
  51 =head1 DESCRIPTION
  52
  53 The UnicodeCD module offers a simple interface to the Unicode Character
  54 Database.
  55
  56 =cut
  57
  58 my $UNICODEFH;
  59 my $BLOCKSFH;
  60 my $SCRIPTSFH;
  61 my $VERSIONFH;
  62 my $COMPEXCLFH;
  63 my $CASEFOLDFH;
  64 my $CASESPECFH;
  65
  66 sub openunicode {
  67     my ($rfh, @path) = @_;
  68     my $f;
  69     unless (defined $$rfh) {
  70         for my $d (@INC) {
  71             use File::Spec;
  72             $f = File::Spec->catfile($d, "unicode", @path);
  73             last if open($$rfh, $f);
  74             undef $f;
  75         }
  76         croak __PACKAGE__, ": failed to find ",
  77               File::Spec->catfile(@path), " in @INC"
  78             unless defined $f;
  79     }
  80     return $f;
  81 }
  82
  83 =head2 charinfo
  84
  85     use UnicodeCD 'charinfo';
  86
  87     my $charinfo = charinfo(0x41);
  88
  89 charinfo() returns a reference to a hash that has the following fields
  90 as defined by the Unicode standard:
  91
  92     key
  93
  94     code             code point with at least four hexdigits
  95     name             name of the character IN UPPER CASE
  96     category         general category of the character
  97     combining        classes used in the Canonical Ordering Algorithm
  98     bidi             bidirectional category
  99     decomposition    character decomposition mapping
 100     decimal          if decimal digit this is the integer numeric value
 101     digit            if digit this is the numeric value
 102     numeric          if numeric is the integer or rational numeric value
 103     mirrored         if mirrored in bidirectional text
 104     unicode10        Unicode 1.0 name if existed and different
 105     comment          ISO 10646 comment field
 106     upper            uppercase equivalent mapping
 107     lower            lowercase equivalent mapping
 108     title            titlecase equivalent mapping
 109
 110     block            block the character belongs to (used in \p{In...})
 111     script           script the character belongs to
 112
 113 If no match is found, a reference to an empty hash is returned.
 114
 115 The C<block> property is the same as as returned by charinfo().  It is
 116 not defined in the Unicode Character Database proper (Chapter 4 of the
 117 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
 118 of TUS3).  Similarly for the C<script> property.
 119
 120 Note that you cannot do (de)composition and casing based solely on the
 121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 122 you will need also the compexcl(), casefold(), and casespec() functions.
 123
 124 =cut
 125
 126 sub _getcode {
 127     my $arg = shift;
 128
 129     if ($arg =~ /^\d+$/) {
 130         return $arg;
 131     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 132         return hex($1);
 133     }
 134
 135     return;
 136 }
 137
 138 sub han_charname {
 139     my $arg  = shift;
 140     my $code = _getcode($arg);
 141     croak __PACKAGE__, "::han_charname: unknown code '$arg'"
 142         unless defined $code;
 143     croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
 144         unless 0x3400  <= $code && $code <= 0x4DB5
 145             || 0x4E00  <= $code && $code <= 0x9FA5
 146             || 0x20000 <= $code && $code <= 0x2A6D6;
 147     sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
 148 }
 149
 150 my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
 151     "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
 152     "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
 153   );
 154
 155 my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
 156     "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
 157     "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
 158     "YU", "EU", "YI", "I",
 159   );
 160
 161 my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
 162     "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
 163     "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
 164     "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
 165   );
 166
 167 my %HangulConst = (
 168    SBase  => 0xAC00,
 169    LBase  => 0x1100,
 170    VBase  => 0x1161,
 171    TBase  => 0x11A7,
 172    LCount => 19,     # scalar @JamoL
 173    VCount => 21,     # scalar @JamoV
 174    TCount => 28,     # scalar @JamoT
 175    NCount => 588,    # VCount * TCount
 176    SCount => 11172,  # LCount * NCount
 177    Final  => 0xD7A3, # SBase -1 + SCount
 178   );
 179
 180 sub hangul_charname {
 181     my $arg  = shift;
 182     my $code = _getcode($arg);
 183     croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
 184         unless defined $code;
 185     croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
 186         unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
 187     my $SIndex = $code - $HangulConst{SBase};
 188     my $LIndex = int( $SIndex / $HangulConst{NCount});
 189     my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
 190     my $TIndex =      $SIndex % $HangulConst{TCount};
 191     return join('',
 192         "HANGUL SYLLABLE ",
 193         $JamoL[$LIndex],
 194         $JamoV[$VIndex],
 195         $JamoT[$TIndex],
 196       );
 197 }
 198
 199 sub hangul_decomp {
 200     my $arg  = shift;
 201     my $code = _getcode($arg);
 202     croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
 203         unless defined $code;
 204     croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
 205         unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
 206     my $SIndex = $code - $HangulConst{SBase};
 207     my $LIndex = int( $SIndex / $HangulConst{NCount});
 208     my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
 209     my $TIndex =      $SIndex % $HangulConst{TCount};
 210
 211     return join(" ",
 212         sprintf("%04X", $HangulConst{LBase} + $LIndex),
 213         sprintf("%04X", $HangulConst{VBase} + $VIndex),
 214       $TIndex ?
 215         sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
 216     );
 217 }
 218
 219 my @CharinfoRanges = (
 220 # block name
 221 # [ first, last, coderef to name, coderef to decompose ],
 222 # CJK Ideographs Extension A
 223   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 224 # CJK Ideographs
 225   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 226 # Hangul Syllables
 227   [ 0xAC00,   0xD7A3,   \&hangul_charname, \&hangul_decomp  ],
 228 # Non-Private Use High Surrogates
 229   [ 0xD800,   0xDB7F,   undef,   undef  ],
 230 # Private Use High Surrogates
 231   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 232 # Low Surrogates
 233   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 234 # The Private Use Area
 235   [ 0xE000,   0xF8FF,   undef,   undef  ],
 236 # CJK Ideographs Extension B
 237   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 238 # Plane 15 Private Use Area
 239   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 240 # Plane 16 Private Use Area
 241   [ 0x100000, 0x10FFFD, undef,   undef  ],
 242 );
 243
 244 sub charinfo {
 245     my $arg  = shift;
 246     my $code = _getcode($arg);
 247     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 248         unless defined $code;
 249     my $hexk = sprintf("%06X", $code);
 250     my($rcode,$rname,$rdec);
 251     foreach my $range (@CharinfoRanges){
 252       if ($range->[0] <= $code && $code <= $range->[1]) {
 253         $rcode = $hexk;
 254         $rcode =~ s/^0+//;
 255         $rcode =  sprintf("%04X", hex($rcode));
 256         $rname = $range->[2] ? $range->[2]->($code) : '';
 257         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 258         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 259         last;
 260       }
 261     }
 262     openunicode(\$UNICODEFH, "Unicode.txt");
 263     if (defined $UNICODEFH) {
 264         use Search::Dict 1.02;
 265         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 266             my $line = <$UNICODEFH>;
 267             chomp $line;
 268             my %prop;
 269             @prop{qw(
 270                      code name category
 271                      combining bidi decomposition
 272                      decimal digit numeric
 273                      mirrored unicode10 comment
 274                      upper lower title
 275                     )} = split(/;/, $line, -1);
 276             $hexk =~ s/^0+//;
 277             $hexk =  sprintf("%04X", hex($hexk));
 278             if ($prop{code} eq $hexk) {
 279                 $prop{block}  = charblock($code);
 280                 $prop{script} = charscript($code);
 281                 if(defined $rname){
 282                     $prop{code} = $rcode;
 283                     $prop{name} = $rname;
 284                     $prop{decomposition} = $rdec;
 285                 }
 286                 return \%prop;
 287             }
 288         }
 289     }
 290     return;
 291 }
 292
 293 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 294     my ($table, $lo, $hi, $code) = @_;
 295
 296     return if $lo > $hi;
 297
 298     my $mid = int(($lo+$hi) / 2);
 299
 300     if ($table->[$mid]->[0] < $code) {
 301         if ($table->[$mid]->[1] >= $code) {
 302             return $table->[$mid]->[2];
 303         } else {
 304             _search($table, $mid + 1, $hi, $code);
 305         }
 306     } elsif ($table->[$mid]->[0] > $code) {
 307         _search($table, $lo, $mid - 1, $code);
 308     } else {
 309         return $table->[$mid]->[2];
 310     }
 311 }
 312
 313 sub charinrange {
 314     my ($range, $arg) = @_;
 315     my $code = _getcode($arg);
 316     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 317         unless defined $code;
 318     _search($range, 0, $#$range, $code);
 319 }
 320
 321 =head2 charblock
 322
 323     use UnicodeCD 'charblock';
 324
 325     my $charblock = charblock(0x41);
 326     my $charblock = charblock(1234);
 327     my $charblock = charblock("0x263a");
 328     my $charblock = charblock("U+263a");
 329
 330     my $ranges    = charblock('Armenian');
 331
 332 With a B<code point argument> charblock() returns the block the character
 333 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 334 positions within all blocks are defined.
 335
 336 If supplied with an argument that can't be a code point, charblock()
 337 tries to do the opposite and interpret the argument as a character
 338 block.  The return value is a I<range>: an anonymous list that
 339 contains anonymous lists, which in turn contain I<start-of-range>,
 340 I<end-of-range> code point pairs.  You can test whether a code point
 341 is in a range using the L</charinrange> function.  If the argument is
 342 not a known charater block, C<undef> is returned.
 343
 344 =cut
 345
 346 my @BLOCKS;
 347 my %BLOCKS;
 348
 349 sub _charblocks {
 350     unless (@BLOCKS) {
 351         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 352             while (<$BLOCKSFH>) {
 353                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 354                     my ($lo, $hi) = (hex($1), hex($2));
 355                     my $subrange = [ $lo, $hi, $3 ];
 356                     push @BLOCKS, $subrange;
 357                     push @{$BLOCKS{$3}}, $subrange;
 358                 }
 359             }
 360             close($BLOCKSFH);
 361         }
 362     }
 363 }
 364
 365 sub charblock {
 366     my $arg = shift;
 367
 368     _charblocks() unless @BLOCKS;
 369
 370     my $code = _getcode($arg);
 371
 372     if (defined $code) {
 373         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 374     } else {
 375         if (exists $BLOCKS{$arg}) {
 376             return $BLOCKS{$arg};
 377         } else {
 378             return;
 379         }
 380     }
 381 }
 382
 383 =head2 charscript
 384
 385     use UnicodeCD 'charscript';
 386
 387     my $charscript = charscript(0x41);
 388     my $charscript = charscript(1234);
 389     my $charscript = charscript("U+263a");
 390
 391     my $ranges     = charscript('Thai');
 392
 393 With a B<code point argument> charscript() returns the script the
 394 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 395
 396 If supplied with an argument that can't be a code point, charscript()
 397 tries to do the opposite and interpret the argument as a character
 398 script.  The return value is a I<range>: an anonymous list that
 399 contains anonymous lists, which in turn contain I<start-of-range>,
 400 I<end-of-range> code point pairs.  You can test whether a code point
 401 is in a range using the L</charinrange> function.  If the argument is
 402 not a known charater script, C<undef> is returned.
 403
 404 =cut
 405
 406 my @SCRIPTS;
 407 my %SCRIPTS;
 408
 409 sub _charscripts {
 410     unless (@SCRIPTS) {
 411         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 412             while (<$SCRIPTSFH>) {
 413                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 414                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 415                     my $script = lc($3);
 416                     $script =~ s/\b(\w)/uc($1)/ge;
 417                     my $subrange = [ $lo, $hi, $script ];
 418                     push @SCRIPTS, $subrange;
 419                     push @{$SCRIPTS{$script}}, $subrange;
 420                 }
 421             }
 422             close($SCRIPTSFH);
 423             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 424         }
 425     }
 426 }
 427
 428 sub charscript {
 429     my $arg = shift;
 430
 431     _charscripts() unless @SCRIPTS;
 432
 433     my $code = _getcode($arg);
 434
 435     if (defined $code) {
 436         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 437     } else {
 438         if (exists $SCRIPTS{$arg}) {
 439             return $SCRIPTS{$arg};
 440         } else {
 441             return;
 442         }
 443     }
 444 }
 445
 446 =head2 charblocks
 447
 448     use UnicodeCD 'charblocks';
 449
 450     my $charblocks = charblocks();
 451
 452 charblocks() returns a reference to a hash with the known block names
 453 as the keys, and the code point ranges (see L</charblock>) as the values.
 454
 455 =cut
 456
 457 sub charblocks {
 458     _charblocks() unless %BLOCKS;
 459     return \%BLOCKS;
 460 }
 461
 462 =head2 charscripts
 463
 464     use UnicodeCD 'charscripts';
 465
 466     my %charscripts = charscripts();
 467
 468 charscripts() returns a hash with the known script names as the keys,
 469 and the code point ranges (see L</charscript>) as the values.
 470
 471 =cut
 472
 473 sub charscripts {
 474     _charscripts() unless %SCRIPTS;
 475     return \%SCRIPTS;
 476 }
 477
 478 =head2 Blocks versus Scripts
 479
 480 The difference between a block and a script is that scripts are closer
 481 to the linguistic notion of a set of characters required to present
 482 languages, while block is more of an artifact of the Unicode character
 483 numbering and separation into blocks of 256 characters.
 484
 485 For example the Latin B<script> is spread over several B<blocks>, such
 486 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 487 C<Latin Extended-B>.  On the other hand, the Latin script does not
 488 contain all the characters of the C<Basic Latin> block (also known as
 489 the ASCII): it includes only the letters, not for example the digits
 490 or the punctuation.
 491
 492 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 493
 494 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 495
 496 =head2 Matching Scripts and Blocks
 497
 498 Both scripts and blocks can be matched using the regular expression
 499 construct C<\p{In...}> and its negation C<\P{In...}>.
 500
 501 The name of the script or the block comes after the C<In>, for example
 502 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 503 removed from the names for the C<\p{In...}>, for example
 504 C<LatinExtendedA> instead of C<Latin Extended-A>.
 505
 506 There are a few cases where there exists both a script and a block by
 507 the same name, in these cases the block version has C<Block> appended:
 508 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
 509
 510 =head2 Code Point Arguments
 511
 512 A <code point argument> is either a decimal or a hexadecimal scalar,
 513 or "U+" followed by hexadecimals.
 514
 515 =head2 charinrange
 516
 517 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 518 can also test whether a code point is in the I<range> as returned by
 519 L</charblock> and L</charscript> or as the values of the hash returned
 520 by L</charblocks> and L</charscripts> by using charinrange():
 521
 522     use UnicodeCD qw(charscript charinrange);
 523
 524     $range = charscript('Hiragana');
 525     print "looks like hiragana\n" if charinrange($range, $codepoint);
 526
 527 =cut
 528
 529 =head2 compexcl
 530
 531     use UnicodeCD 'compexcl';
 532
 533     my $compexcl = compexcl("09dc");
 534
 535 The compexcl() returns the composition exclusion (that is, if the
 536 character should not be produced during a precomposition) of the
 537 character specified by a B<code point argument>.
 538
 539 If there is a composition exclusion for the character, true is
 540 returned.  Otherwise, false is returned.
 541
 542 =cut
 543
 544 my %COMPEXCL;
 545
 546 sub _compexcl {
 547     unless (%COMPEXCL) {
 548         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 549             while (<$COMPEXCLFH>) {
 550                 if (/^([0-9A-F]+) \# /) {
 551                     my $code = hex($1);
 552                     $COMPEXCL{$code} = undef;
 553                 }
 554             }
 555             close($COMPEXCLFH);
 556         }
 557     }
 558 }
 559
 560 sub compexcl {
 561     my $arg  = shift;
 562     my $code = _getcode($arg);
 563     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 564         unless defined $code;
 565
 566     _compexcl() unless %COMPEXCL;
 567
 568     return exists $COMPEXCL{$code};
 569 }
 570
 571 =head2 casefold
 572
 573     use UnicodeCD 'casefold';
 574
 575     my %casefold = casefold("09dc");
 576
 577 The casefold() returns the locale-independent case folding of the
 578 character specified by a B<code point argument>.
 579
 580 If there is a case folding for that character, a reference to a hash
 581 with the following fields is returned:
 582
 583     key
 584
 585     code             code point with at least four hexdigits
 586     status           "C", "F", "S", or "I"
 587     mapping          one or more codes separated by spaces
 588
 589 The meaning of the I<status> is as follows:
 590
 591    C                 common case folding, common mappings shared
 592                      by both simple and full mappings
 593    F                 full case folding, mappings that cause strings
 594                      to grow in length. Multiple characters are separated
 595                      by spaces
 596    S                 simple case folding, mappings to single characters
 597                      where different from F
 598    I                 special case for dotted uppercase I and
 599                      dotless lowercase i
 600                      - If this mapping is included, the result is
 601                        case-insensitive, but dotless and dotted I's
 602                        are not distinguished
 603                      - If this mapping is excluded, the result is not
 604                        fully case-insensitive, but dotless and dotted
 605                        I's are distinguished
 606
 607 If there is no case folding for that character, C<undef> is returned.
 608
 609 For more information about case mappings see
 610 http://www.unicode.org/unicode/reports/tr21/
 611
 612 =cut
 613
 614 my %CASEFOLD;
 615
 616 sub _casefold {
 617     unless (%CASEFOLD) {
 618         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 619             while (<$CASEFOLDFH>) {
 620                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 621                     my $code = hex($1);
 622                     $CASEFOLD{$code} = { code    => $1,
 623                                          status  => $2,
 624                                          mapping => $3 };
 625                 }
 626             }
 627             close($CASEFOLDFH);
 628         }
 629     }
 630 }
 631
 632 sub casefold {
 633     my $arg  = shift;
 634     my $code = _getcode($arg);
 635     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 636         unless defined $code;
 637
 638     _casefold() unless %CASEFOLD;
 639
 640     return $CASEFOLD{$code};
 641 }
 642
 643 =head2 casespec
 644
 645     use UnicodeCD 'casespec';
 646
 647     my %casespec = casespec("09dc");
 648
 649 The casespec() returns the potentially locale-dependent case mapping
 650 of the character specified by a B<code point argument>.  The mapping
 651 may change the length of the string (which the basic Unicode case
 652 mappings as returned by charinfo() never do).
 653
 654 If there is a case folding for that character, a reference to a hash
 655 with the following fields is returned:
 656
 657     key
 658
 659     code             code point with at least four hexdigits
 660     lower            lowercase
 661     title            titlecase
 662     upper            uppercase
 663     condition        condition list (may be undef)
 664
 665 The C<condition> is optional.  Where present, it consists of one or
 666 more I<locales> or I<contexts>, separated by spaces (other than as
 667 used to separate elements, spaces are to be ignored).  A condition
 668 list overrides the normal behavior if all of the listed conditions are
 669 true.  Case distinctions in the condition list are not significant.
 670 Conditions preceded by "NON_" represent the negation of the condition
 671
 672 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 673 followed by a "_" and a 2-letter ISO language code (possibly followed
 674 by a "_" and a variant code).  You can find the lists of those codes,
 675 see L<Locale::Country> and L<Locale::Language>.
 676
 677 A I<context> is one of the following choices:
 678
 679     FINAL            The letter is not followed by a letter of
 680                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 681     MODERN           The mapping is only used for modern text
 682     AFTER_i          The last base character was "i" (U+0069)
 683
 684 For more information about case mappings see
 685 http://www.unicode.org/unicode/reports/tr21/
 686
 687 =cut
 688
 689 my %CASESPEC;
 690
 691 sub _casespec {
 692     unless (%CASESPEC) {
 693         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 694             while (<$CASESPECFH>) {
 695                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 696                     my $code = hex($1);
 697                     $CASESPEC{$code} = { code      => $1,
 698                                          lower     => $2,
 699                                          title     => $3,
 700                                          upper     => $4,
 701                                          condition => $5 };
 702                 }
 703             }
 704             close($CASESPECFH);
 705         }
 706     }
 707 }
 708
 709 sub casespec {
 710     my $arg  = shift;
 711     my $code = _getcode($arg);
 712     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 713         unless defined $code;
 714
 715     _casespec() unless %CASESPEC;
 716
 717     return $CASESPEC{$code};
 718 }
 719
 720 =head2 UnicodeCD::UnicodeVersion
 721
 722 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
 723 Database, in other words, the version of the Unicode standard the
 724 database implements.
 725
 726 =cut
 727
 728 my $UNICODEVERSION;
 729
 730 sub UnicodeVersion {
 731     unless (defined $UNICODEVERSION) {
 732         openunicode(\$VERSIONFH, "version");
 733         chomp($UNICODEVERSION = <$VERSIONFH>);
 734         close($VERSIONFH);
 735         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 736             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 737     }
 738     return $UNICODEVERSION;
 739 }
 740
 741 =head2 Implementation Note
 742
 743 The first use of charinfo() opens a read-only filehandle to the Unicode
 744 Character Database (the database is included in the Perl distribution).
 745 The filehandle is then kept open for further queries.
 746
 747 =head1 AUTHOR
 748
 749 Jarkko Hietaniemi
 750
 751 =cut
 752
 753 1;