lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.25';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     general_categories bidi_types
  19                     compexcl
  20                     casefold casespec
  21                     namedseq);
  22
  23 use Carp;
  24
  25 =head1 NAME
  26
  27 Unicode::UCD - Unicode character database
  28
  29 =head1 SYNOPSIS
  30
  31     use Unicode::UCD 'charinfo';
  32     my $charinfo   = charinfo($codepoint);
  33
  34     use Unicode::UCD 'charblock';
  35     my $charblock  = charblock($codepoint);
  36
  37     use Unicode::UCD 'charscript';
  38     my $charscript = charscript($codepoint);
  39
  40     use Unicode::UCD 'charblocks';
  41     my $charblocks = charblocks();
  42
  43     use Unicode::UCD 'charscripts';
  44     my $charscripts = charscripts();
  45
  46     use Unicode::UCD qw(charscript charinrange);
  47     my $range = charscript($script);
  48     print "looks like $script\n" if charinrange($range, $codepoint);
  49
  50     use Unicode::UCD qw(general_categories bidi_types);
  51     my $categories = general_categories();
  52     my $types = bidi_types();
  53
  54     use Unicode::UCD 'compexcl';
  55     my $compexcl = compexcl($codepoint);
  56
  57     use Unicode::UCD 'namedseq';
  58     my $namedseq = namedseq($named_sequence_name);
  59
  60     my $unicode_version = Unicode::UCD::UnicodeVersion();
  61
  62 =head1 DESCRIPTION
  63
  64 The Unicode::UCD module offers a simple interface to the Unicode
  65 Character Database.
  66
  67 =cut
  68
  69 my $UNICODEFH;
  70 my $BLOCKSFH;
  71 my $SCRIPTSFH;
  72 my $VERSIONFH;
  73 my $COMPEXCLFH;
  74 my $CASEFOLDFH;
  75 my $CASESPECFH;
  76 my $NAMEDSEQFH;
  77
  78 sub openunicode {
  79     my ($rfh, @path) = @_;
  80     my $f;
  81     unless (defined $$rfh) {
  82         for my $d (@INC) {
  83             use File::Spec;
  84             $f = File::Spec->catfile($d, "unicore", @path);
  85             last if open($$rfh, $f);
  86             undef $f;
  87         }
  88         croak __PACKAGE__, ": failed to find ",
  89               File::Spec->catfile(@path), " in @INC"
  90             unless defined $f;
  91     }
  92     return $f;
  93 }
  94
  95 =head2 charinfo
  96
  97     use Unicode::UCD 'charinfo';
  98
  99     my $charinfo = charinfo(0x41);
 100
 101 charinfo() returns a reference to a hash that has the following fields
 102 as defined by the Unicode standard:
 103
 104     key
 105
 106     code             code point with at least four hexdigits
 107     name             name of the character IN UPPER CASE
 108     category         general category of the character
 109     combining        classes used in the Canonical Ordering Algorithm
 110     bidi             bidirectional type
 111     decomposition    character decomposition mapping
 112     decimal          if decimal digit this is the integer numeric value
 113     digit            if digit this is the numeric value
 114     numeric          if numeric is the integer or rational numeric value
 115     mirrored         if mirrored in bidirectional text
 116     unicode10        Unicode 1.0 name if existed and different
 117     comment          ISO 10646 comment field
 118     upper            uppercase equivalent mapping
 119     lower            lowercase equivalent mapping
 120     title            titlecase equivalent mapping
 121
 122     block            block the character belongs to (used in \p{In...})
 123     script           script the character belongs to
 124
 125 If no match is found, a reference to an empty hash is returned.
 126
 127 The C<block> property is the same as returned by charinfo().  It is
 128 not defined in the Unicode Character Database proper (Chapter 4 of the
 129 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 130 (Chapter 14 of TUS3).  Similarly for the C<script> property.
 131
 132 Note that you cannot do (de)composition and casing based solely on the
 133 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 134 you will need also the compexcl(), casefold(), and casespec() functions.
 135
 136 =cut
 137
 138 # NB: This function is duplicated in charnames.pm
 139 sub _getcode {
 140     my $arg = shift;
 141
 142     if ($arg =~ /^[1-9]\d*$/) {
 143         return $arg;
 144     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 145         return hex($1);
 146     }
 147
 148     return;
 149 }
 150
 151 # Lingua::KO::Hangul::Util not part of the standard distribution
 152 # but it will be used if available.
 153
 154 eval { require Lingua::KO::Hangul::Util };
 155 my $hasHangulUtil = ! $@;
 156 if ($hasHangulUtil) {
 157     Lingua::KO::Hangul::Util->import();
 158 }
 159
 160 sub hangul_decomp { # internal: called from charinfo
 161     if ($hasHangulUtil) {
 162         my @tmp = decomposeHangul(shift);
 163         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 164         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 165     }
 166     return;
 167 }
 168
 169 sub hangul_charname { # internal: called from charinfo
 170     return sprintf("HANGUL SYLLABLE-%04X", shift);
 171 }
 172
 173 sub han_charname { # internal: called from charinfo
 174     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 175 }
 176
 177 my @CharinfoRanges = (
 178 # block name
 179 # [ first, last, coderef to name, coderef to decompose ],
 180 # CJK Ideographs Extension A
 181   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 182 # CJK Ideographs
 183   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 184 # Hangul Syllables
 185   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 186 # Non-Private Use High Surrogates
 187   [ 0xD800,   0xDB7F,   undef,   undef  ],
 188 # Private Use High Surrogates
 189   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 190 # Low Surrogates
 191   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 192 # The Private Use Area
 193   [ 0xE000,   0xF8FF,   undef,   undef  ],
 194 # CJK Ideographs Extension B
 195   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 196 # Plane 15 Private Use Area
 197   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 198 # Plane 16 Private Use Area
 199   [ 0x100000, 0x10FFFD, undef,   undef  ],
 200 );
 201
 202 sub charinfo {
 203     my $arg  = shift;
 204     my $code = _getcode($arg);
 205     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 206         unless defined $code;
 207     my $hexk = sprintf("%06X", $code);
 208     my($rcode,$rname,$rdec);
 209     foreach my $range (@CharinfoRanges){
 210       if ($range->[0] <= $code && $code <= $range->[1]) {
 211         $rcode = $hexk;
 212         $rcode =~ s/^0+//;
 213         $rcode =  sprintf("%04X", hex($rcode));
 214         $rname = $range->[2] ? $range->[2]->($code) : '';
 215         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 216         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 217         last;
 218       }
 219     }
 220     openunicode(\$UNICODEFH, "UnicodeData.txt");
 221     if (defined $UNICODEFH) {
 222         use Search::Dict 1.02;
 223         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 224             my $line = <$UNICODEFH>;
 225             return unless defined $line;
 226             chomp $line;
 227             my %prop;
 228             @prop{qw(
 229                      code name category
 230                      combining bidi decomposition
 231                      decimal digit numeric
 232                      mirrored unicode10 comment
 233                      upper lower title
 234                     )} = split(/;/, $line, -1);
 235             $hexk =~ s/^0+//;
 236             $hexk =  sprintf("%04X", hex($hexk));
 237             if ($prop{code} eq $hexk) {
 238                 $prop{block}  = charblock($code);
 239                 $prop{script} = charscript($code);
 240                 if(defined $rname){
 241                     $prop{code} = $rcode;
 242                     $prop{name} = $rname;
 243                     $prop{decomposition} = $rdec;
 244                 }
 245                 return \%prop;
 246             }
 247         }
 248     }
 249     return;
 250 }
 251
 252 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 253     my ($table, $lo, $hi, $code) = @_;
 254
 255     return if $lo > $hi;
 256
 257     my $mid = int(($lo+$hi) / 2);
 258
 259     if ($table->[$mid]->[0] < $code) {
 260         if ($table->[$mid]->[1] >= $code) {
 261             return $table->[$mid]->[2];
 262         } else {
 263             _search($table, $mid + 1, $hi, $code);
 264         }
 265     } elsif ($table->[$mid]->[0] > $code) {
 266         _search($table, $lo, $mid - 1, $code);
 267     } else {
 268         return $table->[$mid]->[2];
 269     }
 270 }
 271
 272 sub charinrange {
 273     my ($range, $arg) = @_;
 274     my $code = _getcode($arg);
 275     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 276         unless defined $code;
 277     _search($range, 0, $#$range, $code);
 278 }
 279
 280 =head2 charblock
 281
 282     use Unicode::UCD 'charblock';
 283
 284     my $charblock = charblock(0x41);
 285     my $charblock = charblock(1234);
 286     my $charblock = charblock("0x263a");
 287     my $charblock = charblock("U+263a");
 288
 289     my $range     = charblock('Armenian');
 290
 291 With a B<code point argument> charblock() returns the I<block> the character
 292 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 293 positions within all blocks are defined.
 294
 295 See also L</Blocks versus Scripts>.
 296
 297 If supplied with an argument that can't be a code point, charblock() tries
 298 to do the opposite and interpret the argument as a character block. The
 299 return value is a I<range>: an anonymous list of lists that contain
 300 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 301 a code point is in a range using the L</charinrange> function. If the
 302 argument is not a known character block, C<undef> is returned.
 303
 304 =cut
 305
 306 my @BLOCKS;
 307 my %BLOCKS;
 308
 309 sub _charblocks {
 310     unless (@BLOCKS) {
 311         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 312             local $_;
 313             while (<$BLOCKSFH>) {
 314                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 315                     my ($lo, $hi) = (hex($1), hex($2));
 316                     my $subrange = [ $lo, $hi, $3 ];
 317                     push @BLOCKS, $subrange;
 318                     push @{$BLOCKS{$3}}, $subrange;
 319                 }
 320             }
 321             close($BLOCKSFH);
 322         }
 323     }
 324 }
 325
 326 sub charblock {
 327     my $arg = shift;
 328
 329     _charblocks() unless @BLOCKS;
 330
 331     my $code = _getcode($arg);
 332
 333     if (defined $code) {
 334         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 335     } else {
 336         if (exists $BLOCKS{$arg}) {
 337             return dclone $BLOCKS{$arg};
 338         } else {
 339             return;
 340         }
 341     }
 342 }
 343
 344 =head2 charscript
 345
 346     use Unicode::UCD 'charscript';
 347
 348     my $charscript = charscript(0x41);
 349     my $charscript = charscript(1234);
 350     my $charscript = charscript("U+263a");
 351
 352     my $range      = charscript('Thai');
 353
 354 With a B<code point argument> charscript() returns the I<script> the
 355 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 356
 357 See also L</Blocks versus Scripts>.
 358
 359 If supplied with an argument that can't be a code point, charscript() tries
 360 to do the opposite and interpret the argument as a character script. The
 361 return value is a I<range>: an anonymous list of lists that contain
 362 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 363 code point is in a range using the L</charinrange> function. If the
 364 argument is not a known character script, C<undef> is returned.
 365
 366 =cut
 367
 368 my @SCRIPTS;
 369 my %SCRIPTS;
 370
 371 sub _charscripts {
 372     unless (@SCRIPTS) {
 373         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 374             local $_;
 375             while (<$SCRIPTSFH>) {
 376                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 377                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 378                     my $script = lc($3);
 379                     $script =~ s/\b(\w)/uc($1)/ge;
 380                     my $subrange = [ $lo, $hi, $script ];
 381                     push @SCRIPTS, $subrange;
 382                     push @{$SCRIPTS{$script}}, $subrange;
 383                 }
 384             }
 385             close($SCRIPTSFH);
 386             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 387         }
 388     }
 389 }
 390
 391 sub charscript {
 392     my $arg = shift;
 393
 394     _charscripts() unless @SCRIPTS;
 395
 396     my $code = _getcode($arg);
 397
 398     if (defined $code) {
 399         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 400     } else {
 401         if (exists $SCRIPTS{$arg}) {
 402             return dclone $SCRIPTS{$arg};
 403         } else {
 404             return;
 405         }
 406     }
 407 }
 408
 409 =head2 charblocks
 410
 411     use Unicode::UCD 'charblocks';
 412
 413     my $charblocks = charblocks();
 414
 415 charblocks() returns a reference to a hash with the known block names
 416 as the keys, and the code point ranges (see L</charblock>) as the values.
 417
 418 See also L</Blocks versus Scripts>.
 419
 420 =cut
 421
 422 sub charblocks {
 423     _charblocks() unless %BLOCKS;
 424     return dclone \%BLOCKS;
 425 }
 426
 427 =head2 charscripts
 428
 429     use Unicode::UCD 'charscripts';
 430
 431     my $charscripts = charscripts();
 432
 433 charscripts() returns a reference to a hash with the known script
 434 names as the keys, and the code point ranges (see L</charscript>) as
 435 the values.
 436
 437 See also L</Blocks versus Scripts>.
 438
 439 =cut
 440
 441 sub charscripts {
 442     _charscripts() unless %SCRIPTS;
 443     return dclone \%SCRIPTS;
 444 }
 445
 446 =head2 Blocks versus Scripts
 447
 448 The difference between a block and a script is that scripts are closer
 449 to the linguistic notion of a set of characters required to present
 450 languages, while block is more of an artifact of the Unicode character
 451 numbering and separation into blocks of (mostly) 256 characters.
 452
 453 For example the Latin B<script> is spread over several B<blocks>, such
 454 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 455 C<Latin Extended-B>.  On the other hand, the Latin script does not
 456 contain all the characters of the C<Basic Latin> block (also known as
 457 the ASCII): it includes only the letters, and not, for example, the digits
 458 or the punctuation.
 459
 460 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 461
 462 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 463
 464 =head2 Matching Scripts and Blocks
 465
 466 Scripts are matched with the regular-expression construct
 467 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
 468 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
 469 any of the 256 code points in the Tibetan block).
 470
 471 =head2 Code Point Arguments
 472
 473 A I<code point argument> is either a decimal or a hexadecimal scalar
 474 designating a Unicode character, or C<U+> followed by hexadecimals
 475 designating a Unicode character.  In other words, if you want a code
 476 point to be interpreted as a hexadecimal number, you must prefix it
 477 with either C<0x> or C<U+>, because a string like e.g. C<123> will
 478 be interpreted as a decimal code point.  Also note that Unicode is
 479 B<not> limited to 16 bits (the number of Unicode characters is
 480 open-ended, in theory unlimited): you may have more than 4 hexdigits.
 481
 482 =head2 charinrange
 483
 484 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 485 can also test whether a code point is in the I<range> as returned by
 486 L</charblock> and L</charscript> or as the values of the hash returned
 487 by L</charblocks> and L</charscripts> by using charinrange():
 488
 489     use Unicode::UCD qw(charscript charinrange);
 490
 491     $range = charscript('Hiragana');
 492     print "looks like hiragana\n" if charinrange($range, $codepoint);
 493
 494 =cut
 495
 496 my %GENERAL_CATEGORIES =
 497  (
 498     'L'  =>         'Letter',
 499     'LC' =>         'CasedLetter',
 500     'Lu' =>         'UppercaseLetter',
 501     'Ll' =>         'LowercaseLetter',
 502     'Lt' =>         'TitlecaseLetter',
 503     'Lm' =>         'ModifierLetter',
 504     'Lo' =>         'OtherLetter',
 505     'M'  =>         'Mark',
 506     'Mn' =>         'NonspacingMark',
 507     'Mc' =>         'SpacingMark',
 508     'Me' =>         'EnclosingMark',
 509     'N'  =>         'Number',
 510     'Nd' =>         'DecimalNumber',
 511     'Nl' =>         'LetterNumber',
 512     'No' =>         'OtherNumber',
 513     'P'  =>         'Punctuation',
 514     'Pc' =>         'ConnectorPunctuation',
 515     'Pd' =>         'DashPunctuation',
 516     'Ps' =>         'OpenPunctuation',
 517     'Pe' =>         'ClosePunctuation',
 518     'Pi' =>         'InitialPunctuation',
 519     'Pf' =>         'FinalPunctuation',
 520     'Po' =>         'OtherPunctuation',
 521     'S'  =>         'Symbol',
 522     'Sm' =>         'MathSymbol',
 523     'Sc' =>         'CurrencySymbol',
 524     'Sk' =>         'ModifierSymbol',
 525     'So' =>         'OtherSymbol',
 526     'Z'  =>         'Separator',
 527     'Zs' =>         'SpaceSeparator',
 528     'Zl' =>         'LineSeparator',
 529     'Zp' =>         'ParagraphSeparator',
 530     'C'  =>         'Other',
 531     'Cc' =>         'Control',
 532     'Cf' =>         'Format',
 533     'Cs' =>         'Surrogate',
 534     'Co' =>         'PrivateUse',
 535     'Cn' =>         'Unassigned',
 536  );
 537
 538 sub general_categories {
 539     return dclone \%GENERAL_CATEGORIES;
 540 }
 541
 542 =head2 general_categories
 543
 544     use Unicode::UCD 'general_categories';
 545
 546     my $categories = general_categories();
 547
 548 The general_categories() returns a reference to a hash which has short
 549 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 550 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 551 C<Symbol>) as values.  The hash is reversible in case you need to go
 552 from the long names to the short names.  The general category is the
 553 one returned from charinfo() under the C<category> key.
 554
 555 =cut
 556
 557 my %BIDI_TYPES =
 558  (
 559    'L'   => 'Left-to-Right',
 560    'LRE' => 'Left-to-Right Embedding',
 561    'LRO' => 'Left-to-Right Override',
 562    'R'   => 'Right-to-Left',
 563    'AL'  => 'Right-to-Left Arabic',
 564    'RLE' => 'Right-to-Left Embedding',
 565    'RLO' => 'Right-to-Left Override',
 566    'PDF' => 'Pop Directional Format',
 567    'EN'  => 'European Number',
 568    'ES'  => 'European Number Separator',
 569    'ET'  => 'European Number Terminator',
 570    'AN'  => 'Arabic Number',
 571    'CS'  => 'Common Number Separator',
 572    'NSM' => 'Non-Spacing Mark',
 573    'BN'  => 'Boundary Neutral',
 574    'B'   => 'Paragraph Separator',
 575    'S'   => 'Segment Separator',
 576    'WS'  => 'Whitespace',
 577    'ON'  => 'Other Neutrals',
 578  );
 579
 580 sub bidi_types {
 581     return dclone \%BIDI_TYPES;
 582 }
 583
 584 =head2 bidi_types
 585
 586     use Unicode::UCD 'bidi_types';
 587
 588     my $categories = bidi_types();
 589
 590 The bidi_types() returns a reference to a hash which has the short
 591 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 592 names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 593 hash is reversible in case you need to go from the long names to the
 594 short names.  The bidi type is the one returned from charinfo()
 595 under the C<bidi> key.  For the exact meaning of the various bidi classes
 596 the Unicode TR9 is recommended reading:
 597 http://www.unicode.org/reports/tr9/tr9-17.html
 598 (as of Unicode 5.0.0)
 599
 600 =cut
 601
 602 =head2 compexcl
 603
 604     use Unicode::UCD 'compexcl';
 605
 606     my $compexcl = compexcl("09dc");
 607
 608 The compexcl() returns the composition exclusion (that is, if the
 609 character should not be produced during a precomposition) of the
 610 character specified by a B<code point argument>.
 611
 612 If there is a composition exclusion for the character, true is
 613 returned.  Otherwise, false is returned.
 614
 615 =cut
 616
 617 my %COMPEXCL;
 618
 619 sub _compexcl {
 620     unless (%COMPEXCL) {
 621         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 622             local $_;
 623             while (<$COMPEXCLFH>) {
 624                 if (/^([0-9A-F]+)\s+\#\s+/) {
 625                     my $code = hex($1);
 626                     $COMPEXCL{$code} = undef;
 627                 }
 628             }
 629             close($COMPEXCLFH);
 630         }
 631     }
 632 }
 633
 634 sub compexcl {
 635     my $arg  = shift;
 636     my $code = _getcode($arg);
 637     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 638         unless defined $code;
 639
 640     _compexcl() unless %COMPEXCL;
 641
 642     return exists $COMPEXCL{$code};
 643 }
 644
 645 =head2 casefold
 646
 647     use Unicode::UCD 'casefold';
 648
 649     my $casefold = casefold("00DF");
 650
 651 The casefold() returns the locale-independent case folding of the
 652 character specified by a B<code point argument>.
 653
 654 If there is a case folding for that character, a reference to a hash
 655 with the following fields is returned:
 656
 657     key
 658
 659     code             code point with at least four hexdigits
 660     status           "C", "F", "S", or "I"
 661     mapping          one or more codes separated by spaces
 662
 663 The meaning of the I<status> is as follows:
 664
 665    C                 common case folding, common mappings shared
 666                      by both simple and full mappings
 667    F                 full case folding, mappings that cause strings
 668                      to grow in length. Multiple characters are separated
 669                      by spaces
 670    S                 simple case folding, mappings to single characters
 671                      where different from F
 672    I                 special case for dotted uppercase I and
 673                      dotless lowercase i
 674                      - If this mapping is included, the result is
 675                        case-insensitive, but dotless and dotted I's
 676                        are not distinguished
 677                      - If this mapping is excluded, the result is not
 678                        fully case-insensitive, but dotless and dotted
 679                        I's are distinguished
 680
 681 If there is no case folding for that character, C<undef> is returned.
 682
 683 For more information about case mappings see
 684 http://www.unicode.org/unicode/reports/tr21/
 685
 686 =cut
 687
 688 my %CASEFOLD;
 689
 690 sub _casefold {
 691     unless (%CASEFOLD) {
 692         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 693             local $_;
 694             while (<$CASEFOLDFH>) {
 695                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 696                     my $code = hex($1);
 697                     $CASEFOLD{$code} = { code    => $1,
 698                                          status  => $2,
 699                                          mapping => $3 };
 700                 }
 701             }
 702             close($CASEFOLDFH);
 703         }
 704     }
 705 }
 706
 707 sub casefold {
 708     my $arg  = shift;
 709     my $code = _getcode($arg);
 710     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 711         unless defined $code;
 712
 713     _casefold() unless %CASEFOLD;
 714
 715     return $CASEFOLD{$code};
 716 }
 717
 718 =head2 casespec
 719
 720     use Unicode::UCD 'casespec';
 721
 722     my $casespec = casespec("FB00");
 723
 724 The casespec() returns the potentially locale-dependent case mapping
 725 of the character specified by a B<code point argument>.  The mapping
 726 may change the length of the string (which the basic Unicode case
 727 mappings as returned by charinfo() never do).
 728
 729 If there is a case folding for that character, a reference to a hash
 730 with the following fields is returned:
 731
 732     key
 733
 734     code             code point with at least four hexdigits
 735     lower            lowercase
 736     title            titlecase
 737     upper            uppercase
 738     condition        condition list (may be undef)
 739
 740 The C<condition> is optional.  Where present, it consists of one or
 741 more I<locales> or I<contexts>, separated by spaces (other than as
 742 used to separate elements, spaces are to be ignored).  A condition
 743 list overrides the normal behavior if all of the listed conditions are
 744 true.  Case distinctions in the condition list are not significant.
 745 Conditions preceded by "NON_" represent the negation of the condition.
 746
 747 Note that when there are multiple case folding definitions for a
 748 single code point because of different locales, the value returned by
 749 casespec() is a hash reference which has the locales as the keys and
 750 hash references as described above as the values.
 751
 752 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 753 followed by a "_" and a 2-letter ISO language code (possibly followed
 754 by a "_" and a variant code).  You can find the lists of those codes,
 755 see L<Locale::Country> and L<Locale::Language>.
 756
 757 A I<context> is one of the following choices:
 758
 759     FINAL            The letter is not followed by a letter of
 760                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 761     MODERN           The mapping is only used for modern text
 762     AFTER_i          The last base character was "i" (U+0069)
 763
 764 For more information about case mappings see
 765 http://www.unicode.org/unicode/reports/tr21/
 766
 767 =cut
 768
 769 my %CASESPEC;
 770
 771 sub _casespec {
 772     unless (%CASESPEC) {
 773         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
 774             local $_;
 775             while (<$CASESPECFH>) {
 776                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 777                     my ($hexcode, $lower, $title, $upper, $condition) =
 778                         ($1, $2, $3, $4, $5);
 779                     my $code = hex($hexcode);
 780                     if (exists $CASESPEC{$code}) {
 781                         if (exists $CASESPEC{$code}->{code}) {
 782                             my ($oldlower,
 783                                 $oldtitle,
 784                                 $oldupper,
 785                                 $oldcondition) =
 786                                     @{$CASESPEC{$code}}{qw(lower
 787                                                            title
 788                                                            upper
 789                                                            condition)};
 790                             if (defined $oldcondition) {
 791                                 my ($oldlocale) =
 792                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 793                                 delete $CASESPEC{$code};
 794                                 $CASESPEC{$code}->{$oldlocale} =
 795                                 { code      => $hexcode,
 796                                   lower     => $oldlower,
 797                                   title     => $oldtitle,
 798                                   upper     => $oldupper,
 799                                   condition => $oldcondition };
 800                             }
 801                         }
 802                         my ($locale) =
 803                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 804                         $CASESPEC{$code}->{$locale} =
 805                         { code      => $hexcode,
 806                           lower     => $lower,
 807                           title     => $title,
 808                           upper     => $upper,
 809                           condition => $condition };
 810                     } else {
 811                         $CASESPEC{$code} =
 812                         { code      => $hexcode,
 813                           lower     => $lower,
 814                           title     => $title,
 815                           upper     => $upper,
 816                           condition => $condition };
 817                     }
 818                 }
 819             }
 820             close($CASESPECFH);
 821         }
 822     }
 823 }
 824
 825 sub casespec {
 826     my $arg  = shift;
 827     my $code = _getcode($arg);
 828     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 829         unless defined $code;
 830
 831     _casespec() unless %CASESPEC;
 832
 833     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
 834 }
 835
 836 =head2 namedseq()
 837
 838     use Unicode::UCD 'namedseq';
 839
 840     my $namedseq = namedseq("KATAKANA LETTER AINU P");
 841     my @namedseq = namedseq("KATAKANA LETTER AINU P");
 842     my %namedseq = namedseq();
 843
 844 If used with a single argument in a scalar context, returns the string
 845 consisting of the code points of the named sequence, or C<undef> if no
 846 named sequence by that name exists.  If used with a single argument in
 847 a list context, returns list of the code points.  If used with no
 848 arguments in a list context, returns a hash with the names of the
 849 named sequences as the keys and the named sequences as strings as
 850 the values.  Otherwise, returns C<undef> or empty list depending
 851 on the context.
 852
 853 (New from Unicode 4.1.0)
 854
 855 =cut
 856
 857 my %NAMEDSEQ;
 858
 859 sub _namedseq {
 860     unless (%NAMEDSEQ) {
 861         if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
 862             local $_;
 863             while (<$NAMEDSEQFH>) {
 864                 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
 865                     my ($n, $s) = ($1, $2);
 866                     my @s = map { chr(hex($_)) } split(' ', $s);
 867                     $NAMEDSEQ{$n} = join("", @s);
 868                 }
 869             }
 870             close($NAMEDSEQFH);
 871         }
 872     }
 873 }
 874
 875 sub namedseq {
 876     _namedseq() unless %NAMEDSEQ;
 877     my $wantarray = wantarray();
 878     if (defined $wantarray) {
 879         if ($wantarray) {
 880             if (@_ == 0) {
 881                 return %NAMEDSEQ;
 882             } elsif (@_ == 1) {
 883                 my $s = $NAMEDSEQ{ $_[0] };
 884                 return defined $s ? map { ord($_) } split('', $s) : ();
 885             }
 886         } elsif (@_ == 1) {
 887             return $NAMEDSEQ{ $_[0] };
 888         }
 889     }
 890     return;
 891 }
 892
 893 =head2 Unicode::UCD::UnicodeVersion
 894
 895 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 896 Character Database, in other words, the version of the Unicode
 897 standard the database implements.  The version is a string
 898 of numbers delimited by dots (C<'.'>).
 899
 900 =cut
 901
 902 my $UNICODEVERSION;
 903
 904 sub UnicodeVersion {
 905     unless (defined $UNICODEVERSION) {
 906         openunicode(\$VERSIONFH, "version");
 907         chomp($UNICODEVERSION = <$VERSIONFH>);
 908         close($VERSIONFH);
 909         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 910             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 911     }
 912     return $UNICODEVERSION;
 913 }
 914
 915 =head2 Implementation Note
 916
 917 The first use of charinfo() opens a read-only filehandle to the Unicode
 918 Character Database (the database is included in the Perl distribution).
 919 The filehandle is then kept open for further queries.  In other words,
 920 if you are wondering where one of your filehandles went, that's where.
 921
 922 =head1 BUGS
 923
 924 Does not yet support EBCDIC platforms.
 925
 926 =head1 AUTHOR
 927
 928 Jarkko Hietaniemi
 929
 930 =cut
 931
 932 1;