lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.27';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     general_categories bidi_types
  19                     compexcl
  20                     casefold casespec
  21                     namedseq);
  22
  23 use Carp;
  24
  25 =head1 NAME
  26
  27 Unicode::UCD - Unicode character database
  28
  29 =head1 SYNOPSIS
  30
  31     use Unicode::UCD 'charinfo';
  32     my $charinfo   = charinfo($codepoint);
  33
  34     use Unicode::UCD 'casespec';
  35     my $casespec = casespec(0xFB00);
  36
  37     use Unicode::UCD 'charblock';
  38     my $charblock  = charblock($codepoint);
  39
  40     use Unicode::UCD 'charscript';
  41     my $charscript = charscript($codepoint);
  42
  43     use Unicode::UCD 'charblocks';
  44     my $charblocks = charblocks();
  45
  46     use Unicode::UCD 'charscripts';
  47     my $charscripts = charscripts();
  48
  49     use Unicode::UCD qw(charscript charinrange);
  50     my $range = charscript($script);
  51     print "looks like $script\n" if charinrange($range, $codepoint);
  52
  53     use Unicode::UCD qw(general_categories bidi_types);
  54     my $categories = general_categories();
  55     my $types = bidi_types();
  56
  57     use Unicode::UCD 'compexcl';
  58     my $compexcl = compexcl($codepoint);
  59
  60     use Unicode::UCD 'namedseq';
  61     my $namedseq = namedseq($named_sequence_name);
  62
  63     my $unicode_version = Unicode::UCD::UnicodeVersion();
  64
  65 =head1 DESCRIPTION
  66
  67 The Unicode::UCD module offers a series of functions that
  68 provide a simple interface to the Unicode
  69 Character Database.
  70
  71 =head2 code point argument
  72
  73 Some of the functions are called with a I<code point argument>, which is either
  74 a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
  75 followed by hexadecimals designating a Unicode code point.  In other words, if
  76 you want a code point to be interpreted as a hexadecimal number, you must
  77 prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
  78 interpreted as a decimal code point.  Also note that Unicode is B<not> limited
  79 to 16 bits (the number of Unicode code points is open-ended, in theory
  80 unlimited): you may have more than 4 hexdigits.
  81 =cut
  82
  83 my $UNICODEFH;
  84 my $BLOCKSFH;
  85 my $SCRIPTSFH;
  86 my $VERSIONFH;
  87 my $COMPEXCLFH;
  88 my $CASEFOLDFH;
  89 my $CASESPECFH;
  90 my $NAMEDSEQFH;
  91
  92 sub openunicode {
  93     my ($rfh, @path) = @_;
  94     my $f;
  95     unless (defined $$rfh) {
  96         for my $d (@INC) {
  97             use File::Spec;
  98             $f = File::Spec->catfile($d, "unicore", @path);
  99             last if open($$rfh, $f);
 100             undef $f;
 101         }
 102         croak __PACKAGE__, ": failed to find ",
 103               File::Spec->catfile(@path), " in @INC"
 104             unless defined $f;
 105     }
 106     return $f;
 107 }
 108
 109 =head2 B<charinfo()>
 110
 111     use Unicode::UCD 'charinfo';
 112
 113     my $charinfo = charinfo(0x41);
 114
 115 This returns information about the input L</code point argument>
 116 as a reference to a hash of fields as defined by the Unicode
 117 standard.  If the L</code point argument> is not assigned in the standard
 118 (i.e., has the general category C<Cn> meaning C<Unassigned>)
 119 or is a non-character (meaning it is guaranteed to never be assigned in
 120 the standard),
 121 B<undef> is returned.
 122
 123 Fields that aren't applicable to the particular code point argument exist in the
 124 returned hash, and are empty.
 125
 126 The keys in the hash with the meanings of their values are:
 127
 128 =over
 129
 130 =item B<code>
 131
 132 the input L</code point argument> expressed in hexadecimal, with leading zeros
 133 added if necessary to make it contain at least four hexdigits
 134
 135 =item B<name>
 136
 137 name of I<code>, all IN UPPER CASE.
 138 Some control-type code points do not have names.
 139 This field will be empty for C<Surrogate> and C<Private Use> code points,
 140 and for the others without a name,
 141 it will contain a description enclosed in angle brackets, like
 142 C<E<lt>controlE<gt>>.
 143
 144
 145 =item B<category>
 146
 147 The short name of the general category of I<code>.
 148 This will match one of the keys in the hash returned by L</general_categories()>.
 149
 150 =item B<combining>
 151
 152 the combining class number for I<code> used in the Canonical Ordering Algorithm.
 153 For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
 154 available at
 155 L<http://www.unicode.org/versions/Unicode5.1.0/>
 156
 157 =item B<bidi>
 158
 159 bidirectional type of I<code>.
 160 This will match one of the keys in the hash returned by L</bidi_types()>.
 161
 162 =item B<decomposition>
 163
 164 is empty if I<code> has no decomposition; or is one or more codes
 165 (separated by spaces) that taken in order represent a decomposition for
 166 I<code>.  Each has at least four hexdigits.
 167 The codes may be preceded by a word enclosed in angle brackets then a space,
 168 like C<E<lt>compatE<gt> >, giving the type of decomposition
 169
 170 =item B<decimal>
 171
 172 if I<code> is a decimal digit this is its integer numeric value
 173
 174 =item B<digit>
 175
 176 if I<code> represents a whole number, this is its integer numeric value
 177
 178 =item B<numeric>
 179
 180 if I<code> represents a whole or rational number, this is its numeric value.
 181 Rational values are expressed as a string like C<1/4>.
 182
 183 =item B<mirrored>
 184
 185 C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
 186
 187 =item B<unicode10>
 188
 189 name of I<code> in the Unicode 1.0 standard if one
 190 existed for this code point and is different from the current name
 191
 192 =item B<comment>
 193
 194 ISO 10646 comment field.
 195 It appears in parentheses in the ISO 10646 names list,
 196 or contains an asterisk to indicate there is
 197 a note for this code point in Annex P of that standard.
 198
 199 =item B<upper>
 200
 201 is empty if there is no single code point uppercase mapping for I<code>;
 202 otherwise it is that mapping expressed as at least four hexdigits.
 203 (L</casespec()> should be used in addition to B<charinfo()>
 204 for case mappings when the calling program can cope with multiple code point
 205 mappings.)
 206
 207 =item B<lower>
 208
 209 is empty if there is no single code point lowercase mapping for I<code>;
 210 otherwise it is that mapping expressed as at least four hexdigits.
 211 (L</casespec()> should be used in addition to B<charinfo()>
 212 for case mappings when the calling program can cope with multiple code point
 213 mappings.)
 214
 215 =item B<title>
 216
 217 is empty if there is no single code point titlecase mapping for I<code>;
 218 otherwise it is that mapping expressed as at least four hexdigits.
 219 (L</casespec()> should be used in addition to B<charinfo()>
 220 for case mappings when the calling program can cope with multiple code point
 221 mappings.)
 222
 223 =item B<block>
 224
 225 block I<code> belongs to (used in \p{In...}).
 226 See L</Blocks versus Scripts>.
 227
 228
 229 =item B<script>
 230
 231 script I<code> belongs to.
 232 See L</Blocks versus Scripts>.
 233
 234 =back
 235
 236 Note that you cannot do (de)composition and casing based solely on the
 237 I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
 238 you will need also the L</compexcl()>, and L</casespec()> functions.
 239
 240 =cut
 241
 242 # NB: This function is duplicated in charnames.pm
 243 sub _getcode {
 244     my $arg = shift;
 245
 246     if ($arg =~ /^[1-9]\d*$/) {
 247         return $arg;
 248     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 249         return hex($1);
 250     }
 251
 252     return;
 253 }
 254
 255 # Lingua::KO::Hangul::Util not part of the standard distribution
 256 # but it will be used if available.
 257
 258 eval { require Lingua::KO::Hangul::Util };
 259 my $hasHangulUtil = ! $@;
 260 if ($hasHangulUtil) {
 261     Lingua::KO::Hangul::Util->import();
 262 }
 263
 264 sub hangul_decomp { # internal: called from charinfo
 265     if ($hasHangulUtil) {
 266         my @tmp = decomposeHangul(shift);
 267         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 268         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 269     }
 270     return;
 271 }
 272
 273 sub hangul_charname { # internal: called from charinfo
 274     return sprintf("HANGUL SYLLABLE-%04X", shift);
 275 }
 276
 277 sub han_charname { # internal: called from charinfo
 278     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 279 }
 280
 281 # Overwritten by data in file
 282 my %first_last = (
 283    'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
 284    'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
 285    'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
 286 );
 287
 288 get_charinfo_ranges();
 289
 290 sub get_charinfo_ranges {
 291    my @blocks = keys %first_last;
 292
 293    my $fh;
 294    openunicode( \$fh, 'UnicodeData.txt' );
 295    if( defined $fh ){
 296       while( my $line = <$fh> ){
 297          next unless $line =~ /(?:First|Last)/;
 298          if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
 299             my ($number,$block,$type);
 300             ($number,$block) = split /;/, $line;
 301             $block =~ s/<|>//g;
 302             ($block,$type) = split /, /, $block;
 303             my $index = $type eq 'First' ? 0 : 1;
 304             $first_last{ $block }->[$index] = hex $number;
 305          }
 306       }
 307    }
 308 }
 309
 310 my @CharinfoRanges = (
 311 # block name
 312 # [ first, last, coderef to name, coderef to decompose ],
 313 # CJK Ideographs Extension A
 314   [ @{ $first_last{'CJK Ideograph Extension A'} },        \&han_charname,   undef  ],
 315 # CJK Ideographs
 316   [ @{ $first_last{'CJK Ideograph'} },                    \&han_charname,   undef  ],
 317 # Hangul Syllables
 318   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 319 # Non-Private Use High Surrogates
 320   [ 0xD800,   0xDB7F,   undef,   undef  ],
 321 # Private Use High Surrogates
 322   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 323 # Low Surrogates
 324   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 325 # The Private Use Area
 326   [ 0xE000,   0xF8FF,   undef,   undef  ],
 327 # CJK Ideographs Extension B
 328   [ @{ $first_last{'CJK Ideograph Extension B'} },        \&han_charname,   undef  ],
 329 # Plane 15 Private Use Area
 330   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 331 # Plane 16 Private Use Area
 332   [ 0x100000, 0x10FFFD, undef,   undef  ],
 333 );
 334
 335 sub charinfo {
 336     my $arg  = shift;
 337     my $code = _getcode($arg);
 338     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 339         unless defined $code;
 340     my $hexk = sprintf("%06X", $code);
 341     my($rcode,$rname,$rdec);
 342     foreach my $range (@CharinfoRanges){
 343       if ($range->[0] <= $code && $code <= $range->[1]) {
 344         $rcode = $hexk;
 345         $rcode =~ s/^0+//;
 346         $rcode =  sprintf("%04X", hex($rcode));
 347         $rname = $range->[2] ? $range->[2]->($code) : '';
 348         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 349         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 350         last;
 351       }
 352     }
 353     openunicode(\$UNICODEFH, "UnicodeData.txt");
 354     if (defined $UNICODEFH) {
 355         use Search::Dict 1.02;
 356         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 357             my $line = <$UNICODEFH>;
 358             return unless defined $line;
 359             chomp $line;
 360             my %prop;
 361             @prop{qw(
 362                      code name category
 363                      combining bidi decomposition
 364                      decimal digit numeric
 365                      mirrored unicode10 comment
 366                      upper lower title
 367                     )} = split(/;/, $line, -1);
 368             $hexk =~ s/^0+//;
 369             $hexk =  sprintf("%04X", hex($hexk));
 370             if ($prop{code} eq $hexk) {
 371                 $prop{block}  = charblock($code);
 372                 $prop{script} = charscript($code);
 373                 if(defined $rname){
 374                     $prop{code} = $rcode;
 375                     $prop{name} = $rname;
 376                     $prop{decomposition} = $rdec;
 377                 }
 378                 return \%prop;
 379             }
 380         }
 381     }
 382     return;
 383 }
 384
 385 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 386     my ($table, $lo, $hi, $code) = @_;
 387
 388     return if $lo > $hi;
 389
 390     my $mid = int(($lo+$hi) / 2);
 391
 392     if ($table->[$mid]->[0] < $code) {
 393         if ($table->[$mid]->[1] >= $code) {
 394             return $table->[$mid]->[2];
 395         } else {
 396             _search($table, $mid + 1, $hi, $code);
 397         }
 398     } elsif ($table->[$mid]->[0] > $code) {
 399         _search($table, $lo, $mid - 1, $code);
 400     } else {
 401         return $table->[$mid]->[2];
 402     }
 403 }
 404
 405 sub charinrange {
 406     my ($range, $arg) = @_;
 407     my $code = _getcode($arg);
 408     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 409         unless defined $code;
 410     _search($range, 0, $#$range, $code);
 411 }
 412
 413 =head2 B<charblock()>
 414
 415     use Unicode::UCD 'charblock';
 416
 417     my $charblock = charblock(0x41);
 418     my $charblock = charblock(1234);
 419     my $charblock = charblock(0x263a);
 420     my $charblock = charblock("U+263a");
 421
 422     my $range     = charblock('Armenian');
 423
 424 With a L</code point argument> charblock() returns the I<block> the code point
 425 belongs to, e.g.  C<Basic Latin>.
 426 If the code point is unassigned, this returns the block it would belong to if
 427 it were assigned (which it may in future versions of the Unicode Standard).
 428
 429 See also L</Blocks versus Scripts>.
 430
 431 If supplied with an argument that can't be a code point, charblock() tries
 432 to do the opposite and interpret the argument as a code point block. The
 433 return value is a I<range>: an anonymous list of lists that contain
 434 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 435 a code point is in a range using the L</charinrange()> function. If the
 436 argument is not a known code point block, B<undef> is returned.
 437
 438 =cut
 439
 440 my @BLOCKS;
 441 my %BLOCKS;
 442
 443 sub _charblocks {
 444     unless (@BLOCKS) {
 445         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 446             local $_;
 447             while (<$BLOCKSFH>) {
 448                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 449                     my ($lo, $hi) = (hex($1), hex($2));
 450                     my $subrange = [ $lo, $hi, $3 ];
 451                     push @BLOCKS, $subrange;
 452                     push @{$BLOCKS{$3}}, $subrange;
 453                 }
 454             }
 455             close($BLOCKSFH);
 456         }
 457     }
 458 }
 459
 460 sub charblock {
 461     my $arg = shift;
 462
 463     _charblocks() unless @BLOCKS;
 464
 465     my $code = _getcode($arg);
 466
 467     if (defined $code) {
 468         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 469     } else {
 470         if (exists $BLOCKS{$arg}) {
 471             return dclone $BLOCKS{$arg};
 472         } else {
 473             return;
 474         }
 475     }
 476 }
 477
 478 =head2 B<charscript()>
 479
 480     use Unicode::UCD 'charscript';
 481
 482     my $charscript = charscript(0x41);
 483     my $charscript = charscript(1234);
 484     my $charscript = charscript("U+263a");
 485
 486     my $range      = charscript('Thai');
 487
 488 With a L</code point argument> charscript() returns the I<script> the
 489 code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 490 If the code point is unassigned, it returns B<undef>
 491
 492 If supplied with an argument that can't be a code point, charscript() tries
 493 to do the opposite and interpret the argument as a code point script. The
 494 return value is a I<range>: an anonymous list of lists that contain
 495 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 496 code point is in a range using the L</charinrange()> function. If the
 497 argument is not a known code point script, B<undef> is returned.
 498
 499 See also L</Blocks versus Scripts>.
 500
 501 =cut
 502
 503 my @SCRIPTS;
 504 my %SCRIPTS;
 505
 506 sub _charscripts {
 507     unless (@SCRIPTS) {
 508         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 509             local $_;
 510             while (<$SCRIPTSFH>) {
 511                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 512                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 513                     my $script = lc($3);
 514                     $script =~ s/\b(\w)/uc($1)/ge;
 515                     my $subrange = [ $lo, $hi, $script ];
 516                     push @SCRIPTS, $subrange;
 517                     push @{$SCRIPTS{$script}}, $subrange;
 518                 }
 519             }
 520             close($SCRIPTSFH);
 521             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 522         }
 523     }
 524 }
 525
 526 sub charscript {
 527     my $arg = shift;
 528
 529     _charscripts() unless @SCRIPTS;
 530
 531     my $code = _getcode($arg);
 532
 533     if (defined $code) {
 534         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 535     } else {
 536         if (exists $SCRIPTS{$arg}) {
 537             return dclone $SCRIPTS{$arg};
 538         } else {
 539             return;
 540         }
 541     }
 542 }
 543
 544 =head2 B<charblocks()>
 545
 546     use Unicode::UCD 'charblocks';
 547
 548     my $charblocks = charblocks();
 549
 550 charblocks() returns a reference to a hash with the known block names
 551 as the keys, and the code point ranges (see L</charblock()>) as the values.
 552
 553 See also L</Blocks versus Scripts>.
 554
 555 =cut
 556
 557 sub charblocks {
 558     _charblocks() unless %BLOCKS;
 559     return dclone \%BLOCKS;
 560 }
 561
 562 =head2 B<charscripts()>
 563
 564     use Unicode::UCD 'charscripts';
 565
 566     my $charscripts = charscripts();
 567
 568 charscripts() returns a reference to a hash with the known script
 569 names as the keys, and the code point ranges (see L</charscript()>) as
 570 the values.
 571
 572 See also L</Blocks versus Scripts>.
 573
 574 =cut
 575
 576 sub charscripts {
 577     _charscripts() unless %SCRIPTS;
 578     return dclone \%SCRIPTS;
 579 }
 580
 581 =head2 B<charinrange()>
 582
 583 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 584 can also test whether a code point is in the I<range> as returned by
 585 L</charblock()> and L</charscript()> or as the values of the hash returned
 586 by L</charblocks()> and L</charscripts()> by using charinrange():
 587
 588     use Unicode::UCD qw(charscript charinrange);
 589
 590     $range = charscript('Hiragana');
 591     print "looks like hiragana\n" if charinrange($range, $codepoint);
 592
 593 =cut
 594
 595 my %GENERAL_CATEGORIES =
 596  (
 597     'L'  =>         'Letter',
 598     'LC' =>         'CasedLetter',
 599     'Lu' =>         'UppercaseLetter',
 600     'Ll' =>         'LowercaseLetter',
 601     'Lt' =>         'TitlecaseLetter',
 602     'Lm' =>         'ModifierLetter',
 603     'Lo' =>         'OtherLetter',
 604     'M'  =>         'Mark',
 605     'Mn' =>         'NonspacingMark',
 606     'Mc' =>         'SpacingMark',
 607     'Me' =>         'EnclosingMark',
 608     'N'  =>         'Number',
 609     'Nd' =>         'DecimalNumber',
 610     'Nl' =>         'LetterNumber',
 611     'No' =>         'OtherNumber',
 612     'P'  =>         'Punctuation',
 613     'Pc' =>         'ConnectorPunctuation',
 614     'Pd' =>         'DashPunctuation',
 615     'Ps' =>         'OpenPunctuation',
 616     'Pe' =>         'ClosePunctuation',
 617     'Pi' =>         'InitialPunctuation',
 618     'Pf' =>         'FinalPunctuation',
 619     'Po' =>         'OtherPunctuation',
 620     'S'  =>         'Symbol',
 621     'Sm' =>         'MathSymbol',
 622     'Sc' =>         'CurrencySymbol',
 623     'Sk' =>         'ModifierSymbol',
 624     'So' =>         'OtherSymbol',
 625     'Z'  =>         'Separator',
 626     'Zs' =>         'SpaceSeparator',
 627     'Zl' =>         'LineSeparator',
 628     'Zp' =>         'ParagraphSeparator',
 629     'C'  =>         'Other',
 630     'Cc' =>         'Control',
 631     'Cf' =>         'Format',
 632     'Cs' =>         'Surrogate',
 633     'Co' =>         'PrivateUse',
 634     'Cn' =>         'Unassigned',
 635  );
 636
 637 sub general_categories {
 638     return dclone \%GENERAL_CATEGORIES;
 639 }
 640
 641 =head2 B<general_categories()>
 642
 643     use Unicode::UCD 'general_categories';
 644
 645     my $categories = general_categories();
 646
 647 This returns a reference to a hash which has short
 648 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 649 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 650 C<Symbol>) as values.  The hash is reversible in case you need to go
 651 from the long names to the short names.  The general category is the
 652 one returned from
 653 L</charinfo()> under the C<category> key.
 654
 655 =cut
 656
 657 my %BIDI_TYPES =
 658  (
 659    'L'   => 'Left-to-Right',
 660    'LRE' => 'Left-to-Right Embedding',
 661    'LRO' => 'Left-to-Right Override',
 662    'R'   => 'Right-to-Left',
 663    'AL'  => 'Right-to-Left Arabic',
 664    'RLE' => 'Right-to-Left Embedding',
 665    'RLO' => 'Right-to-Left Override',
 666    'PDF' => 'Pop Directional Format',
 667    'EN'  => 'European Number',
 668    'ES'  => 'European Number Separator',
 669    'ET'  => 'European Number Terminator',
 670    'AN'  => 'Arabic Number',
 671    'CS'  => 'Common Number Separator',
 672    'NSM' => 'Non-Spacing Mark',
 673    'BN'  => 'Boundary Neutral',
 674    'B'   => 'Paragraph Separator',
 675    'S'   => 'Segment Separator',
 676    'WS'  => 'Whitespace',
 677    'ON'  => 'Other Neutrals',
 678  );
 679
 680 =head2 B<bidi_types()>
 681
 682     use Unicode::UCD 'bidi_types';
 683
 684     my $categories = bidi_types();
 685
 686 This returns a reference to a hash which has the short
 687 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 688 names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 689 hash is reversible in case you need to go from the long names to the
 690 short names.  The bidi type is the one returned from
 691 L</charinfo()>
 692 under the C<bidi> key.  For the exact meaning of the various bidi classes
 693 the Unicode TR9 is recommended reading:
 694 L<http://www.unicode.org/reports/tr9/>
 695 (as of Unicode 5.0.0)
 696
 697 =cut
 698
 699 sub bidi_types {
 700     return dclone \%BIDI_TYPES;
 701 }
 702
 703 =head2 B<compexcl()>
 704
 705     use Unicode::UCD 'compexcl';
 706
 707     my $compexcl = compexcl(0x09dc);
 708
 709 This returns B<true> if the
 710 L</code point argument> should not be produced by composition normalization,
 711 B<AND> if that fact is not otherwise determinable from the Unicode data base.
 712 It currently does not return B<true> if the code point has a decomposition
 713 consisting of another single code point, nor if its decomposition starts
 714 with a code point whose combining class is non-zero.  Code points that meet
 715 either of these conditions should also not be produced by composition
 716 normalization.
 717
 718 It returns B<false> otherwise.
 719
 720 =cut
 721
 722 my %COMPEXCL;
 723
 724 sub _compexcl {
 725     unless (%COMPEXCL) {
 726         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 727             local $_;
 728             while (<$COMPEXCLFH>) {
 729                 if (/^([0-9A-F]+)\s+\#\s+/) {
 730                     my $code = hex($1);
 731                     $COMPEXCL{$code} = undef;
 732                 }
 733             }
 734             close($COMPEXCLFH);
 735         }
 736     }
 737 }
 738
 739 sub compexcl {
 740     my $arg  = shift;
 741     my $code = _getcode($arg);
 742     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 743         unless defined $code;
 744
 745     _compexcl() unless %COMPEXCL;
 746
 747     return exists $COMPEXCL{$code};
 748 }
 749
 750 =head2 B<casefold()>
 751
 752     use Unicode::UCD 'casefold';
 753
 754     my $casefold = casefold(0xDF);
 755     if (defined $casefold) {
 756         my @full_fold_hex = split / /, $casefold->{'full'};
 757         my $full_fold_string =
 758                     join "", map {chr(hex($_))} @full_fold_hex;
 759         my @turkic_fold_hex =
 760                         split / /, ($casefold->{'turkic'} ne "")
 761                                         ? $casefold->{'turkic'}
 762                                         : $casefold->{'full'};
 763         my $turkic_fold_string =
 764                         join "", map {chr(hex($_))} @turkic_fold_hex;
 765     }
 766     if (defined $casefold && $casefold->{'simple'} ne "") {
 767         my $simple_fold_hex = $casefold->{'simple'};
 768         my $simple_fold_string = chr(hex($simple_fold_hex));
 769     }
 770
 771 This returns the (almost) locale-independent case folding of the
 772 character specified by the L</code point argument>.
 773
 774 If there is no case folding for that code point, B<undef> is returned.
 775
 776 If there is a case folding for that code point, a reference to a hash
 777 with the following fields is returned:
 778
 779 =over
 780
 781 =item B<code>
 782
 783 the input L</code point argument> expressed in hexadecimal, with leading zeros
 784 added if necessary to make it contain at least four hexdigits
 785
 786 =item B<full>
 787
 788 one or more codes (separated by spaces) that taken in order give the
 789 code points for the case folding for I<code>.
 790 Each has at least four hexdigits.
 791
 792 =item B<simple>
 793
 794 is empty, or is exactly one code with at least four hexdigits which can be used
 795 as an alternative case folding when the calling program cannot cope with the
 796 fold being a sequence of multiple code points.  If I<full> is just one code
 797 point, then I<simple> equals I<full>.  If there is no single code point folding
 798 defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
 799 inferior, but still better-than-nothing alternative folding to I<full>.
 800
 801 =item B<mapping>
 802
 803 is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
 804 otherwise.  It can be considered to be the simplest possible folding for
 805 I<code>.  It is defined primarily for backwards compatibility.
 806
 807 =item B<status>
 808
 809 is C<C> (for C<common>) if the best possible fold is a single code point
 810 (I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
 811 folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
 812 there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).  Note
 813 that this
 814 describes the contents of I<mapping>.  It is defined primarily for backwards
 815 compatibility.
 816
 817 On versions 3.1 and earlier of Unicode, I<status> can also be
 818 C<I> which is the same as C<C> but is a special case for dotted uppercase I and
 819 dotless lowercase i:
 820
 821 =over
 822
 823 =item B<*>
 824
 825 If you use this C<I> mapping, the result is case-insensitive,
 826 but dotless and dotted I's are not distinguished
 827
 828 =item B<*>
 829
 830 If you exclude this C<I> mapping, the result is not fully case-insensitive, but
 831 dotless and dotted I's are distinguished
 832
 833 =back
 834
 835 =item B<turkic>
 836
 837 contains any special folding for Turkic languages.  For versions of Unicode
 838 starting with 3.2, this field is empty unless I<code> has a different folding
 839 in Turkic languages, in which case it is one or more codes (separated by
 840 spaces) that taken in order give the code points for the case folding for
 841 I<code> in those languages.
 842 Each code has at least four hexdigits.
 843 Note that this folding does not maintain canonical equivalence without
 844 additional processing.
 845
 846 For versions of Unicode 3.1 and earlier, this field is empty unless there is a
 847 special folding for Turkic languages, in which case I<status> is C<I>, and
 848 I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
 849
 850 =back
 851
 852 Programs that want complete generality and the best folding results should use
 853 the folding contained in the I<full> field.  But note that the fold for some
 854 code points will be a sequence of multiple code points.
 855
 856 Programs that can't cope with the fold mapping being multiple code points can
 857 use the folding contained in the I<simple> field, with the loss of some
 858 generality.  In Unicode 5.1, about 7% of the defined foldings have no single
 859 code point folding.
 860
 861 The I<mapping> and I<status> fields are provided for backwards compatibility for
 862 existing programs.  They contain the same values as in previous versions of
 863 this function.
 864
 865 Locale is not completely independent.  The I<turkic> field contains results to
 866 use when the locale is a Turkic language.
 867
 868 For more information about case mappings see
 869 L<http://www.unicode.org/unicode/reports/tr21>
 870
 871 =cut
 872
 873 my %CASEFOLD;
 874
 875 sub _casefold {
 876     unless (%CASEFOLD) {
 877         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 878             local $_;
 879             while (<$CASEFOLDFH>) {
 880                 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 881                     my $code = hex($1);
 882                     $CASEFOLD{$code}{'code'} = $1;
 883                     $CASEFOLD{$code}{'turkic'} = "" unless
 884                                             defined $CASEFOLD{$code}{'turkic'};
 885                     if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
 886                                                         # earlier Unicodes
 887                                                         # Both entries there (I
 888                                                         # only checked 3.1) are
 889                                                         # the same as C, and
 890                                                         # there are no other
 891                                                         # entries for those
 892                                                         # codepoints, so treat
 893                                                         # as if C, but override
 894                                                         # the turkic one for
 895                                                         # 'I'.
 896                         $CASEFOLD{$code}{'status'} = $2;
 897                         $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
 898                         $CASEFOLD{$code}{'mapping'} = $3;
 899                         $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
 900                     } elsif ($2 eq 'F') {
 901                         $CASEFOLD{$code}{'full'} = $3;
 902                         unless (defined $CASEFOLD{$code}{'simple'}) {
 903                                 $CASEFOLD{$code}{'simple'} = "";
 904                                 $CASEFOLD{$code}{'mapping'} = $3;
 905                                 $CASEFOLD{$code}{'status'} = $2;
 906                         }
 907                     } elsif ($2 eq 'S') {
 908
 909
 910                         # There can't be a simple without a full, and simple
 911                         # overrides all but full
 912
 913                         $CASEFOLD{$code}{'simple'} = $3;
 914                         $CASEFOLD{$code}{'mapping'} = $3;
 915                         $CASEFOLD{$code}{'status'} = $2;
 916                     } elsif ($2 eq 'T') {
 917                         $CASEFOLD{$code}{'turkic'} = $3;
 918                     } # else can't happen because only [CIFST] are possible
 919                 }
 920             }
 921             close($CASEFOLDFH);
 922         }
 923     }
 924 }
 925
 926 sub casefold {
 927     my $arg  = shift;
 928     my $code = _getcode($arg);
 929     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 930         unless defined $code;
 931
 932     _casefold() unless %CASEFOLD;
 933
 934     return $CASEFOLD{$code};
 935 }
 936
 937 =head2 B<casespec()>
 938
 939     use Unicode::UCD 'casespec';
 940
 941     my $casespec = casespec(0xFB00);
 942
 943 This returns the potentially locale-dependent case mappings of the L</code point
 944 argument>.  The mappings may be longer than a single code point (which the basic
 945 Unicode case mappings as returned by L</charinfo()> never are).
 946
 947 If there are no case mappings for the L</code point argument>, or if all three
 948 possible mappings (I<lower>, I<title> and I<upper>) result in single code
 949 points and are locale independent and unconditional, B<undef> is returned
 950 (which means that the case mappings, if any, for the code point are those
 951 returned by L</charinfo()>).
 952
 953 Otherwise, a reference to a hash giving the mappings (or a reference to a hash
 954 of such hashes, explained below) is returned with the following keys and their
 955 meanings:
 956
 957 The keys in the bottom layer hash with the meanings of their values are:
 958
 959 =over
 960
 961 =item B<code>
 962
 963 the input L</code point argument> expressed in hexadecimal, with leading zeros
 964 added if necessary to make it contain at least four hexdigits
 965
 966 =item B<lower>
 967
 968 one or more codes (separated by spaces) that taken in order give the
 969 code points for the lower case of I<code>.
 970 Each has at least four hexdigits.
 971
 972 =item B<title>
 973
 974 one or more codes (separated by spaces) that taken in order give the
 975 code points for the title case of I<code>.
 976 Each has at least four hexdigits.
 977
 978 =item B<lower>
 979
 980 one or more codes (separated by spaces) that taken in order give the
 981 code points for the upper case of I<code>.
 982 Each has at least four hexdigits.
 983
 984 =item B<condition>
 985
 986 the conditions for the mappings to be valid.
 987 If B<undef>, the mappings are always valid.
 988 When defined, this field is a list of conditions,
 989 all of which must be true for the mappings to be valid.
 990 The list consists of one or more
 991 I<locales> (see below)
 992 and/or I<contexts> (explained in the next paragraph),
 993 separated by spaces.
 994 (Other than as used to separate elements, spaces are to be ignored.)
 995 Case distinctions in the condition list are not significant.
 996 Conditions preceded by "NON_" represent the negation of the condition.
 997
 998 A I<context> is one of those defined in the Unicode standard.
 999 For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1000 available at
1001 L<http://www.unicode.org/versions/Unicode5.1.0/>.
1002 These are for context-sensitive casing.
1003
1004 =back
1005
1006 The hash described above is returned for locale-independent casing, where
1007 at least one of the mappings has length longer than one.  If B<undef> is
1008 returned, the code point may have mappings, but if so, all are length one,
1009 and are returned by L</charinfo()>.
1010 Note that when this function does return a value, it will be for the complete
1011 set of mappings for a code point, even those whose length is one.
1012
1013 If there are additional casing rules that apply only in certain locales,
1014 an additional key for each will be defined in the returned hash.  Each such key
1015 will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1016 followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1017 and a variant code).  You can find the lists of all possible locales, see
1018 L<Locale::Country> and L<Locale::Language>.
1019 (In Unicode 5.1, the only locales returned by this function
1020 are C<lt>, C<tr>, and C<az>.)
1021
1022 Each locale key is a reference to a hash that has the form above, and gives
1023 the casing rules for that particular locale, which take precedence over the
1024 locale-independent ones when in that locale.
1025
1026 If the only casing for a code point is locale-dependent, then the returned
1027 hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1028 will contain only locale keys.
1029
1030 For more information about case mappings see
1031 L<http://www.unicode.org/unicode/reports/tr21/>
1032
1033 =cut
1034
1035 my %CASESPEC;
1036
1037 sub _casespec {
1038     unless (%CASESPEC) {
1039         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
1040             local $_;
1041             while (<$CASESPECFH>) {
1042                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
1043                     my ($hexcode, $lower, $title, $upper, $condition) =
1044                         ($1, $2, $3, $4, $5);
1045                     my $code = hex($hexcode);
1046                     if (exists $CASESPEC{$code}) {
1047                         if (exists $CASESPEC{$code}->{code}) {
1048                             my ($oldlower,
1049                                 $oldtitle,
1050                                 $oldupper,
1051                                 $oldcondition) =
1052                                     @{$CASESPEC{$code}}{qw(lower
1053                                                            title
1054                                                            upper
1055                                                            condition)};
1056                             if (defined $oldcondition) {
1057                                 my ($oldlocale) =
1058                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
1059                                 delete $CASESPEC{$code};
1060                                 $CASESPEC{$code}->{$oldlocale} =
1061                                 { code      => $hexcode,
1062                                   lower     => $oldlower,
1063                                   title     => $oldtitle,
1064                                   upper     => $oldupper,
1065                                   condition => $oldcondition };
1066                             }
1067                         }
1068                         my ($locale) =
1069                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1070                         $CASESPEC{$code}->{$locale} =
1071                         { code      => $hexcode,
1072                           lower     => $lower,
1073                           title     => $title,
1074                           upper     => $upper,
1075                           condition => $condition };
1076                     } else {
1077                         $CASESPEC{$code} =
1078                         { code      => $hexcode,
1079                           lower     => $lower,
1080                           title     => $title,
1081                           upper     => $upper,
1082                           condition => $condition };
1083                     }
1084                 }
1085             }
1086             close($CASESPECFH);
1087         }
1088     }
1089 }
1090
1091 sub casespec {
1092     my $arg  = shift;
1093     my $code = _getcode($arg);
1094     croak __PACKAGE__, "::casespec: unknown code '$arg'"
1095         unless defined $code;
1096
1097     _casespec() unless %CASESPEC;
1098
1099     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
1100 }
1101
1102 =head2 B<namedseq()>
1103
1104     use Unicode::UCD 'namedseq';
1105
1106     my $namedseq = namedseq("KATAKANA LETTER AINU P");
1107     my @namedseq = namedseq("KATAKANA LETTER AINU P");
1108     my %namedseq = namedseq();
1109
1110 If used with a single argument in a scalar context, returns the string
1111 consisting of the code points of the named sequence, or B<undef> if no
1112 named sequence by that name exists.  If used with a single argument in
1113 a list context, it returns the list of the code points.  If used with no
1114 arguments in a list context, returns a hash with the names of the
1115 named sequences as the keys and the named sequences as strings as
1116 the values.  Otherwise, it returns B<undef> or an empty list depending
1117 on the context.
1118
1119 This function only operates on officially approved (not provisional) named
1120 sequences.
1121
1122 =cut
1123
1124 my %NAMEDSEQ;
1125
1126 sub _namedseq {
1127     unless (%NAMEDSEQ) {
1128         if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
1129             local $_;
1130             while (<$NAMEDSEQFH>) {
1131                 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
1132                     my ($n, $s) = ($1, $2);
1133                     my @s = map { chr(hex($_)) } split(' ', $s);
1134                     $NAMEDSEQ{$n} = join("", @s);
1135                 }
1136             }
1137             close($NAMEDSEQFH);
1138         }
1139     }
1140 }
1141
1142 sub namedseq {
1143     _namedseq() unless %NAMEDSEQ;
1144     my $wantarray = wantarray();
1145     if (defined $wantarray) {
1146         if ($wantarray) {
1147             if (@_ == 0) {
1148                 return %NAMEDSEQ;
1149             } elsif (@_ == 1) {
1150                 my $s = $NAMEDSEQ{ $_[0] };
1151                 return defined $s ? map { ord($_) } split('', $s) : ();
1152             }
1153         } elsif (@_ == 1) {
1154             return $NAMEDSEQ{ $_[0] };
1155         }
1156     }
1157     return;
1158 }
1159
1160 =head2 Unicode::UCD::UnicodeVersion
1161
1162 This returns the version of the Unicode Character Database, in other words, the
1163 version of the Unicode standard the database implements.  The version is a
1164 string of numbers delimited by dots (C<'.'>).
1165
1166 =cut
1167
1168 my $UNICODEVERSION;
1169
1170 sub UnicodeVersion {
1171     unless (defined $UNICODEVERSION) {
1172         openunicode(\$VERSIONFH, "version");
1173         chomp($UNICODEVERSION = <$VERSIONFH>);
1174         close($VERSIONFH);
1175         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1176             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1177     }
1178     return $UNICODEVERSION;
1179 }
1180
1181 =head2 B<Blocks versus Scripts>
1182
1183 The difference between a block and a script is that scripts are closer
1184 to the linguistic notion of a set of code points required to present
1185 languages, while block is more of an artifact of the Unicode code point
1186 numbering and separation into blocks of (mostly) 256 code points.
1187
1188 For example the Latin B<script> is spread over several B<blocks>, such
1189 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1190 C<Latin Extended-B>.  On the other hand, the Latin script does not
1191 contain all the characters of the C<Basic Latin> block (also known as
1192 ASCII): it includes only the letters, and not, for example, the digits
1193 or the punctuation.
1194
1195 For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1196
1197 For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1198
1199 =head2 B<Matching Scripts and Blocks>
1200
1201 Scripts are matched with the regular-expression construct
1202 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1203 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1204 any of the 256 code points in the Tibetan block).
1205
1206
1207 =head2 Implementation Note
1208
1209 The first use of charinfo() opens a read-only filehandle to the Unicode
1210 Character Database (the database is included in the Perl distribution).
1211 The filehandle is then kept open for further queries.  In other words,
1212 if you are wondering where one of your filehandles went, that's where.
1213
1214 =head1 BUGS
1215
1216 Does not yet support EBCDIC platforms.
1217
1218 L</compexcl()> should give a complete list of excluded code points.
1219
1220 =head1 AUTHOR
1221
1222 Jarkko Hietaniemi
1223
1224 =cut
1225
1226 1;