lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.27';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     general_categories bidi_types
  19                     compexcl
  20                     casefold casespec
  21                     namedseq);
  22
  23 use Carp;
  24
  25 =head1 NAME
  26
  27 Unicode::UCD - Unicode character database
  28
  29 =head1 SYNOPSIS
  30
  31     use Unicode::UCD 'charinfo';
  32     my $charinfo   = charinfo($codepoint);
  33
  34     use Unicode::UCD 'charblock';
  35     my $charblock  = charblock($codepoint);
  36
  37     use Unicode::UCD 'charscript';
  38     my $charscript = charscript($codepoint);
  39
  40     use Unicode::UCD 'charblocks';
  41     my $charblocks = charblocks();
  42
  43     use Unicode::UCD 'charscripts';
  44     my $charscripts = charscripts();
  45
  46     use Unicode::UCD qw(charscript charinrange);
  47     my $range = charscript($script);
  48     print "looks like $script\n" if charinrange($range, $codepoint);
  49
  50     use Unicode::UCD qw(general_categories bidi_types);
  51     my $categories = general_categories();
  52     my $types = bidi_types();
  53
  54     use Unicode::UCD 'compexcl';
  55     my $compexcl = compexcl($codepoint);
  56
  57     use Unicode::UCD 'namedseq';
  58     my $namedseq = namedseq($named_sequence_name);
  59
  60     my $unicode_version = Unicode::UCD::UnicodeVersion();
  61
  62 =head1 DESCRIPTION
  63
  64 The Unicode::UCD module offers a series of functions that
  65 provide a simple interface to the Unicode
  66 Character Database.
  67
  68 =head2 code point argument
  69
  70 Some of the functions are called with a I<code point argument>, which is either
  71 a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
  72 followed by hexadecimals designating a Unicode code point.  In other words, if
  73 you want a code point to be interpreted as a hexadecimal number, you must
  74 prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
  75 interpreted as a decimal code point.  Also note that Unicode is B<not> limited
  76 to 16 bits (the number of Unicode code points is open-ended, in theory
  77 unlimited): you may have more than 4 hexdigits.
  78 =cut
  79
  80 my $UNICODEFH;
  81 my $BLOCKSFH;
  82 my $SCRIPTSFH;
  83 my $VERSIONFH;
  84 my $COMPEXCLFH;
  85 my $CASEFOLDFH;
  86 my $CASESPECFH;
  87 my $NAMEDSEQFH;
  88
  89 sub openunicode {
  90     my ($rfh, @path) = @_;
  91     my $f;
  92     unless (defined $$rfh) {
  93         for my $d (@INC) {
  94             use File::Spec;
  95             $f = File::Spec->catfile($d, "unicore", @path);
  96             last if open($$rfh, $f);
  97             undef $f;
  98         }
  99         croak __PACKAGE__, ": failed to find ",
 100               File::Spec->catfile(@path), " in @INC"
 101             unless defined $f;
 102     }
 103     return $f;
 104 }
 105
 106 =head2 B<charinfo()>
 107
 108     use Unicode::UCD 'charinfo';
 109
 110     my $charinfo = charinfo(0x41);
 111
 112 This returns information about the input L</code point argument>
 113 as a reference to a hash of fields as defined by the Unicode
 114 standard.  If the L</code point argument> is not assigned in the standard
 115 (i.e., has the general category C<Cn> meaning C<Unassigned>)
 116 or is a non-character (meaning it is guaranteed to never be assigned in
 117 the standard),
 118 B<undef> is returned.
 119
 120 Fields that aren't applicable to the particular code point argument exist in the
 121 returned hash, and are empty.
 122
 123 The keys in the hash with the meanings of their values are:
 124
 125 =over
 126
 127 =item B<code>
 128
 129 the input L</code point argument> expressed in hexadecimal, with leading zeros
 130 added if necessary to make it contain at least four hexdigits
 131
 132 =item B<name>
 133
 134 name of I<code>, all IN UPPER CASE.
 135 Some control-type code points do not have names.
 136 This field will be empty for C<Surrogate> and C<Private Use> code points,
 137 and for the others without a name,
 138 it will contain a description enclosed in angle brackets, like
 139 C<E<lt>controlE<gt>>.
 140
 141
 142 =item B<category>
 143
 144 The short name of the general category of I<code>.
 145 This will match one of the keys in the hash returned by L</general_categories()>.
 146
 147 =item B<combining>
 148
 149 the combining class number for I<code> used in the Canonical Ordering Algorithm.
 150 For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
 151 available at
 152 L<http://www.unicode.org/versions/Unicode5.1.0/>
 153
 154 =item B<bidi>
 155
 156 bidirectional type of I<code>.
 157 This will match one of the keys in the hash returned by L</bidi_types()>.
 158
 159 =item B<decomposition>
 160
 161 is empty if I<code> has no decomposition; or is one or more codes
 162 (separated by spaces) that taken in order represent a decomposition for
 163 I<code>.  Each has at least four hexdigits.
 164 The codes may be preceded by a word enclosed in angle brackets then a space,
 165 like C<E<lt>compatE<gt> >, giving the type of decomposition
 166
 167 =item B<decimal>
 168
 169 if I<code> is a decimal digit this is its integer numeric value
 170
 171 =item B<digit>
 172
 173 if I<code> represents a whole number, this is its integer numeric value
 174
 175 =item B<numeric>
 176
 177 if I<code> represents a whole or rational number, this is its numeric value.
 178 Rational values are expressed as a string like C<1/4>.
 179
 180 =item B<mirrored>
 181
 182 C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
 183
 184 =item B<unicode10>
 185
 186 name of I<code> in the Unicode 1.0 standard if one
 187 existed for this code point and is different from the current name
 188
 189 =item B<comment>
 190
 191 ISO 10646 comment field.
 192 It appears in parentheses in the ISO 10646 names list,
 193 or contains an asterisk to indicate there is
 194 a note for this code point in Annex P of that standard.
 195
 196 =item B<upper>
 197
 198 is empty if there is no single code point uppercase mapping for I<code>;
 199 otherwise it is that mapping expressed as at least four hexdigits.
 200 (L</casespec()> should be used in addition to B<charinfo()>
 201 for case mappings when the calling program can cope with multiple code point
 202 mappings.)
 203
 204 =item B<lower>
 205
 206 is empty if there is no single code point lowercase mapping for I<code>;
 207 otherwise it is that mapping expressed as at least four hexdigits.
 208 (L</casespec()> should be used in addition to B<charinfo()>
 209 for case mappings when the calling program can cope with multiple code point
 210 mappings.)
 211
 212 =item B<title>
 213
 214 is empty if there is no single code point titlecase mapping for I<code>;
 215 otherwise it is that mapping expressed as at least four hexdigits.
 216 (L</casespec()> should be used in addition to B<charinfo()>
 217 for case mappings when the calling program can cope with multiple code point
 218 mappings.)
 219
 220 =item B<block>
 221
 222 block I<code> belongs to (used in \p{In...}).
 223 See L</Blocks versus Scripts>.
 224
 225
 226 =item B<script>
 227
 228 script I<code> belongs to.
 229 See L</Blocks versus Scripts>.
 230
 231 =back
 232
 233 Note that you cannot do (de)composition and casing based solely on the
 234 I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
 235 you will need also the L</compexcl()>, and L</casespec()> functions.
 236
 237 =cut
 238
 239 # NB: This function is duplicated in charnames.pm
 240 sub _getcode {
 241     my $arg = shift;
 242
 243     if ($arg =~ /^[1-9]\d*$/) {
 244         return $arg;
 245     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 246         return hex($1);
 247     }
 248
 249     return;
 250 }
 251
 252 # Lingua::KO::Hangul::Util not part of the standard distribution
 253 # but it will be used if available.
 254
 255 eval { require Lingua::KO::Hangul::Util };
 256 my $hasHangulUtil = ! $@;
 257 if ($hasHangulUtil) {
 258     Lingua::KO::Hangul::Util->import();
 259 }
 260
 261 sub hangul_decomp { # internal: called from charinfo
 262     if ($hasHangulUtil) {
 263         my @tmp = decomposeHangul(shift);
 264         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 265         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 266     }
 267     return;
 268 }
 269
 270 sub hangul_charname { # internal: called from charinfo
 271     return sprintf("HANGUL SYLLABLE-%04X", shift);
 272 }
 273
 274 sub han_charname { # internal: called from charinfo
 275     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 276 }
 277
 278 my %first_last = (
 279    'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
 280    'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
 281    'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
 282 );
 283
 284 get_charinfo_ranges();
 285
 286 sub get_charinfo_ranges {
 287    my @blocks = keys %first_last;
 288
 289    my $fh;
 290    openunicode( \$fh, 'UnicodeData.txt' );
 291    if( defined $fh ){
 292       while( my $line = <$fh> ){
 293          next unless $line =~ /(?:First|Last)/;
 294          if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
 295             my ($number,$block,$type);
 296             ($number,$block) = split /;/, $line;
 297             $block =~ s/<|>//g;
 298             ($block,$type) = split /, /, $block;
 299             my $index = $type eq 'First' ? 0 : 1;
 300             $first_last{ $block }->[$index] = hex $number;
 301          }
 302       }
 303    }
 304 }
 305
 306 my @CharinfoRanges = (
 307 # block name
 308 # [ first, last, coderef to name, coderef to decompose ],
 309 # CJK Ideographs Extension A
 310   [ @{ $first_last{'CJK Ideograph Extension A'} },        \&han_charname,   undef  ],
 311 # CJK Ideographs
 312   [ @{ $first_last{'CJK Ideograph'} },                    \&han_charname,   undef  ],
 313 # Hangul Syllables
 314   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 315 # Non-Private Use High Surrogates
 316   [ 0xD800,   0xDB7F,   undef,   undef  ],
 317 # Private Use High Surrogates
 318   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 319 # Low Surrogates
 320   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 321 # The Private Use Area
 322   [ 0xE000,   0xF8FF,   undef,   undef  ],
 323 # CJK Ideographs Extension B
 324   [ @{ $first_last{'CJK Ideograph Extension B'} },        \&han_charname,   undef  ],
 325 # Plane 15 Private Use Area
 326   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 327 # Plane 16 Private Use Area
 328   [ 0x100000, 0x10FFFD, undef,   undef  ],
 329 );
 330
 331 sub charinfo {
 332     my $arg  = shift;
 333     my $code = _getcode($arg);
 334     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 335         unless defined $code;
 336     my $hexk = sprintf("%06X", $code);
 337     my($rcode,$rname,$rdec);
 338     foreach my $range (@CharinfoRanges){
 339       if ($range->[0] <= $code && $code <= $range->[1]) {
 340         $rcode = $hexk;
 341         $rcode =~ s/^0+//;
 342         $rcode =  sprintf("%04X", hex($rcode));
 343         $rname = $range->[2] ? $range->[2]->($code) : '';
 344         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 345         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 346         last;
 347       }
 348     }
 349     openunicode(\$UNICODEFH, "UnicodeData.txt");
 350     if (defined $UNICODEFH) {
 351         use Search::Dict 1.02;
 352         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 353             my $line = <$UNICODEFH>;
 354             return unless defined $line;
 355             chomp $line;
 356             my %prop;
 357             @prop{qw(
 358                      code name category
 359                      combining bidi decomposition
 360                      decimal digit numeric
 361                      mirrored unicode10 comment
 362                      upper lower title
 363                     )} = split(/;/, $line, -1);
 364             $hexk =~ s/^0+//;
 365             $hexk =  sprintf("%04X", hex($hexk));
 366             if ($prop{code} eq $hexk) {
 367                 $prop{block}  = charblock($code);
 368                 $prop{script} = charscript($code);
 369                 if(defined $rname){
 370                     $prop{code} = $rcode;
 371                     $prop{name} = $rname;
 372                     $prop{decomposition} = $rdec;
 373                 }
 374                 return \%prop;
 375             }
 376         }
 377     }
 378     return;
 379 }
 380
 381 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 382     my ($table, $lo, $hi, $code) = @_;
 383
 384     return if $lo > $hi;
 385
 386     my $mid = int(($lo+$hi) / 2);
 387
 388     if ($table->[$mid]->[0] < $code) {
 389         if ($table->[$mid]->[1] >= $code) {
 390             return $table->[$mid]->[2];
 391         } else {
 392             _search($table, $mid + 1, $hi, $code);
 393         }
 394     } elsif ($table->[$mid]->[0] > $code) {
 395         _search($table, $lo, $mid - 1, $code);
 396     } else {
 397         return $table->[$mid]->[2];
 398     }
 399 }
 400
 401 sub charinrange {
 402     my ($range, $arg) = @_;
 403     my $code = _getcode($arg);
 404     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 405         unless defined $code;
 406     _search($range, 0, $#$range, $code);
 407 }
 408
 409 =head2 B<charblock()>
 410
 411     use Unicode::UCD 'charblock';
 412
 413     my $charblock = charblock(0x41);
 414     my $charblock = charblock(1234);
 415     my $charblock = charblock(0x263a);
 416     my $charblock = charblock("U+263a");
 417
 418     my $range     = charblock('Armenian');
 419
 420 With a L</code point argument> charblock() returns the I<block> the code point
 421 belongs to, e.g.  C<Basic Latin>.
 422 If the code point is unassigned, this returns the block it would belong to if
 423 it were assigned (which it may in future versions of the Unicode Standard).
 424
 425 See also L</Blocks versus Scripts>.
 426
 427 If supplied with an argument that can't be a code point, charblock() tries
 428 to do the opposite and interpret the argument as a code point block. The
 429 return value is a I<range>: an anonymous list of lists that contain
 430 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 431 a code point is in a range using the L</charinrange()> function. If the
 432 argument is not a known code point block, B<undef> is returned.
 433
 434 =cut
 435
 436 my @BLOCKS;
 437 my %BLOCKS;
 438
 439 sub _charblocks {
 440     unless (@BLOCKS) {
 441         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 442             local $_;
 443             while (<$BLOCKSFH>) {
 444                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 445                     my ($lo, $hi) = (hex($1), hex($2));
 446                     my $subrange = [ $lo, $hi, $3 ];
 447                     push @BLOCKS, $subrange;
 448                     push @{$BLOCKS{$3}}, $subrange;
 449                 }
 450             }
 451             close($BLOCKSFH);
 452         }
 453     }
 454 }
 455
 456 sub charblock {
 457     my $arg = shift;
 458
 459     _charblocks() unless @BLOCKS;
 460
 461     my $code = _getcode($arg);
 462
 463     if (defined $code) {
 464         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 465     } else {
 466         if (exists $BLOCKS{$arg}) {
 467             return dclone $BLOCKS{$arg};
 468         } else {
 469             return;
 470         }
 471     }
 472 }
 473
 474 =head2 B<charscript()>
 475
 476     use Unicode::UCD 'charscript';
 477
 478     my $charscript = charscript(0x41);
 479     my $charscript = charscript(1234);
 480     my $charscript = charscript("U+263a");
 481
 482     my $range      = charscript('Thai');
 483
 484 With a L</code point argument> charscript() returns the I<script> the
 485 code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 486 If the code point is unassigned, it returns B<undef>
 487
 488 If supplied with an argument that can't be a code point, charscript() tries
 489 to do the opposite and interpret the argument as a code point script. The
 490 return value is a I<range>: an anonymous list of lists that contain
 491 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 492 code point is in a range using the L</charinrange()> function. If the
 493 argument is not a known code point script, B<undef> is returned.
 494
 495 See also L</Blocks versus Scripts>.
 496
 497 =cut
 498
 499 my @SCRIPTS;
 500 my %SCRIPTS;
 501
 502 sub _charscripts {
 503     unless (@SCRIPTS) {
 504         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 505             local $_;
 506             while (<$SCRIPTSFH>) {
 507                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 508                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 509                     my $script = lc($3);
 510                     $script =~ s/\b(\w)/uc($1)/ge;
 511                     my $subrange = [ $lo, $hi, $script ];
 512                     push @SCRIPTS, $subrange;
 513                     push @{$SCRIPTS{$script}}, $subrange;
 514                 }
 515             }
 516             close($SCRIPTSFH);
 517             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 518         }
 519     }
 520 }
 521
 522 sub charscript {
 523     my $arg = shift;
 524
 525     _charscripts() unless @SCRIPTS;
 526
 527     my $code = _getcode($arg);
 528
 529     if (defined $code) {
 530         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 531     } else {
 532         if (exists $SCRIPTS{$arg}) {
 533             return dclone $SCRIPTS{$arg};
 534         } else {
 535             return;
 536         }
 537     }
 538 }
 539
 540 =head2 B<charblocks()>
 541
 542     use Unicode::UCD 'charblocks';
 543
 544     my $charblocks = charblocks();
 545
 546 charblocks() returns a reference to a hash with the known block names
 547 as the keys, and the code point ranges (see L</charblock()>) as the values.
 548
 549 See also L</Blocks versus Scripts>.
 550
 551 =cut
 552
 553 sub charblocks {
 554     _charblocks() unless %BLOCKS;
 555     return dclone \%BLOCKS;
 556 }
 557
 558 =head2 B<charscripts()>
 559
 560     use Unicode::UCD 'charscripts';
 561
 562     my $charscripts = charscripts();
 563
 564 charscripts() returns a reference to a hash with the known script
 565 names as the keys, and the code point ranges (see L</charscript()>) as
 566 the values.
 567
 568 See also L</Blocks versus Scripts>.
 569
 570 =cut
 571
 572 sub charscripts {
 573     _charscripts() unless %SCRIPTS;
 574     return dclone \%SCRIPTS;
 575 }
 576
 577 =head2 B<charinrange()>
 578
 579 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 580 can also test whether a code point is in the I<range> as returned by
 581 L</charblock()> and L</charscript()> or as the values of the hash returned
 582 by L</charblocks()> and L</charscripts()> by using charinrange():
 583
 584     use Unicode::UCD qw(charscript charinrange);
 585
 586     $range = charscript('Hiragana');
 587     print "looks like hiragana\n" if charinrange($range, $codepoint);
 588
 589 =cut
 590
 591 my %GENERAL_CATEGORIES =
 592  (
 593     'L'  =>         'Letter',
 594     'LC' =>         'CasedLetter',
 595     'Lu' =>         'UppercaseLetter',
 596     'Ll' =>         'LowercaseLetter',
 597     'Lt' =>         'TitlecaseLetter',
 598     'Lm' =>         'ModifierLetter',
 599     'Lo' =>         'OtherLetter',
 600     'M'  =>         'Mark',
 601     'Mn' =>         'NonspacingMark',
 602     'Mc' =>         'SpacingMark',
 603     'Me' =>         'EnclosingMark',
 604     'N'  =>         'Number',
 605     'Nd' =>         'DecimalNumber',
 606     'Nl' =>         'LetterNumber',
 607     'No' =>         'OtherNumber',
 608     'P'  =>         'Punctuation',
 609     'Pc' =>         'ConnectorPunctuation',
 610     'Pd' =>         'DashPunctuation',
 611     'Ps' =>         'OpenPunctuation',
 612     'Pe' =>         'ClosePunctuation',
 613     'Pi' =>         'InitialPunctuation',
 614     'Pf' =>         'FinalPunctuation',
 615     'Po' =>         'OtherPunctuation',
 616     'S'  =>         'Symbol',
 617     'Sm' =>         'MathSymbol',
 618     'Sc' =>         'CurrencySymbol',
 619     'Sk' =>         'ModifierSymbol',
 620     'So' =>         'OtherSymbol',
 621     'Z'  =>         'Separator',
 622     'Zs' =>         'SpaceSeparator',
 623     'Zl' =>         'LineSeparator',
 624     'Zp' =>         'ParagraphSeparator',
 625     'C'  =>         'Other',
 626     'Cc' =>         'Control',
 627     'Cf' =>         'Format',
 628     'Cs' =>         'Surrogate',
 629     'Co' =>         'PrivateUse',
 630     'Cn' =>         'Unassigned',
 631  );
 632
 633 sub general_categories {
 634     return dclone \%GENERAL_CATEGORIES;
 635 }
 636
 637 =head2 B<general_categories()>
 638
 639     use Unicode::UCD 'general_categories';
 640
 641     my $categories = general_categories();
 642
 643 This returns a reference to a hash which has short
 644 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 645 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 646 C<Symbol>) as values.  The hash is reversible in case you need to go
 647 from the long names to the short names.  The general category is the
 648 one returned from
 649 L</charinfo()> under the C<category> key.
 650
 651 =cut
 652
 653 my %BIDI_TYPES =
 654  (
 655    'L'   => 'Left-to-Right',
 656    'LRE' => 'Left-to-Right Embedding',
 657    'LRO' => 'Left-to-Right Override',
 658    'R'   => 'Right-to-Left',
 659    'AL'  => 'Right-to-Left Arabic',
 660    'RLE' => 'Right-to-Left Embedding',
 661    'RLO' => 'Right-to-Left Override',
 662    'PDF' => 'Pop Directional Format',
 663    'EN'  => 'European Number',
 664    'ES'  => 'European Number Separator',
 665    'ET'  => 'European Number Terminator',
 666    'AN'  => 'Arabic Number',
 667    'CS'  => 'Common Number Separator',
 668    'NSM' => 'Non-Spacing Mark',
 669    'BN'  => 'Boundary Neutral',
 670    'B'   => 'Paragraph Separator',
 671    'S'   => 'Segment Separator',
 672    'WS'  => 'Whitespace',
 673    'ON'  => 'Other Neutrals',
 674  );
 675
 676 =head2 B<bidi_types()>
 677
 678     use Unicode::UCD 'bidi_types';
 679
 680     my $categories = bidi_types();
 681
 682 This returns a reference to a hash which has the short
 683 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 684 names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 685 hash is reversible in case you need to go from the long names to the
 686 short names.  The bidi type is the one returned from
 687 L</charinfo()>
 688 under the C<bidi> key.  For the exact meaning of the various bidi classes
 689 the Unicode TR9 is recommended reading:
 690 L<http://www.unicode.org/reports/tr9/>
 691 (as of Unicode 5.0.0)
 692
 693 =cut
 694
 695 sub bidi_types {
 696     return dclone \%BIDI_TYPES;
 697 }
 698
 699 =head2 B<compexcl()>
 700
 701     use Unicode::UCD 'compexcl';
 702
 703     my $compexcl = compexcl(0x09dc);
 704
 705 This returns B<true> if the
 706 L</code point argument> should not be produced by composition normalization,
 707 B<AND> if that fact is not otherwise determinable from the Unicode data base.
 708 It currently does not return B<true> if the code point has a decomposition
 709 consisting of another single code point, nor if its decomposition starts
 710 with a code point whose combining class is non-zero.  Code points that meet
 711 either of these conditions should also not be produced by composition
 712 normalization.
 713
 714 It returns B<false> otherwise.
 715
 716 =cut
 717
 718 my %COMPEXCL;
 719
 720 sub _compexcl {
 721     unless (%COMPEXCL) {
 722         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 723             local $_;
 724             while (<$COMPEXCLFH>) {
 725                 if (/^([0-9A-F]+)\s+\#\s+/) {
 726                     my $code = hex($1);
 727                     $COMPEXCL{$code} = undef;
 728                 }
 729             }
 730             close($COMPEXCLFH);
 731         }
 732     }
 733 }
 734
 735 sub compexcl {
 736     my $arg  = shift;
 737     my $code = _getcode($arg);
 738     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 739         unless defined $code;
 740
 741     _compexcl() unless %COMPEXCL;
 742
 743     return exists $COMPEXCL{$code};
 744 }
 745
 746 =head2 B<casefold()>
 747
 748     use Unicode::UCD 'casefold';
 749
 750     my $casefold = casefold(0xDF);
 751     if (defined $casefold) {
 752         my @full_fold_hex = split / /, $casefold->{'full'};
 753         my $full_fold_string =
 754                     join "", map {chr(hex($_))} @full_fold_hex;
 755         my @turkic_fold_hex =
 756                         split / /, ($casefold->{'turkic'} ne "")
 757                                         ? $casefold->{'turkic'}
 758                                         : $casefold->{'full'};
 759         my $turkic_fold_string =
 760                         join "", map {chr(hex($_))} @turkic_fold_hex;
 761     }
 762     if (defined $casefold && $casefold->{'simple'} ne "") {
 763         my $simple_fold_hex = $casefold->{'simple'};
 764         my $simple_fold_string = chr(hex($simple_fold_hex));
 765     }
 766
 767 This returns the (almost) locale-independent case folding of the
 768 character specified by the L</code point argument>.
 769
 770 If there is no case folding for that code point, B<undef> is returned.
 771
 772 If there is a case folding for that code point, a reference to a hash
 773 with the following fields is returned:
 774
 775 =over
 776
 777 =item B<code>
 778
 779 the input L</code point argument> expressed in hexadecimal, with leading zeros
 780 added if necessary to make it contain at least four hexdigits
 781
 782 =item B<full>
 783
 784 one or more codes (separated by spaces) that taken in order give the
 785 code points for the case folding for I<code>.
 786 Each has at least four hexdigits.
 787
 788 =item B<simple>
 789
 790 is empty, or is exactly one code with at least four hexdigits which can be used
 791 as an alternative case folding when the calling program cannot cope with the
 792 fold being a sequence of multiple code points.  If I<full> is just one code
 793 point, then I<simple> equals I<full>.  If there is no single code point folding
 794 defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
 795 inferior, but still better-than-nothing alternative folding to I<full>.
 796
 797 =item B<mapping>
 798
 799 is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
 800 otherwise.  It can be considered to be the simplest possible folding for
 801 I<code>.  It is defined primarily for backwards compatibility.
 802
 803 =item B<status>
 804
 805 is C<C> (for C<common>) if the best possible fold is a single code point
 806 (I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
 807 folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
 808 there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).  Note
 809 that this
 810 describes the contents of I<mapping>.  It is defined primarily for backwards
 811 compatibility.
 812
 813 On versions 3.1 and earlier of Unicode, I<status> can also be
 814 C<I> which is the same as C<C> but is a special case for dotted uppercase I and
 815 dotless lowercase i:
 816
 817 =over
 818
 819 =item B<*>
 820
 821 If you use this C<I> mapping, the result is case-insensitive,
 822 but dotless and dotted I's are not distinguished
 823
 824 =item B<*>
 825
 826 If you exclude this C<I> mapping, the result is not fully case-insensitive, but
 827 dotless and dotted I's are distinguished
 828
 829 =back
 830
 831 =item B<turkic>
 832
 833 contains any special folding for Turkic languages.  For versions of Unicode
 834 starting with 3.2, this field is empty unless I<code> has a different folding
 835 in Turkic languages, in which case it is one or more codes (separated by
 836 spaces) that taken in order give the code points for the case folding for
 837 I<code> in those languages.
 838 Each code has at least four hexdigits.
 839 Note that this folding does not maintain canonical equivalence without
 840 additional processing.
 841
 842 For versions of Unicode 3.1 and earlier, this field is empty unless there is a
 843 special folding for Turkic languages, in which case I<status> is C<I>, and
 844 I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
 845
 846 =back
 847
 848 Programs that want complete generality and the best folding results should use
 849 the folding contained in the I<full> field.  But note that the fold for some
 850 code points will be a sequence of multiple code points.
 851
 852 Programs that can't cope with the fold mapping being multiple code points can
 853 use the folding contained in the I<simple> field, with the loss of some
 854 generality.  In Unicode 5.1, about 7% of the defined foldings have no single
 855 code point folding.
 856
 857 The I<mapping> and I<status> fields are provided for backwards compatibility for
 858 existing programs.  They contain the same values as in previous versions of
 859 this function.
 860
 861 Locale is not completely independent.  The I<turkic> field contains results to
 862 use when the locale is a Turkic language.
 863
 864 For more information about case mappings see
 865 L<http://www.unicode.org/unicode/reports/tr21>
 866
 867 =cut
 868
 869 my %CASEFOLD;
 870
 871 sub _casefold {
 872     unless (%CASEFOLD) {
 873         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 874             local $_;
 875             while (<$CASEFOLDFH>) {
 876                 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 877                     my $code = hex($1);
 878                     $CASEFOLD{$code}{'code'} = $1;
 879                     $CASEFOLD{$code}{'turkic'} = "" unless
 880                                             defined $CASEFOLD{$code}{'turkic'};
 881                     if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
 882                                                         # earlier Unicodes
 883                                                         # Both entries there (I
 884                                                         # only checked 3.1) are
 885                                                         # the same as C, and
 886                                                         # there are no other
 887                                                         # entries for those
 888                                                         # codepoints, so treat
 889                                                         # as if C, but override
 890                                                         # the turkic one for
 891                                                         # 'I'.
 892                         $CASEFOLD{$code}{'status'} = $2;
 893                         $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
 894                         $CASEFOLD{$code}{'mapping'} = $3;
 895                         $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
 896                     } elsif ($2 eq 'F') {
 897                         $CASEFOLD{$code}{'full'} = $3;
 898                         unless (defined $CASEFOLD{$code}{'simple'}) {
 899                                 $CASEFOLD{$code}{'simple'} = "";
 900                                 $CASEFOLD{$code}{'mapping'} = $3;
 901                                 $CASEFOLD{$code}{'status'} = $2;
 902                         }
 903                     } elsif ($2 eq 'S') {
 904
 905
 906                         # There can't be a simple without a full, and simple
 907                         # overrides all but full
 908
 909                         $CASEFOLD{$code}{'simple'} = $3;
 910                         $CASEFOLD{$code}{'mapping'} = $3;
 911                         $CASEFOLD{$code}{'status'} = $2;
 912                     } elsif ($2 eq 'T') {
 913                         $CASEFOLD{$code}{'turkic'} = $3;
 914                     } # else can't happen because only [CIFST] are possible
 915                 }
 916             }
 917             close($CASEFOLDFH);
 918         }
 919     }
 920 }
 921
 922 sub casefold {
 923     my $arg  = shift;
 924     my $code = _getcode($arg);
 925     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 926         unless defined $code;
 927
 928     _casefold() unless %CASEFOLD;
 929
 930     return $CASEFOLD{$code};
 931 }
 932
 933 =head2 B<casespec()>
 934
 935     use Unicode::UCD 'casespec';
 936
 937     my $casespec = casespec(0xFB00);
 938
 939 This returns the potentially locale-dependent case mappings of the L</code point
 940 argument>.  The mappings may be longer than a single code point (which the basic
 941 Unicode case mappings as returned by L</charinfo()> never are).
 942
 943 If there are no case mappings for the L</code point argument>, or if all three
 944 possible mappings (I<lower>, I<title> and I<upper>) result in single code
 945 points and are locale independent and unconditional, B<undef> is returned.
 946
 947 Otherwise, a reference to a hash giving the mappings (or a reference to a hash
 948 of such hashes, explained below) is returned.
 949
 950 The keys in the bottom layer hash with the meanings of their values are:
 951
 952 =over
 953
 954 =item B<code>
 955
 956 the input L</code point argument> expressed in hexadecimal, with leading zeros
 957 added if necessary to make it contain at least four hexdigits
 958
 959 =item B<lower>
 960
 961 one or more codes (separated by spaces) that taken in order give the
 962 code points for the lower case of I<code>.
 963 Each has at least four hexdigits.
 964
 965 =item B<title>
 966
 967 one or more codes (separated by spaces) that taken in order give the
 968 code points for the title case of I<code>.
 969 Each has at least four hexdigits.
 970
 971 =item B<lower>
 972
 973 one or more codes (separated by spaces) that taken in order give the
 974 code points for the upper case of I<code>.
 975 Each has at least four hexdigits.
 976
 977 =item B<condition>
 978
 979 the conditions for the mappings to be valid.
 980 If B<undef>, the mappings are always valid.
 981 When defined, this field is a list of conditions,
 982 all of which must be true for the mappings to be valid.
 983 The list consists of one or more
 984 I<locales> (see below)
 985 and/or I<contexts> (explained in the next paragraph),
 986 separated by spaces.
 987 (Other than as used to separate elements, spaces are to be ignored.)
 988 Case distinctions in the condition list are not significant.
 989 Conditions preceded by "NON_" represent the negation of the condition.
 990
 991 A I<context> is one of those defined in the Unicode standard.
 992 For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
 993 available at
 994 L<http://www.unicode.org/versions/Unicode5.1.0/>
 995
 996 =back
 997
 998 If the return value is to a hash of hashes, it is because there are multiple
 999 case mapping definitions for a single code point
1000 (because of different rules for different locales).
1001 Each sub-hash is of the form above, and the keys of the outer hash are
1002 the locales, which are
1003 defined as 2-letter ISO 3166 country codes, possibly
1004 followed by a "_" and a 2-letter ISO language code (possibly followed
1005 by a "_" and a variant code).  You can find the lists of all possible locales,
1006 see L<Locale::Country> and L<Locale::Language>.
1007 (In Unicode 5.1, the only locales returned by this function
1008 are C<lt>, C<tr>, and C<az>.)
1009
1010 For more information about case mappings see
1011 L<http://www.unicode.org/unicode/reports/tr21/>
1012
1013 =cut
1014
1015 my %CASESPEC;
1016
1017 sub _casespec {
1018     unless (%CASESPEC) {
1019         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
1020             local $_;
1021             while (<$CASESPECFH>) {
1022                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
1023                     my ($hexcode, $lower, $title, $upper, $condition) =
1024                         ($1, $2, $3, $4, $5);
1025                     my $code = hex($hexcode);
1026                     if (exists $CASESPEC{$code}) {
1027                         if (exists $CASESPEC{$code}->{code}) {
1028                             my ($oldlower,
1029                                 $oldtitle,
1030                                 $oldupper,
1031                                 $oldcondition) =
1032                                     @{$CASESPEC{$code}}{qw(lower
1033                                                            title
1034                                                            upper
1035                                                            condition)};
1036                             if (defined $oldcondition) {
1037                                 my ($oldlocale) =
1038                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
1039                                 delete $CASESPEC{$code};
1040                                 $CASESPEC{$code}->{$oldlocale} =
1041                                 { code      => $hexcode,
1042                                   lower     => $oldlower,
1043                                   title     => $oldtitle,
1044                                   upper     => $oldupper,
1045                                   condition => $oldcondition };
1046                             }
1047                         }
1048                         my ($locale) =
1049                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1050                         $CASESPEC{$code}->{$locale} =
1051                         { code      => $hexcode,
1052                           lower     => $lower,
1053                           title     => $title,
1054                           upper     => $upper,
1055                           condition => $condition };
1056                     } else {
1057                         $CASESPEC{$code} =
1058                         { code      => $hexcode,
1059                           lower     => $lower,
1060                           title     => $title,
1061                           upper     => $upper,
1062                           condition => $condition };
1063                     }
1064                 }
1065             }
1066             close($CASESPECFH);
1067         }
1068     }
1069 }
1070
1071 sub casespec {
1072     my $arg  = shift;
1073     my $code = _getcode($arg);
1074     croak __PACKAGE__, "::casespec: unknown code '$arg'"
1075         unless defined $code;
1076
1077     _casespec() unless %CASESPEC;
1078
1079     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
1080 }
1081
1082 =head2 B<namedseq()>
1083
1084     use Unicode::UCD 'namedseq';
1085
1086     my $namedseq = namedseq("KATAKANA LETTER AINU P");
1087     my @namedseq = namedseq("KATAKANA LETTER AINU P");
1088     my %namedseq = namedseq();
1089
1090 If used with a single argument in a scalar context, returns the string
1091 consisting of the code points of the named sequence, or B<undef> if no
1092 named sequence by that name exists.  If used with a single argument in
1093 a list context, it returns the list of the code points.  If used with no
1094 arguments in a list context, returns a hash with the names of the
1095 named sequences as the keys and the named sequences as strings as
1096 the values.  Otherwise, it returns B<undef> or an empty list depending
1097 on the context.
1098
1099 This function only operates on officially approved (not provisional) named
1100 sequences.
1101
1102 =cut
1103
1104 my %NAMEDSEQ;
1105
1106 sub _namedseq {
1107     unless (%NAMEDSEQ) {
1108         if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
1109             local $_;
1110             while (<$NAMEDSEQFH>) {
1111                 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
1112                     my ($n, $s) = ($1, $2);
1113                     my @s = map { chr(hex($_)) } split(' ', $s);
1114                     $NAMEDSEQ{$n} = join("", @s);
1115                 }
1116             }
1117             close($NAMEDSEQFH);
1118         }
1119     }
1120 }
1121
1122 sub namedseq {
1123     _namedseq() unless %NAMEDSEQ;
1124     my $wantarray = wantarray();
1125     if (defined $wantarray) {
1126         if ($wantarray) {
1127             if (@_ == 0) {
1128                 return %NAMEDSEQ;
1129             } elsif (@_ == 1) {
1130                 my $s = $NAMEDSEQ{ $_[0] };
1131                 return defined $s ? map { ord($_) } split('', $s) : ();
1132             }
1133         } elsif (@_ == 1) {
1134             return $NAMEDSEQ{ $_[0] };
1135         }
1136     }
1137     return;
1138 }
1139
1140 =head2 Unicode::UCD::UnicodeVersion
1141
1142 This returns the version of the Unicode Character Database, in other words, the
1143 version of the Unicode standard the database implements.  The version is a
1144 string of numbers delimited by dots (C<'.'>).
1145
1146 =cut
1147
1148 my $UNICODEVERSION;
1149
1150 sub UnicodeVersion {
1151     unless (defined $UNICODEVERSION) {
1152         openunicode(\$VERSIONFH, "version");
1153         chomp($UNICODEVERSION = <$VERSIONFH>);
1154         close($VERSIONFH);
1155         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1156             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1157     }
1158     return $UNICODEVERSION;
1159 }
1160
1161 =head2 B<Blocks versus Scripts>
1162
1163 The difference between a block and a script is that scripts are closer
1164 to the linguistic notion of a set of code points required to present
1165 languages, while block is more of an artifact of the Unicode code point
1166 numbering and separation into blocks of (mostly) 256 code points.
1167
1168 For example the Latin B<script> is spread over several B<blocks>, such
1169 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1170 C<Latin Extended-B>.  On the other hand, the Latin script does not
1171 contain all the characters of the C<Basic Latin> block (also known as
1172 ASCII): it includes only the letters, and not, for example, the digits
1173 or the punctuation.
1174
1175 For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1176
1177 For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1178
1179 =head2 B<Matching Scripts and Blocks>
1180
1181 Scripts are matched with the regular-expression construct
1182 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1183 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1184 any of the 256 code points in the Tibetan block).
1185
1186
1187 =head2 Implementation Note
1188
1189 The first use of charinfo() opens a read-only filehandle to the Unicode
1190 Character Database (the database is included in the Perl distribution).
1191 The filehandle is then kept open for further queries.  In other words,
1192 if you are wondering where one of your filehandles went, that's where.
1193
1194 =head1 BUGS
1195
1196 Does not yet support EBCDIC platforms.
1197
1198 L</compexcl()> should give a complete list of excluded code points.
1199
1200 =head1 AUTHOR
1201
1202 Jarkko Hietaniemi
1203
1204 =cut
1205
1206 1;