lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.27';
   7
   8 use Storable qw(dclone);
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     general_categories bidi_types
  19                     compexcl
  20                     casefold casespec
  21                     namedseq);
  22
  23 use Carp;
  24
  25 =head1 NAME
  26
  27 Unicode::UCD - Unicode character database
  28
  29 =head1 SYNOPSIS
  30
  31     use Unicode::UCD 'charinfo';
  32     my $charinfo   = charinfo($codepoint);
  33
  34     use Unicode::UCD 'casefold';
  35     my $casefold = casefold(0xFB00);
  36
  37     use Unicode::UCD 'casespec';
  38     my $casespec = casespec(0xFB00);
  39
  40     use Unicode::UCD 'charblock';
  41     my $charblock  = charblock($codepoint);
  42
  43     use Unicode::UCD 'charscript';
  44     my $charscript = charscript($codepoint);
  45
  46     use Unicode::UCD 'charblocks';
  47     my $charblocks = charblocks();
  48
  49     use Unicode::UCD 'charscripts';
  50     my $charscripts = charscripts();
  51
  52     use Unicode::UCD qw(charscript charinrange);
  53     my $range = charscript($script);
  54     print "looks like $script\n" if charinrange($range, $codepoint);
  55
  56     use Unicode::UCD qw(general_categories bidi_types);
  57     my $categories = general_categories();
  58     my $types = bidi_types();
  59
  60     use Unicode::UCD 'compexcl';
  61     my $compexcl = compexcl($codepoint);
  62
  63     use Unicode::UCD 'namedseq';
  64     my $namedseq = namedseq($named_sequence_name);
  65
  66     my $unicode_version = Unicode::UCD::UnicodeVersion();
  67
  68 =head1 DESCRIPTION
  69
  70 The Unicode::UCD module offers a series of functions that
  71 provide a simple interface to the Unicode
  72 Character Database.
  73
  74 =head2 code point argument
  75
  76 Some of the functions are called with a I<code point argument>, which is either
  77 a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
  78 followed by hexadecimals designating a Unicode code point.  In other words, if
  79 you want a code point to be interpreted as a hexadecimal number, you must
  80 prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
  81 interpreted as a decimal code point.  Also note that Unicode is B<not> limited
  82 to 16 bits (the number of Unicode code points is open-ended, in theory
  83 unlimited): you may have more than 4 hexdigits.
  84 =cut
  85
  86 my $UNICODEFH;
  87 my $BLOCKSFH;
  88 my $SCRIPTSFH;
  89 my $VERSIONFH;
  90 my $COMPEXCLFH;
  91 my $CASEFOLDFH;
  92 my $CASESPECFH;
  93 my $NAMEDSEQFH;
  94
  95 sub openunicode {
  96     my ($rfh, @path) = @_;
  97     my $f;
  98     unless (defined $$rfh) {
  99         for my $d (@INC) {
 100             use File::Spec;
 101             $f = File::Spec->catfile($d, "unicore", @path);
 102             last if open($$rfh, $f);
 103             undef $f;
 104         }
 105         croak __PACKAGE__, ": failed to find ",
 106               File::Spec->catfile(@path), " in @INC"
 107             unless defined $f;
 108     }
 109     return $f;
 110 }
 111
 112 =head2 B<charinfo()>
 113
 114     use Unicode::UCD 'charinfo';
 115
 116     my $charinfo = charinfo(0x41);
 117
 118 This returns information about the input L</code point argument>
 119 as a reference to a hash of fields as defined by the Unicode
 120 standard.  If the L</code point argument> is not assigned in the standard
 121 (i.e., has the general category C<Cn> meaning C<Unassigned>)
 122 or is a non-character (meaning it is guaranteed to never be assigned in
 123 the standard),
 124 B<undef> is returned.
 125
 126 Fields that aren't applicable to the particular code point argument exist in the
 127 returned hash, and are empty.
 128
 129 The keys in the hash with the meanings of their values are:
 130
 131 =over
 132
 133 =item B<code>
 134
 135 the input L</code point argument> expressed in hexadecimal, with leading zeros
 136 added if necessary to make it contain at least four hexdigits
 137
 138 =item B<name>
 139
 140 name of I<code>, all IN UPPER CASE.
 141 Some control-type code points do not have names.
 142 This field will be empty for C<Surrogate> and C<Private Use> code points,
 143 and for the others without a name,
 144 it will contain a description enclosed in angle brackets, like
 145 C<E<lt>controlE<gt>>.
 146
 147
 148 =item B<category>
 149
 150 The short name of the general category of I<code>.
 151 This will match one of the keys in the hash returned by L</general_categories()>.
 152
 153 =item B<combining>
 154
 155 the combining class number for I<code> used in the Canonical Ordering Algorithm.
 156 For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
 157 available at
 158 L<http://www.unicode.org/versions/Unicode5.1.0/>
 159
 160 =item B<bidi>
 161
 162 bidirectional type of I<code>.
 163 This will match one of the keys in the hash returned by L</bidi_types()>.
 164
 165 =item B<decomposition>
 166
 167 is empty if I<code> has no decomposition; or is one or more codes
 168 (separated by spaces) that taken in order represent a decomposition for
 169 I<code>.  Each has at least four hexdigits.
 170 The codes may be preceded by a word enclosed in angle brackets then a space,
 171 like C<E<lt>compatE<gt> >, giving the type of decomposition
 172
 173 =item B<decimal>
 174
 175 if I<code> is a decimal digit this is its integer numeric value
 176
 177 =item B<digit>
 178
 179 if I<code> represents a whole number, this is its integer numeric value
 180
 181 =item B<numeric>
 182
 183 if I<code> represents a whole or rational number, this is its numeric value.
 184 Rational values are expressed as a string like C<1/4>.
 185
 186 =item B<mirrored>
 187
 188 C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
 189
 190 =item B<unicode10>
 191
 192 name of I<code> in the Unicode 1.0 standard if one
 193 existed for this code point and is different from the current name
 194
 195 =item B<comment>
 196
 197 ISO 10646 comment field.
 198 It appears in parentheses in the ISO 10646 names list,
 199 or contains an asterisk to indicate there is
 200 a note for this code point in Annex P of that standard.
 201
 202 =item B<upper>
 203
 204 is empty if there is no single code point uppercase mapping for I<code>;
 205 otherwise it is that mapping expressed as at least four hexdigits.
 206 (L</casespec()> should be used in addition to B<charinfo()>
 207 for case mappings when the calling program can cope with multiple code point
 208 mappings.)
 209
 210 =item B<lower>
 211
 212 is empty if there is no single code point lowercase mapping for I<code>;
 213 otherwise it is that mapping expressed as at least four hexdigits.
 214 (L</casespec()> should be used in addition to B<charinfo()>
 215 for case mappings when the calling program can cope with multiple code point
 216 mappings.)
 217
 218 =item B<title>
 219
 220 is empty if there is no single code point titlecase mapping for I<code>;
 221 otherwise it is that mapping expressed as at least four hexdigits.
 222 (L</casespec()> should be used in addition to B<charinfo()>
 223 for case mappings when the calling program can cope with multiple code point
 224 mappings.)
 225
 226 =item B<block>
 227
 228 block I<code> belongs to (used in \p{In...}).
 229 See L</Blocks versus Scripts>.
 230
 231
 232 =item B<script>
 233
 234 script I<code> belongs to.
 235 See L</Blocks versus Scripts>.
 236
 237 =back
 238
 239 Note that you cannot do (de)composition and casing based solely on the
 240 I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
 241 you will need also the L</compexcl()>, and L</casespec()> functions.
 242
 243 =cut
 244
 245 # NB: This function is duplicated in charnames.pm
 246 sub _getcode {
 247     my $arg = shift;
 248
 249     if ($arg =~ /^[1-9]\d*$/) {
 250         return $arg;
 251     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 252         return hex($1);
 253     }
 254
 255     return;
 256 }
 257
 258 # Lingua::KO::Hangul::Util not part of the standard distribution
 259 # but it will be used if available.
 260
 261 eval { require Lingua::KO::Hangul::Util };
 262 my $hasHangulUtil = ! $@;
 263 if ($hasHangulUtil) {
 264     Lingua::KO::Hangul::Util->import();
 265 }
 266
 267 sub hangul_decomp { # internal: called from charinfo
 268     if ($hasHangulUtil) {
 269         my @tmp = decomposeHangul(shift);
 270         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 271         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 272     }
 273     return;
 274 }
 275
 276 sub hangul_charname { # internal: called from charinfo
 277     return sprintf("HANGUL SYLLABLE-%04X", shift);
 278 }
 279
 280 sub han_charname { # internal: called from charinfo
 281     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 282 }
 283
 284 # Overwritten by data in file
 285 my %first_last = (
 286    'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
 287    'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
 288    'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
 289 );
 290
 291 get_charinfo_ranges();
 292
 293 sub get_charinfo_ranges {
 294    my @blocks = keys %first_last;
 295
 296    my $fh;
 297    openunicode( \$fh, 'UnicodeData.txt' );
 298    if( defined $fh ){
 299       while( my $line = <$fh> ){
 300          next unless $line =~ /(?:First|Last)/;
 301          if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
 302             my ($number,$block,$type);
 303             ($number,$block) = split /;/, $line;
 304             $block =~ s/<|>//g;
 305             ($block,$type) = split /, /, $block;
 306             my $index = $type eq 'First' ? 0 : 1;
 307             $first_last{ $block }->[$index] = hex $number;
 308          }
 309       }
 310    }
 311 }
 312
 313 my @CharinfoRanges = (
 314 # block name
 315 # [ first, last, coderef to name, coderef to decompose ],
 316 # CJK Ideographs Extension A
 317   [ @{ $first_last{'CJK Ideograph Extension A'} },        \&han_charname,   undef  ],
 318 # CJK Ideographs
 319   [ @{ $first_last{'CJK Ideograph'} },                    \&han_charname,   undef  ],
 320 # Hangul Syllables
 321   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 322 # Non-Private Use High Surrogates
 323   [ 0xD800,   0xDB7F,   undef,   undef  ],
 324 # Private Use High Surrogates
 325   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 326 # Low Surrogates
 327   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 328 # The Private Use Area
 329   [ 0xE000,   0xF8FF,   undef,   undef  ],
 330 # CJK Ideographs Extension B
 331   [ @{ $first_last{'CJK Ideograph Extension B'} },        \&han_charname,   undef  ],
 332 # Plane 15 Private Use Area
 333   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 334 # Plane 16 Private Use Area
 335   [ 0x100000, 0x10FFFD, undef,   undef  ],
 336 );
 337
 338 sub charinfo {
 339     my $arg  = shift;
 340     my $code = _getcode($arg);
 341     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 342         unless defined $code;
 343     my $hexk = sprintf("%06X", $code);
 344     my($rcode,$rname,$rdec);
 345     foreach my $range (@CharinfoRanges){
 346       if ($range->[0] <= $code && $code <= $range->[1]) {
 347         $rcode = $hexk;
 348         $rcode =~ s/^0+//;
 349         $rcode =  sprintf("%04X", hex($rcode));
 350         $rname = $range->[2] ? $range->[2]->($code) : '';
 351         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 352         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 353         last;
 354       }
 355     }
 356     openunicode(\$UNICODEFH, "UnicodeData.txt");
 357     if (defined $UNICODEFH) {
 358         use Search::Dict 1.02;
 359         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 360             my $line = <$UNICODEFH>;
 361             return unless defined $line;
 362             chomp $line;
 363             my %prop;
 364             @prop{qw(
 365                      code name category
 366                      combining bidi decomposition
 367                      decimal digit numeric
 368                      mirrored unicode10 comment
 369                      upper lower title
 370                     )} = split(/;/, $line, -1);
 371             $hexk =~ s/^0+//;
 372             $hexk =  sprintf("%04X", hex($hexk));
 373             if ($prop{code} eq $hexk) {
 374                 $prop{block}  = charblock($code);
 375                 $prop{script} = charscript($code);
 376                 if(defined $rname){
 377                     $prop{code} = $rcode;
 378                     $prop{name} = $rname;
 379                     $prop{decomposition} = $rdec;
 380                 }
 381                 return \%prop;
 382             }
 383         }
 384     }
 385     return;
 386 }
 387
 388 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 389     my ($table, $lo, $hi, $code) = @_;
 390
 391     return if $lo > $hi;
 392
 393     my $mid = int(($lo+$hi) / 2);
 394
 395     if ($table->[$mid]->[0] < $code) {
 396         if ($table->[$mid]->[1] >= $code) {
 397             return $table->[$mid]->[2];
 398         } else {
 399             _search($table, $mid + 1, $hi, $code);
 400         }
 401     } elsif ($table->[$mid]->[0] > $code) {
 402         _search($table, $lo, $mid - 1, $code);
 403     } else {
 404         return $table->[$mid]->[2];
 405     }
 406 }
 407
 408 sub charinrange {
 409     my ($range, $arg) = @_;
 410     my $code = _getcode($arg);
 411     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 412         unless defined $code;
 413     _search($range, 0, $#$range, $code);
 414 }
 415
 416 =head2 B<charblock()>
 417
 418     use Unicode::UCD 'charblock';
 419
 420     my $charblock = charblock(0x41);
 421     my $charblock = charblock(1234);
 422     my $charblock = charblock(0x263a);
 423     my $charblock = charblock("U+263a");
 424
 425     my $range     = charblock('Armenian');
 426
 427 With a L</code point argument> charblock() returns the I<block> the code point
 428 belongs to, e.g.  C<Basic Latin>.
 429 If the code point is unassigned, this returns the block it would belong to if
 430 it were assigned (which it may in future versions of the Unicode Standard).
 431
 432 See also L</Blocks versus Scripts>.
 433
 434 If supplied with an argument that can't be a code point, charblock() tries
 435 to do the opposite and interpret the argument as a code point block. The
 436 return value is a I<range>: an anonymous list of lists that contain
 437 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 438 a code point is in a range using the L</charinrange()> function. If the
 439 argument is not a known code point block, B<undef> is returned.
 440
 441 =cut
 442
 443 my @BLOCKS;
 444 my %BLOCKS;
 445
 446 sub _charblocks {
 447     unless (@BLOCKS) {
 448         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 449             local $_;
 450             while (<$BLOCKSFH>) {
 451                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 452                     my ($lo, $hi) = (hex($1), hex($2));
 453                     my $subrange = [ $lo, $hi, $3 ];
 454                     push @BLOCKS, $subrange;
 455                     push @{$BLOCKS{$3}}, $subrange;
 456                 }
 457             }
 458             close($BLOCKSFH);
 459         }
 460     }
 461 }
 462
 463 sub charblock {
 464     my $arg = shift;
 465
 466     _charblocks() unless @BLOCKS;
 467
 468     my $code = _getcode($arg);
 469
 470     if (defined $code) {
 471         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 472     } else {
 473         if (exists $BLOCKS{$arg}) {
 474             return dclone $BLOCKS{$arg};
 475         } else {
 476             return;
 477         }
 478     }
 479 }
 480
 481 =head2 B<charscript()>
 482
 483     use Unicode::UCD 'charscript';
 484
 485     my $charscript = charscript(0x41);
 486     my $charscript = charscript(1234);
 487     my $charscript = charscript("U+263a");
 488
 489     my $range      = charscript('Thai');
 490
 491 With a L</code point argument> charscript() returns the I<script> the
 492 code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 493 If the code point is unassigned, it returns B<undef>
 494
 495 If supplied with an argument that can't be a code point, charscript() tries
 496 to do the opposite and interpret the argument as a code point script. The
 497 return value is a I<range>: an anonymous list of lists that contain
 498 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 499 code point is in a range using the L</charinrange()> function. If the
 500 argument is not a known code point script, B<undef> is returned.
 501
 502 See also L</Blocks versus Scripts>.
 503
 504 =cut
 505
 506 my @SCRIPTS;
 507 my %SCRIPTS;
 508
 509 sub _charscripts {
 510     unless (@SCRIPTS) {
 511         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 512             local $_;
 513             while (<$SCRIPTSFH>) {
 514                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 515                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 516                     my $script = lc($3);
 517                     $script =~ s/\b(\w)/uc($1)/ge;
 518                     my $subrange = [ $lo, $hi, $script ];
 519                     push @SCRIPTS, $subrange;
 520                     push @{$SCRIPTS{$script}}, $subrange;
 521                 }
 522             }
 523             close($SCRIPTSFH);
 524             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 525         }
 526     }
 527 }
 528
 529 sub charscript {
 530     my $arg = shift;
 531
 532     _charscripts() unless @SCRIPTS;
 533
 534     my $code = _getcode($arg);
 535
 536     if (defined $code) {
 537         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 538     } else {
 539         if (exists $SCRIPTS{$arg}) {
 540             return dclone $SCRIPTS{$arg};
 541         } else {
 542             return;
 543         }
 544     }
 545 }
 546
 547 =head2 B<charblocks()>
 548
 549     use Unicode::UCD 'charblocks';
 550
 551     my $charblocks = charblocks();
 552
 553 charblocks() returns a reference to a hash with the known block names
 554 as the keys, and the code point ranges (see L</charblock()>) as the values.
 555
 556 See also L</Blocks versus Scripts>.
 557
 558 =cut
 559
 560 sub charblocks {
 561     _charblocks() unless %BLOCKS;
 562     return dclone \%BLOCKS;
 563 }
 564
 565 =head2 B<charscripts()>
 566
 567     use Unicode::UCD 'charscripts';
 568
 569     my $charscripts = charscripts();
 570
 571 charscripts() returns a reference to a hash with the known script
 572 names as the keys, and the code point ranges (see L</charscript()>) as
 573 the values.
 574
 575 See also L</Blocks versus Scripts>.
 576
 577 =cut
 578
 579 sub charscripts {
 580     _charscripts() unless %SCRIPTS;
 581     return dclone \%SCRIPTS;
 582 }
 583
 584 =head2 B<charinrange()>
 585
 586 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 587 can also test whether a code point is in the I<range> as returned by
 588 L</charblock()> and L</charscript()> or as the values of the hash returned
 589 by L</charblocks()> and L</charscripts()> by using charinrange():
 590
 591     use Unicode::UCD qw(charscript charinrange);
 592
 593     $range = charscript('Hiragana');
 594     print "looks like hiragana\n" if charinrange($range, $codepoint);
 595
 596 =cut
 597
 598 my %GENERAL_CATEGORIES =
 599  (
 600     'L'  =>         'Letter',
 601     'LC' =>         'CasedLetter',
 602     'Lu' =>         'UppercaseLetter',
 603     'Ll' =>         'LowercaseLetter',
 604     'Lt' =>         'TitlecaseLetter',
 605     'Lm' =>         'ModifierLetter',
 606     'Lo' =>         'OtherLetter',
 607     'M'  =>         'Mark',
 608     'Mn' =>         'NonspacingMark',
 609     'Mc' =>         'SpacingMark',
 610     'Me' =>         'EnclosingMark',
 611     'N'  =>         'Number',
 612     'Nd' =>         'DecimalNumber',
 613     'Nl' =>         'LetterNumber',
 614     'No' =>         'OtherNumber',
 615     'P'  =>         'Punctuation',
 616     'Pc' =>         'ConnectorPunctuation',
 617     'Pd' =>         'DashPunctuation',
 618     'Ps' =>         'OpenPunctuation',
 619     'Pe' =>         'ClosePunctuation',
 620     'Pi' =>         'InitialPunctuation',
 621     'Pf' =>         'FinalPunctuation',
 622     'Po' =>         'OtherPunctuation',
 623     'S'  =>         'Symbol',
 624     'Sm' =>         'MathSymbol',
 625     'Sc' =>         'CurrencySymbol',
 626     'Sk' =>         'ModifierSymbol',
 627     'So' =>         'OtherSymbol',
 628     'Z'  =>         'Separator',
 629     'Zs' =>         'SpaceSeparator',
 630     'Zl' =>         'LineSeparator',
 631     'Zp' =>         'ParagraphSeparator',
 632     'C'  =>         'Other',
 633     'Cc' =>         'Control',
 634     'Cf' =>         'Format',
 635     'Cs' =>         'Surrogate',
 636     'Co' =>         'PrivateUse',
 637     'Cn' =>         'Unassigned',
 638  );
 639
 640 sub general_categories {
 641     return dclone \%GENERAL_CATEGORIES;
 642 }
 643
 644 =head2 B<general_categories()>
 645
 646     use Unicode::UCD 'general_categories';
 647
 648     my $categories = general_categories();
 649
 650 This returns a reference to a hash which has short
 651 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 652 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 653 C<Symbol>) as values.  The hash is reversible in case you need to go
 654 from the long names to the short names.  The general category is the
 655 one returned from
 656 L</charinfo()> under the C<category> key.
 657
 658 =cut
 659
 660 my %BIDI_TYPES =
 661  (
 662    'L'   => 'Left-to-Right',
 663    'LRE' => 'Left-to-Right Embedding',
 664    'LRO' => 'Left-to-Right Override',
 665    'R'   => 'Right-to-Left',
 666    'AL'  => 'Right-to-Left Arabic',
 667    'RLE' => 'Right-to-Left Embedding',
 668    'RLO' => 'Right-to-Left Override',
 669    'PDF' => 'Pop Directional Format',
 670    'EN'  => 'European Number',
 671    'ES'  => 'European Number Separator',
 672    'ET'  => 'European Number Terminator',
 673    'AN'  => 'Arabic Number',
 674    'CS'  => 'Common Number Separator',
 675    'NSM' => 'Non-Spacing Mark',
 676    'BN'  => 'Boundary Neutral',
 677    'B'   => 'Paragraph Separator',
 678    'S'   => 'Segment Separator',
 679    'WS'  => 'Whitespace',
 680    'ON'  => 'Other Neutrals',
 681  );
 682
 683 =head2 B<bidi_types()>
 684
 685     use Unicode::UCD 'bidi_types';
 686
 687     my $categories = bidi_types();
 688
 689 This returns a reference to a hash which has the short
 690 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 691 names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 692 hash is reversible in case you need to go from the long names to the
 693 short names.  The bidi type is the one returned from
 694 L</charinfo()>
 695 under the C<bidi> key.  For the exact meaning of the various bidi classes
 696 the Unicode TR9 is recommended reading:
 697 L<http://www.unicode.org/reports/tr9/>
 698 (as of Unicode 5.0.0)
 699
 700 =cut
 701
 702 sub bidi_types {
 703     return dclone \%BIDI_TYPES;
 704 }
 705
 706 =head2 B<compexcl()>
 707
 708     use Unicode::UCD 'compexcl';
 709
 710     my $compexcl = compexcl(0x09dc);
 711
 712 This returns B<true> if the
 713 L</code point argument> should not be produced by composition normalization,
 714 B<AND> if that fact is not otherwise determinable from the Unicode data base.
 715 It currently does not return B<true> if the code point has a decomposition
 716 consisting of another single code point, nor if its decomposition starts
 717 with a code point whose combining class is non-zero.  Code points that meet
 718 either of these conditions should also not be produced by composition
 719 normalization.
 720
 721 It returns B<false> otherwise.
 722
 723 =cut
 724
 725 my %COMPEXCL;
 726
 727 sub _compexcl {
 728     unless (%COMPEXCL) {
 729         if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 730             local $_;
 731             while (<$COMPEXCLFH>) {
 732                 if (/^([0-9A-F]+)\s+\#\s+/) {
 733                     my $code = hex($1);
 734                     $COMPEXCL{$code} = undef;
 735                 }
 736             }
 737             close($COMPEXCLFH);
 738         }
 739     }
 740 }
 741
 742 sub compexcl {
 743     my $arg  = shift;
 744     my $code = _getcode($arg);
 745     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 746         unless defined $code;
 747
 748     _compexcl() unless %COMPEXCL;
 749
 750     return exists $COMPEXCL{$code};
 751 }
 752
 753 =head2 B<casefold()>
 754
 755     use Unicode::UCD 'casefold';
 756
 757     my $casefold = casefold(0xDF);
 758     if (defined $casefold) {
 759         my @full_fold_hex = split / /, $casefold->{'full'};
 760         my $full_fold_string =
 761                     join "", map {chr(hex($_))} @full_fold_hex;
 762         my @turkic_fold_hex =
 763                         split / /, ($casefold->{'turkic'} ne "")
 764                                         ? $casefold->{'turkic'}
 765                                         : $casefold->{'full'};
 766         my $turkic_fold_string =
 767                         join "", map {chr(hex($_))} @turkic_fold_hex;
 768     }
 769     if (defined $casefold && $casefold->{'simple'} ne "") {
 770         my $simple_fold_hex = $casefold->{'simple'};
 771         my $simple_fold_string = chr(hex($simple_fold_hex));
 772     }
 773
 774 This returns the (almost) locale-independent case folding of the
 775 character specified by the L</code point argument>.
 776
 777 If there is no case folding for that code point, B<undef> is returned.
 778
 779 If there is a case folding for that code point, a reference to a hash
 780 with the following fields is returned:
 781
 782 =over
 783
 784 =item B<code>
 785
 786 the input L</code point argument> expressed in hexadecimal, with leading zeros
 787 added if necessary to make it contain at least four hexdigits
 788
 789 =item B<full>
 790
 791 one or more codes (separated by spaces) that taken in order give the
 792 code points for the case folding for I<code>.
 793 Each has at least four hexdigits.
 794
 795 =item B<simple>
 796
 797 is empty, or is exactly one code with at least four hexdigits which can be used
 798 as an alternative case folding when the calling program cannot cope with the
 799 fold being a sequence of multiple code points.  If I<full> is just one code
 800 point, then I<simple> equals I<full>.  If there is no single code point folding
 801 defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
 802 inferior, but still better-than-nothing alternative folding to I<full>.
 803
 804 =item B<mapping>
 805
 806 is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
 807 otherwise.  It can be considered to be the simplest possible folding for
 808 I<code>.  It is defined primarily for backwards compatibility.
 809
 810 =item B<status>
 811
 812 is C<C> (for C<common>) if the best possible fold is a single code point
 813 (I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
 814 folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
 815 there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).  Note
 816 that this
 817 describes the contents of I<mapping>.  It is defined primarily for backwards
 818 compatibility.
 819
 820 On versions 3.1 and earlier of Unicode, I<status> can also be
 821 C<I> which is the same as C<C> but is a special case for dotted uppercase I and
 822 dotless lowercase i:
 823
 824 =over
 825
 826 =item B<*>
 827
 828 If you use this C<I> mapping, the result is case-insensitive,
 829 but dotless and dotted I's are not distinguished
 830
 831 =item B<*>
 832
 833 If you exclude this C<I> mapping, the result is not fully case-insensitive, but
 834 dotless and dotted I's are distinguished
 835
 836 =back
 837
 838 =item B<turkic>
 839
 840 contains any special folding for Turkic languages.  For versions of Unicode
 841 starting with 3.2, this field is empty unless I<code> has a different folding
 842 in Turkic languages, in which case it is one or more codes (separated by
 843 spaces) that taken in order give the code points for the case folding for
 844 I<code> in those languages.
 845 Each code has at least four hexdigits.
 846 Note that this folding does not maintain canonical equivalence without
 847 additional processing.
 848
 849 For versions of Unicode 3.1 and earlier, this field is empty unless there is a
 850 special folding for Turkic languages, in which case I<status> is C<I>, and
 851 I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
 852
 853 =back
 854
 855 Programs that want complete generality and the best folding results should use
 856 the folding contained in the I<full> field.  But note that the fold for some
 857 code points will be a sequence of multiple code points.
 858
 859 Programs that can't cope with the fold mapping being multiple code points can
 860 use the folding contained in the I<simple> field, with the loss of some
 861 generality.  In Unicode 5.1, about 7% of the defined foldings have no single
 862 code point folding.
 863
 864 The I<mapping> and I<status> fields are provided for backwards compatibility for
 865 existing programs.  They contain the same values as in previous versions of
 866 this function.
 867
 868 Locale is not completely independent.  The I<turkic> field contains results to
 869 use when the locale is a Turkic language.
 870
 871 For more information about case mappings see
 872 L<http://www.unicode.org/unicode/reports/tr21>
 873
 874 =cut
 875
 876 my %CASEFOLD;
 877
 878 sub _casefold {
 879     unless (%CASEFOLD) {
 880         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 881             local $_;
 882             while (<$CASEFOLDFH>) {
 883                 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 884                     my $code = hex($1);
 885                     $CASEFOLD{$code}{'code'} = $1;
 886                     $CASEFOLD{$code}{'turkic'} = "" unless
 887                                             defined $CASEFOLD{$code}{'turkic'};
 888                     if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
 889                                                         # earlier Unicodes
 890                                                         # Both entries there (I
 891                                                         # only checked 3.1) are
 892                                                         # the same as C, and
 893                                                         # there are no other
 894                                                         # entries for those
 895                                                         # codepoints, so treat
 896                                                         # as if C, but override
 897                                                         # the turkic one for
 898                                                         # 'I'.
 899                         $CASEFOLD{$code}{'status'} = $2;
 900                         $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
 901                         $CASEFOLD{$code}{'mapping'} = $3;
 902                         $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
 903                     } elsif ($2 eq 'F') {
 904                         $CASEFOLD{$code}{'full'} = $3;
 905                         unless (defined $CASEFOLD{$code}{'simple'}) {
 906                                 $CASEFOLD{$code}{'simple'} = "";
 907                                 $CASEFOLD{$code}{'mapping'} = $3;
 908                                 $CASEFOLD{$code}{'status'} = $2;
 909                         }
 910                     } elsif ($2 eq 'S') {
 911
 912
 913                         # There can't be a simple without a full, and simple
 914                         # overrides all but full
 915
 916                         $CASEFOLD{$code}{'simple'} = $3;
 917                         $CASEFOLD{$code}{'mapping'} = $3;
 918                         $CASEFOLD{$code}{'status'} = $2;
 919                     } elsif ($2 eq 'T') {
 920                         $CASEFOLD{$code}{'turkic'} = $3;
 921                     } # else can't happen because only [CIFST] are possible
 922                 }
 923             }
 924             close($CASEFOLDFH);
 925         }
 926     }
 927 }
 928
 929 sub casefold {
 930     my $arg  = shift;
 931     my $code = _getcode($arg);
 932     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 933         unless defined $code;
 934
 935     _casefold() unless %CASEFOLD;
 936
 937     return $CASEFOLD{$code};
 938 }
 939
 940 =head2 B<casespec()>
 941
 942     use Unicode::UCD 'casespec';
 943
 944     my $casespec = casespec(0xFB00);
 945
 946 This returns the potentially locale-dependent case mappings of the L</code point
 947 argument>.  The mappings may be longer than a single code point (which the basic
 948 Unicode case mappings as returned by L</charinfo()> never are).
 949
 950 If there are no case mappings for the L</code point argument>, or if all three
 951 possible mappings (I<lower>, I<title> and I<upper>) result in single code
 952 points and are locale independent and unconditional, B<undef> is returned
 953 (which means that the case mappings, if any, for the code point are those
 954 returned by L</charinfo()>).
 955
 956 Otherwise, a reference to a hash giving the mappings (or a reference to a hash
 957 of such hashes, explained below) is returned with the following keys and their
 958 meanings:
 959
 960 The keys in the bottom layer hash with the meanings of their values are:
 961
 962 =over
 963
 964 =item B<code>
 965
 966 the input L</code point argument> expressed in hexadecimal, with leading zeros
 967 added if necessary to make it contain at least four hexdigits
 968
 969 =item B<lower>
 970
 971 one or more codes (separated by spaces) that taken in order give the
 972 code points for the lower case of I<code>.
 973 Each has at least four hexdigits.
 974
 975 =item B<title>
 976
 977 one or more codes (separated by spaces) that taken in order give the
 978 code points for the title case of I<code>.
 979 Each has at least four hexdigits.
 980
 981 =item B<lower>
 982
 983 one or more codes (separated by spaces) that taken in order give the
 984 code points for the upper case of I<code>.
 985 Each has at least four hexdigits.
 986
 987 =item B<condition>
 988
 989 the conditions for the mappings to be valid.
 990 If B<undef>, the mappings are always valid.
 991 When defined, this field is a list of conditions,
 992 all of which must be true for the mappings to be valid.
 993 The list consists of one or more
 994 I<locales> (see below)
 995 and/or I<contexts> (explained in the next paragraph),
 996 separated by spaces.
 997 (Other than as used to separate elements, spaces are to be ignored.)
 998 Case distinctions in the condition list are not significant.
 999 Conditions preceded by "NON_" represent the negation of the condition.
1000
1001 A I<context> is one of those defined in the Unicode standard.
1002 For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1003 available at
1004 L<http://www.unicode.org/versions/Unicode5.1.0/>.
1005 These are for context-sensitive casing.
1006
1007 =back
1008
1009 The hash described above is returned for locale-independent casing, where
1010 at least one of the mappings has length longer than one.  If B<undef> is
1011 returned, the code point may have mappings, but if so, all are length one,
1012 and are returned by L</charinfo()>.
1013 Note that when this function does return a value, it will be for the complete
1014 set of mappings for a code point, even those whose length is one.
1015
1016 If there are additional casing rules that apply only in certain locales,
1017 an additional key for each will be defined in the returned hash.  Each such key
1018 will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1019 followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1020 and a variant code).  You can find the lists of all possible locales, see
1021 L<Locale::Country> and L<Locale::Language>.
1022 (In Unicode 5.1, the only locales returned by this function
1023 are C<lt>, C<tr>, and C<az>.)
1024
1025 Each locale key is a reference to a hash that has the form above, and gives
1026 the casing rules for that particular locale, which take precedence over the
1027 locale-independent ones when in that locale.
1028
1029 If the only casing for a code point is locale-dependent, then the returned
1030 hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1031 will contain only locale keys.
1032
1033 For more information about case mappings see
1034 L<http://www.unicode.org/unicode/reports/tr21/>
1035
1036 =cut
1037
1038 my %CASESPEC;
1039
1040 sub _casespec {
1041     unless (%CASESPEC) {
1042         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
1043             local $_;
1044             while (<$CASESPECFH>) {
1045                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
1046                     my ($hexcode, $lower, $title, $upper, $condition) =
1047                         ($1, $2, $3, $4, $5);
1048                     my $code = hex($hexcode);
1049                     if (exists $CASESPEC{$code}) {
1050                         if (exists $CASESPEC{$code}->{code}) {
1051                             my ($oldlower,
1052                                 $oldtitle,
1053                                 $oldupper,
1054                                 $oldcondition) =
1055                                     @{$CASESPEC{$code}}{qw(lower
1056                                                            title
1057                                                            upper
1058                                                            condition)};
1059                             if (defined $oldcondition) {
1060                                 my ($oldlocale) =
1061                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
1062                                 delete $CASESPEC{$code};
1063                                 $CASESPEC{$code}->{$oldlocale} =
1064                                 { code      => $hexcode,
1065                                   lower     => $oldlower,
1066                                   title     => $oldtitle,
1067                                   upper     => $oldupper,
1068                                   condition => $oldcondition };
1069                             }
1070                         }
1071                         my ($locale) =
1072                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1073                         $CASESPEC{$code}->{$locale} =
1074                         { code      => $hexcode,
1075                           lower     => $lower,
1076                           title     => $title,
1077                           upper     => $upper,
1078                           condition => $condition };
1079                     } else {
1080                         $CASESPEC{$code} =
1081                         { code      => $hexcode,
1082                           lower     => $lower,
1083                           title     => $title,
1084                           upper     => $upper,
1085                           condition => $condition };
1086                     }
1087                 }
1088             }
1089             close($CASESPECFH);
1090         }
1091     }
1092 }
1093
1094 sub casespec {
1095     my $arg  = shift;
1096     my $code = _getcode($arg);
1097     croak __PACKAGE__, "::casespec: unknown code '$arg'"
1098         unless defined $code;
1099
1100     _casespec() unless %CASESPEC;
1101
1102     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
1103 }
1104
1105 =head2 B<namedseq()>
1106
1107     use Unicode::UCD 'namedseq';
1108
1109     my $namedseq = namedseq("KATAKANA LETTER AINU P");
1110     my @namedseq = namedseq("KATAKANA LETTER AINU P");
1111     my %namedseq = namedseq();
1112
1113 If used with a single argument in a scalar context, returns the string
1114 consisting of the code points of the named sequence, or B<undef> if no
1115 named sequence by that name exists.  If used with a single argument in
1116 a list context, it returns the list of the ordinals of the code points.  If used
1117 with no
1118 arguments in a list context, returns a hash with the names of the
1119 named sequences as the keys and the named sequences as strings as
1120 the values.  Otherwise, it returns B<undef> or an empty list depending
1121 on the context.
1122
1123 This function only operates on officially approved (not provisional) named
1124 sequences.
1125
1126 =cut
1127
1128 my %NAMEDSEQ;
1129
1130 sub _namedseq {
1131     unless (%NAMEDSEQ) {
1132         if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
1133             local $_;
1134             while (<$NAMEDSEQFH>) {
1135                 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
1136                     my ($n, $s) = ($1, $2);
1137                     my @s = map { chr(hex($_)) } split(' ', $s);
1138                     $NAMEDSEQ{$n} = join("", @s);
1139                 }
1140             }
1141             close($NAMEDSEQFH);
1142         }
1143     }
1144 }
1145
1146 sub namedseq {
1147     _namedseq() unless %NAMEDSEQ;
1148     my $wantarray = wantarray();
1149     if (defined $wantarray) {
1150         if ($wantarray) {
1151             if (@_ == 0) {
1152                 return %NAMEDSEQ;
1153             } elsif (@_ == 1) {
1154                 my $s = $NAMEDSEQ{ $_[0] };
1155                 return defined $s ? map { ord($_) } split('', $s) : ();
1156             }
1157         } elsif (@_ == 1) {
1158             return $NAMEDSEQ{ $_[0] };
1159         }
1160     }
1161     return;
1162 }
1163
1164 =head2 Unicode::UCD::UnicodeVersion
1165
1166 This returns the version of the Unicode Character Database, in other words, the
1167 version of the Unicode standard the database implements.  The version is a
1168 string of numbers delimited by dots (C<'.'>).
1169
1170 =cut
1171
1172 my $UNICODEVERSION;
1173
1174 sub UnicodeVersion {
1175     unless (defined $UNICODEVERSION) {
1176         openunicode(\$VERSIONFH, "version");
1177         chomp($UNICODEVERSION = <$VERSIONFH>);
1178         close($VERSIONFH);
1179         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1180             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1181     }
1182     return $UNICODEVERSION;
1183 }
1184
1185 =head2 B<Blocks versus Scripts>
1186
1187 The difference between a block and a script is that scripts are closer
1188 to the linguistic notion of a set of code points required to present
1189 languages, while block is more of an artifact of the Unicode code point
1190 numbering and separation into blocks of (mostly) 256 code points.
1191
1192 For example the Latin B<script> is spread over several B<blocks>, such
1193 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1194 C<Latin Extended-B>.  On the other hand, the Latin script does not
1195 contain all the characters of the C<Basic Latin> block (also known as
1196 ASCII): it includes only the letters, and not, for example, the digits
1197 or the punctuation.
1198
1199 For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1200
1201 For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1202
1203 =head2 B<Matching Scripts and Blocks>
1204
1205 Scripts are matched with the regular-expression construct
1206 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1207 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1208 any of the 256 code points in the Tibetan block).
1209
1210
1211 =head2 Implementation Note
1212
1213 The first use of charinfo() opens a read-only filehandle to the Unicode
1214 Character Database (the database is included in the Perl distribution).
1215 The filehandle is then kept open for further queries.  In other words,
1216 if you are wondering where one of your filehandles went, that's where.
1217
1218 =head1 BUGS
1219
1220 Does not yet support EBCDIC platforms.
1221
1222 L</compexcl()> should give a complete list of excluded code points.
1223
1224 =head1 AUTHOR
1225
1226 Jarkko Hietaniemi
1227
1228 =cut
1229
1230 1;