lib/UnicodeCD.pm

   1 package UnicodeCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.1';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11 our @EXPORT_OK = qw(charinfo
  12                     charblock charscript
  13                     charblocks charscripts
  14                     charinrange
  15                     compexcl
  16                     casefold casespec);
  17
  18 use Carp;
  19
  20 =head1 NAME
  21
  22 UnicodeCD - Unicode character database
  23
  24 =head1 SYNOPSIS
  25
  26     use UnicodeCD 'charinfo';
  27     my $charinfo   = charinfo($codepoint);
  28
  29     use UnicodeCD 'charblock';
  30     my $charblock  = charblock($codepoint);
  31
  32     use UnicodeCD 'charscript';
  33     my $charscript = charblock($codepoint);
  34
  35     use UnicodeCD 'charblocks';
  36     my $charblocks = charblocks();
  37
  38     use UnicodeCD 'charscripts';
  39     my %charscripts = charscripts();
  40
  41     use UnicodeCD qw(charscript charinrange);
  42     my $range = charscript($script);
  43     print "looks like $script\n" if charinrange($range, $codepoint);
  44
  45     use UnicodeCD 'compexcl';
  46     my $compexcl = compexcl($codepoint);
  47
  48     my $unicode_version = UnicodeCD::UnicodeVersion();
  49
  50 =head1 DESCRIPTION
  51
  52 The UnicodeCD module offers a simple interface to the Unicode Character
  53 Database.
  54
  55 =cut
  56
  57 my $UNICODEFH;
  58 my $BLOCKSFH;
  59 my $SCRIPTSFH;
  60 my $VERSIONFH;
  61 my $COMPEXCLFH;
  62 my $CASEFOLDFH;
  63 my $CASESPECFH;
  64
  65 sub openunicode {
  66     my ($rfh, @path) = @_;
  67     my $f;
  68     unless (defined $$rfh) {
  69         for my $d (@INC) {
  70             use File::Spec;
  71             $f = File::Spec->catfile($d, "unicode", @path);
  72             last if open($$rfh, $f);
  73             undef $f;
  74         }
  75         croak __PACKAGE__, ": failed to find ",
  76               File::Spec->catfile(@path), " in @INC"
  77             unless defined $f;
  78     }
  79     return $f;
  80 }
  81
  82 =head2 charinfo
  83
  84     use UnicodeCD 'charinfo';
  85
  86     my $charinfo = charinfo(0x41);
  87
  88 charinfo() returns a reference to a hash that has the following fields
  89 as defined by the Unicode standard:
  90
  91     key
  92
  93     code             code point with at least four hexdigits
  94     name             name of the character IN UPPER CASE
  95     category         general category of the character
  96     combining        classes used in the Canonical Ordering Algorithm
  97     bidi             bidirectional category
  98     decomposition    character decomposition mapping
  99     decimal          if decimal digit this is the integer numeric value
 100     digit            if digit this is the numeric value
 101     numeric          if numeric is the integer or rational numeric value
 102     mirrored         if mirrored in bidirectional text
 103     unicode10        Unicode 1.0 name if existed and different
 104     comment          ISO 10646 comment field
 105     upper            uppercase equivalent mapping
 106     lower            lowercase equivalent mapping
 107     title            titlecase equivalent mapping
 108
 109     block            block the character belongs to (used in \p{In...})
 110     script           script the character belongs to
 111
 112 If no match is found, a reference to an empty hash is returned.
 113
 114 The C<block> property is the same as as returned by charinfo().  It is
 115 not defined in the Unicode Character Database proper (Chapter 4 of the
 116 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
 117 of TUS3).  Similarly for the C<script> property.
 118
 119 Note that you cannot do (de)composition and casing based solely on the
 120 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 121 you will need also the compexcl(), casefold(), and casespec() functions.
 122
 123 =cut
 124
 125 sub _getcode {
 126     my $arg = shift;
 127
 128     if ($arg =~ /^\d+$/) {
 129         return $arg;
 130     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 131         return hex($1);
 132     }
 133
 134     return;
 135 }
 136
 137 sub charinfo {
 138     my $arg  = shift;
 139     my $code = _getcode($arg);
 140     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 141         unless defined $code;
 142     my $hexk = sprintf("%04X", $code);
 143
 144     openunicode(\$UNICODEFH, "Unicode.txt");
 145     if (defined $UNICODEFH) {
 146         use Search::Dict;
 147         if (look($UNICODEFH, "$hexk;") >= 0) {
 148             my $line = <$UNICODEFH>;
 149             chomp $line;
 150             my %prop;
 151             @prop{qw(
 152                      code name category
 153                      combining bidi decomposition
 154                      decimal digit numeric
 155                      mirrored unicode10 comment
 156                      upper lower title
 157                     )} = split(/;/, $line, -1);
 158             if ($prop{code} eq $hexk) {
 159                 $prop{block}  = charblock($code);
 160                 $prop{script} = charscript($code);
 161                 return \%prop;
 162             }
 163         }
 164     }
 165     return;
 166 }
 167
 168 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 169     my ($table, $lo, $hi, $code) = @_;
 170
 171     return if $lo > $hi;
 172
 173     my $mid = int(($lo+$hi) / 2);
 174
 175     if ($table->[$mid]->[0] < $code) {
 176         if ($table->[$mid]->[1] >= $code) {
 177             return $table->[$mid]->[2];
 178         } else {
 179             _search($table, $mid + 1, $hi, $code);
 180         }
 181     } elsif ($table->[$mid]->[0] > $code) {
 182         _search($table, $lo, $mid - 1, $code);
 183     } else {
 184         return $table->[$mid]->[2];
 185     }
 186 }
 187
 188 sub charinrange {
 189     my ($range, $arg) = @_;
 190     my $code = _getcode($arg);
 191     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 192         unless defined $code;
 193     _search($range, 0, $#$range, $code);
 194 }
 195
 196 =head2 charblock
 197
 198     use UnicodeCD 'charblock';
 199
 200     my $charblock = charblock(0x41);
 201     my $charblock = charblock(1234);
 202     my $charblock = charblock("0x263a");
 203     my $charblock = charblock("U+263a");
 204
 205     my $ranges    = charblock('Armenian');
 206
 207 With a B<code point argument> charblock() returns the block the character
 208 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 209 positions within all blocks are defined.
 210
 211 If supplied with an argument that can't be a code point, charblock()
 212 tries to do the opposite and interpret the argument as a character
 213 block.  The return value is a I<range>: an anonymous list that
 214 contains anonymous lists, which in turn contain I<start-of-range>,
 215 I<end-of-range> code point pairs.  You can test whether a code point
 216 is in a range using the L</charinrange> function.  If the argument is
 217 not a known charater block, C<undef> is returned.
 218
 219 =cut
 220
 221 my @BLOCKS;
 222 my %BLOCKS;
 223
 224 sub _charblocks {
 225     unless (@BLOCKS) {
 226         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 227             while (<$BLOCKSFH>) {
 228                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 229                     my ($lo, $hi) = (hex($1), hex($2));
 230                     my $subrange = [ $lo, $hi, $3 ];
 231                     push @BLOCKS, $subrange;
 232                     push @{$BLOCKS{$3}}, $subrange;
 233                 }
 234             }
 235             close($BLOCKSFH);
 236         }
 237     }
 238 }
 239
 240 sub charblock {
 241     my $arg = shift;
 242
 243     _charblocks() unless @BLOCKS;
 244
 245     my $code = _getcode($arg);
 246
 247     if (defined $code) {
 248         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 249     } else {
 250         if (exists $BLOCKS{$arg}) {
 251             return $BLOCKS{$arg};
 252         } else {
 253             return;
 254         }
 255     }
 256 }
 257
 258 =head2 charscript
 259
 260     use UnicodeCD 'charscript';
 261
 262     my $charscript = charscript(0x41);
 263     my $charscript = charscript(1234);
 264     my $charscript = charscript("U+263a");
 265
 266     my $ranges     = charscript('Thai');
 267
 268 With a B<code point argument> charscript() returns the script the
 269 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 270
 271 If supplied with an argument that can't be a code point, charscript()
 272 tries to do the opposite and interpret the argument as a character
 273 script.  The return value is a I<range>: an anonymous list that
 274 contains anonymous lists, which in turn contain I<start-of-range>,
 275 I<end-of-range> code point pairs.  You can test whether a code point
 276 is in a range using the L</charinrange> function.  If the argument is
 277 not a known charater script, C<undef> is returned.
 278
 279 =cut
 280
 281 my @SCRIPTS;
 282 my %SCRIPTS;
 283
 284 sub _charscripts {
 285     unless (@SCRIPTS) {
 286         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 287             while (<$SCRIPTSFH>) {
 288                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 289                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 290                     my $script = lc($3);
 291                     $script =~ s/\b(\w)/uc($1)/ge;
 292                     my $subrange = [ $lo, $hi, $script ];
 293                     push @SCRIPTS, $subrange;
 294                     push @{$SCRIPTS{$script}}, $subrange;
 295                 }
 296             }
 297             close($SCRIPTSFH);
 298             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 299         }
 300     }
 301 }
 302
 303 sub charscript {
 304     my $arg = shift;
 305
 306     _charscripts() unless @SCRIPTS;
 307
 308     my $code = _getcode($arg);
 309
 310     if (defined $code) {
 311         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 312     } else {
 313         if (exists $SCRIPTS{$arg}) {
 314             return $SCRIPTS{$arg};
 315         } else {
 316             return;
 317         }
 318     }
 319 }
 320
 321 =head2 charblocks
 322
 323     use UnicodeCD 'charblocks';
 324
 325     my $charblocks = charblocks();
 326
 327 charblocks() returns a reference to a hash with the known block names
 328 as the keys, and the code point ranges (see L</charblock>) as the values.
 329
 330 =cut
 331
 332 sub charblocks {
 333     _charblocks() unless %BLOCKS;
 334     return \%BLOCKS;
 335 }
 336
 337 =head2 charscripts
 338
 339     use UnicodeCD 'charscripts';
 340
 341     my %charscripts = charscripts();
 342
 343 charscripts() returns a hash with the known script names as the keys,
 344 and the code point ranges (see L</charscript>) as the values.
 345
 346 =cut
 347
 348 sub charscripts {
 349     _charscripts() unless %SCRIPTS;
 350     return \%SCRIPTS;
 351 }
 352
 353 =head2 Blocks versus Scripts
 354
 355 The difference between a block and a script is that scripts are closer
 356 to the linguistic notion of a set of characters required to present
 357 languages, while block is more of an artifact of the Unicode character
 358 numbering and separation into blocks of 256 characters.
 359
 360 For example the Latin B<script> is spread over several B<blocks>, such
 361 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 362 C<Latin Extended-B>.  On the other hand, the Latin script does not
 363 contain all the characters of the C<Basic Latin> block (also known as
 364 the ASCII): it includes only the letters, not for example the digits
 365 or the punctuation.
 366
 367 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 368
 369 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 370
 371 =head2 Matching Scripts and Blocks
 372
 373 Both scripts and blocks can be matched using the regular expression
 374 construct C<\p{In...}> and its negation C<\P{In...}>.
 375
 376 The name of the script or the block comes after the C<In>, for example
 377 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 378 removed from the names for the C<\p{In...}>, for example
 379 C<LatinExtendedA> instead of C<Latin Extended-A>.
 380
 381 There are a few cases where there exists both a script and a block by
 382 the same name, in these cases the block version has C<Block> appended:
 383 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
 384
 385 =head2 Code Point Arguments
 386
 387 A <code point argument> is either a decimal or a hexadecimal scalar,
 388 or "U+" followed by hexadecimals.
 389
 390 =head2 charinrange
 391
 392 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 393 can also test whether a code point is in the I<range> as returned by
 394 L</charblock> and L</charscript> or as the values of the hash returned
 395 by L</charblocks> and L</charscripts> by using charinrange():
 396
 397     use UnicodeCD qw(charscript charinrange);
 398
 399     $range = charscript('Hiragana');
 400     print "looks like hiragana\n" if charinrange($range, $codepoint);
 401
 402 =cut
 403
 404 =head2 compexcl
 405
 406     use UnicodeCD 'compexcl';
 407
 408     my $compexcl = compexcl("09dc");
 409
 410 The compexcl() returns the composition exclusion (that is, if the
 411 character should not be produced during a precomposition) of the
 412 character specified by a B<code point argument>.
 413
 414 If there is a composition exclusion for the character, true is
 415 returned.  Otherwise, false is returned.
 416
 417 =cut
 418
 419 my %COMPEXCL;
 420
 421 sub _compexcl {
 422     unless (%COMPEXCL) {
 423         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 424             while (<$COMPEXCLFH>) {
 425                 if (/^([0-9A-F]+) \# /) {
 426                     my $code = hex($1);
 427                     $COMPEXCL{$code} = undef;
 428                 }
 429             }
 430             close($COMPEXCLFH);
 431         }
 432     }
 433 }
 434
 435 sub compexcl {
 436     my $arg  = shift;
 437     my $code = _getcode($arg);
 438
 439     _compexcl() unless %COMPEXCL;
 440
 441     return exists $COMPEXCL{$code};
 442 }
 443
 444 =head2 casefold
 445
 446     use UnicodeCD 'casefold';
 447
 448     my %casefold = casefold("09dc");
 449
 450 The casefold() returns the locale-independent case folding of the
 451 character specified by a B<code point argument>.
 452
 453 If there is a case folding for that character, a reference to a hash
 454 with the following fields is returned:
 455
 456     key
 457
 458     code             code point with at least four hexdigits
 459     status           "C", "F", "S", or "I"
 460     mapping          one or more codes separated by spaces
 461
 462 The meaning of the I<status> is as follows:
 463
 464    C                 common case folding, common mappings shared
 465                      by both simple and full mappings
 466    F                 full case folding, mappings that cause strings
 467                      to grow in length. Multiple characters are separated
 468                      by spaces
 469    S                 simple case folding, mappings to single characters
 470                      where different from F
 471    I                 special case for dotted uppercase I and
 472                      dotless lowercase i
 473                      - If this mapping is included, the result is
 474                        case-insensitive, but dotless and dotted I's
 475                        are not distinguished
 476                      - If this mapping is excluded, the result is not
 477                        fully case-insensitive, but dotless and dotted
 478                        I's are distinguished
 479
 480 If there is no case folding for that character, C<undef> is returned.
 481
 482 For more information about case mappings see
 483 http://www.unicode.org/unicode/reports/tr21/
 484
 485 =cut
 486
 487 my %CASEFOLD;
 488
 489 sub _casefold {
 490     unless (%CASEFOLD) {
 491         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 492             while (<$CASEFOLDFH>) {
 493                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 494                     my $code = hex($1);
 495                     $CASEFOLD{$code} = { code    => $1,
 496                                          status  => $2,
 497                                          mapping => $3 };
 498                 }
 499             }
 500             close($CASEFOLDFH);
 501         }
 502     }
 503 }
 504
 505 sub casefold {
 506     my $arg  = shift;
 507     my $code = _getcode($arg);
 508
 509     _casefold() unless %CASEFOLD;
 510
 511     return $CASEFOLD{$code};
 512 }
 513
 514 =head2 casespec
 515
 516     use UnicodeCD 'casespec';
 517
 518     my %casespec = casespec("09dc");
 519
 520 The casespec() returns the potentially locale-dependent case mapping
 521 of the character specified by a B<code point argument>.  The mapping
 522 may change the length of the string (which the basic Unicode case
 523 mappings as returned by charinfo() never do).
 524
 525 If there is a case folding for that character, a reference to a hash
 526 with the following fields is returned:
 527
 528     key
 529
 530     code             code point with at least four hexdigits
 531     lower            lowercase
 532     title            titlecase
 533     upper            uppercase
 534     condition        condition list (may be undef)
 535
 536 The C<condition> is optional.  Where present, it consists of one or
 537 more I<locales> or I<contexts>, separated by spaces (other than as
 538 used to separate elements, spaces are to be ignored).  A condition
 539 list overrides the normal behavior if all of the listed conditions are
 540 true.  Case distinctions in the condition list are not significant.
 541 Conditions preceded by "NON_" represent the negation of the condition
 542
 543 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 544 followed by a "_" and a 2-letter ISO language code (possibly followed
 545 by a "_" and a variant code).  You can find the lists of those codes,
 546 see L<Locale::Country> and L<Locale::Language>.
 547
 548 A I<context> is one of the following choices:
 549
 550     FINAL            The letter is not followed by a letter of
 551                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 552     MODERN           The mapping is only used for modern text
 553     AFTER_i          The last base character was "i" (U+0069)
 554
 555 For more information about case mappings see
 556 http://www.unicode.org/unicode/reports/tr21/
 557
 558 =cut
 559
 560 my %CASESPEC;
 561
 562 sub _casespec {
 563     unless (%CASESPEC) {
 564         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 565             while (<$CASESPECFH>) {
 566                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 567                     my $code = hex($1);
 568                     $CASESPEC{$code} = { code      => $1,
 569                                          lower     => $2,
 570                                          title     => $3,
 571                                          upper     => $4,
 572                                          condition => $5 };
 573                 }
 574             }
 575             close($CASESPECFH);
 576         }
 577     }
 578 }
 579
 580 sub casespec {
 581     my $arg  = shift;
 582     my $code = _getcode($arg);
 583
 584     _casespec() unless %CASESPEC;
 585
 586     return $CASESPEC{$code};
 587 }
 588
 589 =head2 UnicodeCD::UnicodeVersion
 590
 591 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
 592 Database, in other words, the version of the Unicode standard the
 593 database implements.
 594
 595 =cut
 596
 597 my $UNICODEVERSION;
 598
 599 sub UnicodeVersion {
 600     unless (defined $UNICODEVERSION) {
 601         openunicode(\$VERSIONFH, "version");
 602         chomp($UNICODEVERSION = <$VERSIONFH>);
 603         close($VERSIONFH);
 604         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 605             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 606     }
 607     return $UNICODEVERSION;
 608 }
 609
 610 =head2 Implementation Note
 611
 612 The first use of charinfo() opens a read-only filehandle to the Unicode
 613 Character Database (the database is included in the Perl distribution).
 614 The filehandle is then kept open for further queries.
 615
 616 =head1 AUTHOR
 617
 618 Jarkko Hietaniemi
 619
 620 =cut
 621
 622 1;