lib/UnicodeCD.pm

   1 package UnicodeCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.1';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11 our @EXPORT_OK = qw(charinfo
  12                     charblock charscript
  13                     charblocks charscripts
  14                     charinrange
  15                     compexcl
  16                     casefold casespec);
  17
  18 use Carp;
  19
  20 =head1 NAME
  21
  22 UnicodeCD - Unicode character database
  23
  24 =head1 SYNOPSIS
  25
  26     use UnicodeCD 'charinfo';
  27     my $charinfo   = charinfo($codepoint);
  28
  29     use UnicodeCD 'charblock';
  30     my $charblock  = charblock($codepoint);
  31
  32     use UnicodeCD 'charscript';
  33     my $charscript = charblock($codepoint);
  34
  35 =head1 DESCRIPTION
  36
  37 The Unicode module offers a simple interface to the Unicode Character
  38 Database.
  39
  40 =cut
  41
  42 my $UNICODEFH;
  43 my $BLOCKSFH;
  44 my $SCRIPTSFH;
  45 my $VERSIONFH;
  46 my $COMPEXCLFH;
  47 my $CASEFOLDFH;
  48 my $CASESPECFH;
  49
  50 sub openunicode {
  51     my ($rfh, @path) = @_;
  52     my $f;
  53     unless (defined $$rfh) {
  54         for my $d (@INC) {
  55             use File::Spec;
  56             $f = File::Spec->catfile($d, "unicode", @path);
  57             last if open($$rfh, $f);
  58             undef $f;
  59         }
  60         croak __PACKAGE__, ": failed to find ",
  61               File::Spec->catfile(@path), " in @INC"
  62             unless defined $f;
  63     }
  64     return $f;
  65 }
  66
  67 =head2 charinfo
  68
  69     use UnicodeCD 'charinfo';
  70
  71     my $charinfo = charinfo(0x41);
  72
  73 charinfo() returns a reference to a hash that has the following fields
  74 as defined by the Unicode standard:
  75
  76     key
  77
  78     code             code point with at least four hexdigits
  79     name             name of the character IN UPPER CASE
  80     category         general category of the character
  81     combining        classes used in the Canonical Ordering Algorithm
  82     bidi             bidirectional category
  83     decomposition    character decomposition mapping
  84     decimal          if decimal digit this is the integer numeric value
  85     digit            if digit this is the numeric value
  86     numeric          if numeric is the integer or rational numeric value
  87     mirrored         if mirrored in bidirectional text
  88     unicode10        Unicode 1.0 name if existed and different
  89     comment          ISO 10646 comment field
  90     upper            uppercase equivalent mapping
  91     lower            lowercase equivalent mapping
  92     title            titlecase equivalent mapping
  93
  94     block            block the character belongs to (used in \p{In...})
  95     script           script the character belongs to
  96
  97 If no match is found, a reference to an empty hash is returned.
  98
  99 The C<block> property is the same as as returned by charinfo().  It is
 100 not defined in the Unicode Character Database proper (Chapter 4 of the
 101 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
 102 of TUS3).  Similarly for the C<script> property.
 103
 104 Note that you cannot do (de)composition and casing based solely on the
 105 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 106 you will need also the compexcl(), casefold(), and casespec() functions.
 107
 108 =cut
 109
 110 sub _getcode {
 111     my $arg = shift;
 112
 113     if ($arg =~ /^\d+$/) {
 114         return $arg;
 115     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 116         return hex($1);
 117     }
 118
 119     return;
 120 }
 121
 122 sub charinfo {
 123     my $arg  = shift;
 124     my $code = _getcode($arg);
 125     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 126         unless defined $code;
 127     my $hexk = sprintf("%04X", $code);
 128
 129     openunicode(\$UNICODEFH, "Unicode.txt");
 130     if (defined $UNICODEFH) {
 131         use Search::Dict;
 132         if (look($UNICODEFH, "$hexk;") >= 0) {
 133             my $line = <$UNICODEFH>;
 134             chomp $line;
 135             my %prop;
 136             @prop{qw(
 137                      code name category
 138                      combining bidi decomposition
 139                      decimal digit numeric
 140                      mirrored unicode10 comment
 141                      upper lower title
 142                     )} = split(/;/, $line, -1);
 143             if ($prop{code} eq $hexk) {
 144                 $prop{block}  = charblock($code);
 145                 $prop{script} = charscript($code);
 146                 return \%prop;
 147             }
 148         }
 149     }
 150     return;
 151 }
 152
 153 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 154     my ($table, $lo, $hi, $code) = @_;
 155
 156     return if $lo > $hi;
 157
 158     my $mid = int(($lo+$hi) / 2);
 159
 160     if ($table->[$mid]->[0] < $code) {
 161         if ($table->[$mid]->[1] >= $code) {
 162             return $table->[$mid]->[2];
 163         } else {
 164             _search($table, $mid + 1, $hi, $code);
 165         }
 166     } elsif ($table->[$mid]->[0] > $code) {
 167         _search($table, $lo, $mid - 1, $code);
 168     } else {
 169         return $table->[$mid]->[2];
 170     }
 171 }
 172
 173 sub charinrange {
 174     my ($range, $arg) = @_;
 175     my $code = _getcode($arg);
 176     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 177         unless defined $code;
 178     _search($range, 0, $#$range, $code);
 179 }
 180
 181 =head2 charblock
 182
 183     use UnicodeCD 'charblock';
 184
 185     my $charblock = charblock(0x41);
 186     my $charblock = charblock(1234);
 187     my $charblock = charblock("0x263a");
 188     my $charblock = charblock("U+263a");
 189
 190     my $ranges    = charblock('Armenian');
 191
 192 With a B<code point argument> charblock() returns the block the character
 193 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 194 positions within all blocks are defined.
 195
 196 If supplied with an argument that can't be a code point, charblock()
 197 tries to do the opposite and interpret the argument as a character
 198 block.  The return value is a I<range>: an anonymous list that
 199 contains anonymous lists, which in turn contain I<start-of-range>,
 200 I<end-of-range> code point pairs.  You can test whether a code point
 201 is in a range using the L</charinrange> function.  If the argument is
 202 not a known charater block, C<undef> is returned.
 203
 204 =cut
 205
 206 my @BLOCKS;
 207 my %BLOCKS;
 208
 209 sub _charblocks {
 210     unless (@BLOCKS) {
 211         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 212             while (<$BLOCKSFH>) {
 213                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 214                     my ($lo, $hi) = (hex($1), hex($2));
 215                     my $subrange = [ $lo, $hi, $3 ];
 216                     push @BLOCKS, $subrange;
 217                     push @{$BLOCKS{$3}}, $subrange;
 218                 }
 219             }
 220             close($BLOCKSFH);
 221         }
 222     }
 223 }
 224
 225 sub charblock {
 226     my $arg = shift;
 227
 228     _charblocks() unless @BLOCKS;
 229
 230     my $code = _getcode($arg);
 231
 232     if (defined $code) {
 233         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 234     } else {
 235         if (exists $BLOCKS{$arg}) {
 236             return $BLOCKS{$arg};
 237         } else {
 238             return;
 239         }
 240     }
 241 }
 242
 243 =head2 charscript
 244
 245     use UnicodeCD 'charscript';
 246
 247     my $charscript = charscript(0x41);
 248     my $charscript = charscript(1234);
 249     my $charscript = charscript("U+263a");
 250
 251     my $ranges     = charscript('Thai');
 252
 253 With a B<code point argument> charscript() returns the script the
 254 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 255
 256 If supplied with an argument that can't be a code point, charscript()
 257 tries to do the opposite and interpret the argument as a character
 258 script.  The return value is a I<range>: an anonymous list that
 259 contains anonymous lists, which in turn contain I<start-of-range>,
 260 I<end-of-range> code point pairs.  You can test whether a code point
 261 is in a range using the L</charinrange> function.  If the argument is
 262 not a known charater script, C<undef> is returned.
 263
 264 =cut
 265
 266 my @SCRIPTS;
 267 my %SCRIPTS;
 268
 269 sub _charscripts {
 270     unless (@SCRIPTS) {
 271         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 272             while (<$SCRIPTSFH>) {
 273                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 274                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 275                     my $script = lc($3);
 276                     $script =~ s/\b(\w)/uc($1)/ge;
 277                     my $subrange = [ $lo, $hi, $script ];
 278                     push @SCRIPTS, $subrange;
 279                     push @{$SCRIPTS{$script}}, $subrange;
 280                 }
 281             }
 282             close($SCRIPTSFH);
 283             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 284         }
 285     }
 286 }
 287
 288 sub charscript {
 289     my $arg = shift;
 290
 291     _charscripts() unless @SCRIPTS;
 292
 293     my $code = _getcode($arg);
 294
 295     if (defined $code) {
 296         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 297     } else {
 298         if (exists $SCRIPTS{$arg}) {
 299             return $SCRIPTS{$arg};
 300         } else {
 301             return;
 302         }
 303     }
 304 }
 305
 306 =head2 charblocks
 307
 308     use UnicodeCD 'charblocks';
 309
 310     my $charblocks = charblocks();
 311
 312 charblocks() returns a reference to a hash with the known block names
 313 as the keys, and the code point ranges (see L</charblock>) as the values.
 314
 315 =cut
 316
 317 sub charblocks {
 318     _charblocks() unless %BLOCKS;
 319     return \%BLOCKS;
 320 }
 321
 322 =head2 charscripts
 323
 324     use UnicodeCD 'charscripts';
 325
 326     my %charscripts = charscripts();
 327
 328 charscripts() returns a hash with the known script names as the keys,
 329 and the code point ranges (see L</charscript>) as the values.
 330
 331 =cut
 332
 333 sub charscripts {
 334     _charscripts() unless %SCRIPTS;
 335     return \%SCRIPTS;
 336 }
 337
 338 =head2 Blocks versus Scripts
 339
 340 The difference between a block and a script is that scripts are closer
 341 to the linguistic notion of a set of characters required to present
 342 languages, while block is more of an artifact of the Unicode character
 343 numbering and separation into blocks of 256 characters.
 344
 345 For example the Latin B<script> is spread over several B<blocks>, such
 346 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 347 C<Latin Extended-B>.  On the other hand, the Latin script does not
 348 contain all the characters of the C<Basic Latin> block (also known as
 349 the ASCII): it includes only the letters, not for example the digits
 350 or the punctuation.
 351
 352 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 353
 354 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 355
 356 =head2 Matching Scripts and Blocks
 357
 358 Both scripts and blocks can be matched using the regular expression
 359 construct C<\p{In...}> and its negation C<\P{In...}>.
 360
 361 The name of the script or the block comes after the C<In>, for example
 362 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 363 removed from the names for the C<\p{In...}>, for example
 364 C<LatinExtendedA> instead of C<Latin Extended-A>.
 365
 366 There are a few cases where there exists both a script and a block by
 367 the same name, in these cases the block version has C<Block> appended:
 368 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
 369
 370 =head2 Code Point Arguments
 371
 372 A <code point argument> is either a decimal or a hexadecimal scalar,
 373 or "U+" followed by hexadecimals.
 374
 375 =head2 charinrange
 376
 377 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 378 can also test whether a code point is in the I<range> as returned by
 379 L</charblock> and L</charscript> or as the values of the hash returned
 380 by L</charblocks> and </charscripts> by using charinrange():
 381
 382     use UnicodeCD qw(charscript charinrange);
 383
 384     $range = charscript('Hiragana');
 385     print "looks like hiragana\n" if charinrange($range, $code);
 386
 387 =cut
 388
 389 =head2 compexcl
 390
 391     use UnicodeCD 'compexcl';
 392
 393     my $compexcl = compexcl("09dc");
 394
 395 The compexcl() returns the composition exclusion (that is, if the
 396 character should not be produced during a precomposition) of the
 397 character specified by a B<code point argument>.
 398
 399 If there is a composition exclusion for the character, true is
 400 returned.  Otherwise, false is returned.
 401
 402 =cut
 403
 404 my %COMPEXCL;
 405
 406 sub _compexcl {
 407     unless (%COMPEXCL) {
 408         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 409             while (<$COMPEXCLFH>) {
 410                 if (/^([0-9A-F]+) \# /) {
 411                     my $code = hex($1);
 412                     $COMPEXCL{$code} = undef;
 413                 }
 414             }
 415             close($COMPEXCLFH);
 416         }
 417     }
 418 }
 419
 420 sub compexcl {
 421     my $arg  = shift;
 422     my $code = _getcode($arg);
 423
 424     _compexcl() unless %COMPEXCL;
 425
 426     return exists $COMPEXCL{$code};
 427 }
 428
 429 =head2 casefold
 430
 431     use UnicodeCD 'casefold';
 432
 433     my %casefold = casefold("09dc");
 434
 435 The casefold() returns the locale-independent case folding of the
 436 character specified by a B<code point argument>.
 437
 438 If there is a case folding for that character, a reference to a hash
 439 with the following fields is returned:
 440
 441     key
 442
 443     code             code point with at least four hexdigits
 444     status           "C", "F", "S", or "I"
 445     mapping          one or more codes separated by spaces
 446
 447 The meaning of the I<status> is as follows:
 448
 449    C                 common case folding, common mappings shared
 450                      by both simple and full mappings
 451    F                 full case folding, mappings that cause strings
 452                      to grow in length. Multiple characters are separated
 453                      by spaces
 454    S                 simple case folding, mappings to single characters
 455                      where different from F
 456    I                 special case for dotted uppercase I and
 457                      dotless lowercase i
 458                      - If this mapping is included, the result is
 459                        case-insensitive, but dotless and dotted I's
 460                        are not distinguished
 461                      - If this mapping is excluded, the result is not
 462                        fully case-insensitive, but dotless and dotted
 463                        I's are distinguished
 464
 465 If there is no case folding for that character, C<undef> is returned.
 466
 467 For more information about case mappings see
 468 http://www.unicode.org/unicode/reports/tr21/
 469
 470 =cut
 471
 472 my %CASEFOLD;
 473
 474 sub _casefold {
 475     unless (%CASEFOLD) {
 476         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 477             while (<$CASEFOLDFH>) {
 478                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 479                     my $code = hex($1);
 480                     $CASEFOLD{$code} = { code    => $1,
 481                                          status  => $2,
 482                                          mapping => $3 };
 483                 }
 484             }
 485             close($CASEFOLDFH);
 486         }
 487     }
 488 }
 489
 490 sub casefold {
 491     my $arg  = shift;
 492     my $code = _getcode($arg);
 493
 494     _casefold() unless %CASEFOLD;
 495
 496     return $CASEFOLD{$code};
 497 }
 498
 499 =head2 casespec
 500
 501     use UnicodeCD 'casespec';
 502
 503     my %casespec = casespec("09dc");
 504
 505 The casespec() returns the potentially locale-dependent case mapping
 506 of the character specified by a B<code point argument>.  The mapping
 507 may change the length of the string (which the basic Unicode case
 508 mappings as returned by charinfo() never do).
 509
 510 If there is a case folding for that character, a reference to a hash
 511 with the following fields is returned:
 512
 513     key
 514
 515     code             code point with at least four hexdigits
 516     lower            lowercase
 517     title            titlecase
 518     upper            uppercase
 519     condition        condition list (may be undef)
 520
 521 The C<condition> is optional.  Where present, it consists of one or
 522 more I<locales> or I<contexts>, separated by spaces (other than as
 523 used to separate elements, spaces are to be ignored).  A condition
 524 list overrides the normal behavior if all of the listed conditions are
 525 true.  Case distinctions in the condition list are not significant.
 526 Conditions preceded by "NON_" represent the negation of the condition
 527
 528 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 529 followed by a "_" and a 2-letter ISO language code (, possibly followed
 530 by a "_" and a variant code).  You can find the list of those codes
 531 in L<Locale::Country> and L<Locale::Language>.
 532
 533 A I<context> is one of the following choices:
 534
 535     FINAL            The letter is not followed by a letter of
 536                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 537     MODERN           The mapping is only used for modern text
 538     AFTER_i          The last base character was "i" 0069
 539
 540 For more information about case mappings see
 541 http://www.unicode.org/unicode/reports/tr21/
 542
 543 =cut
 544
 545 my %CASESPEC;
 546
 547 sub _casespec {
 548     unless (%CASESPEC) {
 549         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 550             while (<$CASESPECFH>) {
 551                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 552                     my $code = hex($1);
 553                     $CASESPEC{$code} = { code      => $1,
 554                                          lower     => $2,
 555                                          title     => $3,
 556                                          upper     => $4,
 557                                          condition => $5 };
 558                 }
 559             }
 560             close($CASESPECFH);
 561         }
 562     }
 563 }
 564
 565 sub casespec {
 566     my $arg  = shift;
 567     my $code = _getcode($arg);
 568
 569     _casespec() unless %CASESPEC;
 570
 571     return $CASESPEC{$code};
 572 }
 573
 574 =head2 UnicodeCD::UnicodeVersion
 575
 576 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
 577 Database, in other words, the version of the Unicode standard the
 578 database implements.
 579
 580 =cut
 581
 582 my $UNICODEVERSION;
 583
 584 sub UnicodeVersion {
 585     unless (defined $UNICODEVERSION) {
 586         openunicode(\$VERSIONFH, "version");
 587         chomp($UNICODEVERSION = <$VERSIONFH>);
 588         close($VERSIONFH);
 589         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 590             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 591     }
 592     return $UNICODEVERSION;
 593 }
 594
 595 =head2 Implementation Note
 596
 597 The first use of charinfo() opens a read-only filehandle to the Unicode
 598 Character Database (the database is included in the Perl distribution).
 599 The filehandle is then kept open for further queries.
 600
 601 =head1 AUTHOR
 602
 603 Jarkko Hietaniemi
 604
 605 =cut
 606
 607 1;