lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.1';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11 our @EXPORT_OK = qw(charinfo
  12                     charblock charscript
  13                     charblocks charscripts
  14                     charinrange);
  15
  16 use Carp;
  17
  18 =head1 NAME
  19
  20 Unicode::UCD - Unicode character database
  21
  22 =head1 SYNOPSIS
  23
  24     use Unicode::UCD 'charinfo';
  25     my %charinfo   = charinfo($codepoint);
  26
  27     use Unicode::UCD 'charblock';
  28     my $charblock  = charblock($codepoint);
  29
  30     use Unicode::UCD 'charscript';
  31     my $charscript = charblock($codepoint);
  32
  33 =head1 DESCRIPTION
  34
  35 The Unicode module offers a simple interface to the Unicode Character
  36 Database.
  37
  38 =cut
  39
  40 my $UNICODEFH;
  41 my $BLOCKSFH;
  42 my $SCRIPTSFH;
  43 my $VERSIONFH;
  44
  45 sub openunicode {
  46     my ($rfh, @path) = @_;
  47     my $f;
  48     unless (defined $$rfh) {
  49         for my $d (@INC) {
  50             use File::Spec;
  51             $f = File::Spec->catfile($d, "unicode", @path);
  52             last if open($$rfh, $f);
  53             undef $f;
  54         }
  55         croak __PACKAGE__, ": failed to find ",
  56               File::Spec->catfile(@path), " in @INC"
  57             unless defined $f;
  58     }
  59     return $f;
  60 }
  61
  62 =head2 charinfo
  63
  64     use Unicode::UCD 'charinfo';
  65
  66     my %charinfo = charinfo(0x41);
  67
  68 charinfo() returns a hash that has the following fields as defined
  69 by the Unicode standard:
  70
  71     key
  72
  73     code             code point with at least four hexdigits
  74     name             name of the character IN UPPER CASE
  75     category         general category of the character
  76     combining        classes used in the Canonical Ordering Algorithm
  77     bidi             bidirectional category
  78     decomposition    character decomposition mapping
  79     decimal          if decimal digit this is the integer numeric value
  80     digit            if digit this is the numeric value
  81     numeric          if numeric is the integer or rational numeric value
  82     mirrored         if mirrored in bidirectional text
  83     unicode10        Unicode 1.0 name if existed and different
  84     comment          ISO 10646 comment field
  85     upper            uppercase equivalent mapping
  86     lower            lowercase equivalent mapping
  87     title            titlecase equivalent mapping
  88
  89     block            block the character belongs to (used in \p{In...})
  90     script           script the character belongs to
  91
  92 If no match is found, an empty hash is returned.
  93
  94 The C<block> property is the same as as returned by charinfo().  It is
  95 not defined in the Unicode Character Database proper (Chapter 4 of the
  96 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
  97 of TUS3).  Similarly for the C<script> property.
  98
  99 Note that you cannot do (de)composition and casing based solely on the
 100 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 101 you will need also the I<Composition Exclusions>, I<Case Folding>, and
 102 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
 103 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
 104
 105 =cut
 106
 107 sub _getcode {
 108     my $arg = shift;
 109
 110     if ($arg =~ /^\d+$/) {
 111         return $arg;
 112     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 113         return hex($1);
 114     }
 115
 116     return;
 117 }
 118
 119 sub charinfo {
 120     my $arg  = shift;
 121     my $code = _getcode($arg);
 122     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 123         unless defined $code;
 124     my $hexk = sprintf("%04X", $code);
 125
 126     openunicode(\$UNICODEFH, "Unicode.txt");
 127     if (defined $UNICODEFH) {
 128         use Search::Dict;
 129         if (look($UNICODEFH, "$hexk;") >= 0) {
 130             my $line = <$UNICODEFH>;
 131             chomp $line;
 132             my %prop;
 133             @prop{qw(
 134                      code name category
 135                      combining bidi decomposition
 136                      decimal digit numeric
 137                      mirrored unicode10 comment
 138                      upper lower title
 139                     )} = split(/;/, $line, -1);
 140             if ($prop{code} eq $hexk) {
 141                 $prop{block}  = charblock($code);
 142                 $prop{script} = charscript($code);
 143                 return %prop;
 144             }
 145         }
 146     }
 147     return;
 148 }
 149
 150 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 151     my ($table, $lo, $hi, $code) = @_;
 152
 153     return if $lo > $hi;
 154
 155     my $mid = int(($lo+$hi) / 2);
 156
 157     if ($table->[$mid]->[0] < $code) {
 158         if ($table->[$mid]->[1] >= $code) {
 159             return $table->[$mid]->[2];
 160         } else {
 161             _search($table, $mid + 1, $hi, $code);
 162         }
 163     } elsif ($table->[$mid]->[0] > $code) {
 164         _search($table, $lo, $mid - 1, $code);
 165     } else {
 166         return $table->[$mid]->[2];
 167     }
 168 }
 169
 170 sub charinrange {
 171     my ($range, $arg) = @_;
 172     my $code = _getcode($arg);
 173     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 174         unless defined $code;
 175     _search($range, 0, $#$range, $code);
 176 }
 177
 178 =head2 charblock
 179
 180     use Unicode::UCD 'charblock';
 181
 182     my $charblock = charblock(0x41);
 183     my $charblock = charblock(1234);
 184     my $charblock = charblock("0x263a");
 185     my $charblock = charblock("U+263a");
 186
 187     my $ranges    = charblock('Armenian');
 188
 189 With a B<code point argument> charblock() returns the block the character
 190 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 191 positions within all blocks are defined.  A <code point argument>
 192 is either a decimal or a hexadecimal scalar, or "U+" followed
 193 by hexadecimals.
 194
 195 If supplied with an argument that can't be a code point, charblock()
 196 tries to do the opposite and interpret the argument as a character
 197 block.  The return value is a I<range>: an anonymous list that
 198 contains anonymous lists, which in turn contain I<start-of-range>,
 199 I<end-of-range> code point pairs.  You can test whether a code point
 200 is in a range using the L</charinrange> function.  If the argument is
 201 not a known charater block, C<undef> is returned.
 202
 203 =cut
 204
 205 my @BLOCKS;
 206 my %BLOCKS;
 207
 208 sub _charblocks {
 209     unless (@BLOCKS) {
 210         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 211             while (<$BLOCKSFH>) {
 212                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 213                     my ($lo, $hi) = (hex($1), hex($2));
 214                     my $subrange = [ $lo, $hi, $3 ];
 215                     push @BLOCKS, $subrange;
 216                     push @{$BLOCKS{$3}}, $subrange;
 217                 }
 218             }
 219             close($BLOCKSFH);
 220         }
 221     }
 222 }
 223
 224 sub charblock {
 225     my $arg = shift;
 226
 227     _charblocks() unless @BLOCKS;
 228
 229     my $code = _getcode($arg);
 230
 231     if (defined $code) {
 232         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 233     } else {
 234         if (exists $BLOCKS{$arg}) {
 235             return $BLOCKS{$arg};
 236         } else {
 237             return;
 238         }
 239     }
 240 }
 241
 242 =head2 charscript
 243
 244     use Unicode::UCD 'charscript';
 245
 246     my $charscript = charscript(0x41);
 247     my $charscript = charscript(1234);
 248     my $charscript = charscript("U+263a");
 249
 250     my $ranges     = charscript('Thai');
 251
 252 With a B<code point argument> charscript() returns the script the
 253 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.  A <code point
 254 argument> is either a decimal or a hexadecimal scalar, or "U+"
 255 followed by hexadecimals.
 256
 257 If supplied with an argument that can't be a code point, charscript()
 258 tries to do the opposite and interpret the argument as a character
 259 script.  The return value is a I<range>: an anonymous list that
 260 contains anonymous lists, which in turn contain I<start-of-range>,
 261 I<end-of-range> code point pairs.  You can test whether a code point
 262 is in a range using the L</charinrange> function.  If the argument is
 263 not a known charater script, C<undef> is returned.
 264
 265 =cut
 266
 267 my @SCRIPTS;
 268 my %SCRIPTS;
 269
 270 sub _charscripts {
 271     unless (@SCRIPTS) {
 272         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 273             while (<$SCRIPTSFH>) {
 274                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 275                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 276                     my $script = lc($3);
 277                     $script =~ s/\b(\w)/uc($1)/ge;
 278                     my $subrange = [ $lo, $hi, $script ];
 279                     push @SCRIPTS, $subrange;
 280                     push @{$SCRIPTS{$script}}, $subrange;
 281                 }
 282             }
 283             close($SCRIPTSFH);
 284             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 285         }
 286     }
 287 }
 288
 289 sub charscript {
 290     my $arg = shift;
 291
 292     _charscripts() unless @SCRIPTS;
 293
 294     my $code = _getcode($arg);
 295
 296     if (defined $code) {
 297         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 298     } else {
 299         if (exists $SCRIPTS{$arg}) {
 300             return $SCRIPTS{$arg};
 301         } else {
 302             return;
 303         }
 304     }
 305 }
 306
 307 =head2 charblocks
 308
 309     use Unicode::UCD 'charblocks';
 310
 311     my %charblocks = charblocks();
 312
 313 charblocks() returns a hash with the known block names as the keys,
 314 and the code point ranges (see L</charblock>) as the values.
 315
 316 =cut
 317
 318 sub charblocks {
 319     _charblocks() unless @BLOCKS;
 320     return %BLOCKS;
 321 }
 322
 323 =head2 charscripts
 324
 325     use Unicode::UCD 'charscripts';
 326
 327     my %charscripts = charscripts();
 328
 329 charscripts() returns a hash with the known script names as the keys,
 330 and the code point ranges (see L</charscript>) as the values.
 331
 332 =cut
 333
 334 sub charscripts {
 335     _charscripts() unless @SCRIPTS;
 336     return %SCRIPTS;
 337 }
 338
 339 =head2 Blocks versus Scripts
 340
 341 The difference between a block and a script is that scripts are closer
 342 to the linguistic notion of a set of characters required to present
 343 languages, while block is more of an artifact of the Unicode character
 344 numbering and separation into blocks of 256 characters.
 345
 346 For example the Latin B<script> is spread over several B<blocks>, such
 347 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 348 C<Latin Extended-B>.  On the other hand, the Latin script does not
 349 contain all the characters of the C<Basic Latin> block (also known as
 350 the ASCII): it includes only the letters, not for example the digits
 351 or the punctuation.
 352
 353 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 354
 355 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 356
 357 =head2 Matching Scripts and Blocks
 358
 359 Both scripts and blocks can be matched using the regular expression
 360 construct C<\p{In...}> and its negation C<\P{In...}>.
 361
 362 The name of the script or the block comes after the C<In>, for example
 363 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 364 removed from the names for the C<\p{In...}>, for example
 365 C<LatinExtendedA> instead of C<Latin Extended-A>.
 366
 367 There are a few cases where there exists both a script and a block by
 368 the same name, in these cases the block version has C<Block> appended:
 369 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
 370
 371 =head2 charinrange
 372
 373 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 374 can also test whether a code point is in the I<range> as returned by
 375 L</charblock> and L</charscript> or as the values of the hash returned
 376 by L</charblocks> and </charscripts> by using charinrange():
 377
 378     use Unicode::UCD qw(charscript charinrange);
 379
 380     $range = charscript('Hiragana');
 381     print "looks like hiragana\n" if charinrange($range, $code);
 382
 383 =cut
 384
 385 =head2 Unicode::UCD::UnicodeVersion
 386
 387 Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
 388 Database, in other words, the version of the Unicode standard the
 389 database implements.
 390
 391 =cut
 392
 393 my $UNICODEVERSION;
 394
 395 sub UnicodeVersion {
 396     unless (defined $UNICODEVERSION) {
 397         openunicode(\$VERSIONFH, "version");
 398         chomp($UNICODEVERSION = <$VERSIONFH>);
 399         close($VERSIONFH);
 400         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 401             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 402     }
 403     return $UNICODEVERSION;
 404 }
 405
 406 =head2 Implementation Note
 407
 408 The first use of charinfo() opens a read-only filehandle to the Unicode
 409 Character Database (the database is included in the Perl distribution).
 410 The filehandle is then kept open for further queries.
 411
 412 =head1 AUTHOR
 413
 414 Jarkko Hietaniemi
 415
 416 =cut
 417
 418 1;