lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '3.1.0';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11 our @EXPORT_OK = qw(charinfo charblock charscript);
  12
  13 use Carp;
  14
  15 =head1 NAME
  16
  17 Unicode::UCD - Unicode character database
  18
  19 =head1 SYNOPSIS
  20
  21     use Unicode::UCD 3.1.0;
  22     # requires that level of the Unicode character database
  23
  24     use Unicode::UCD 'charinfo';
  25     my %charinfo   = charinfo($codepoint);
  26
  27     use Unicode::UCD 'charblock';
  28     my $charblock  = charblock($codepoint);
  29
  30     use Unicode::UCD 'charscript';
  31     my $charscript = charblock($codepoint);
  32
  33 =head1 DESCRIPTION
  34
  35 The Unicode module offers a simple interface to the Unicode Character
  36 Database.
  37
  38 =cut
  39
  40 my $UNICODE;
  41 my $BLOCKS;
  42 my $SCRIPTS;
  43
  44 sub openunicode {
  45     my ($rfh, @path) = @_;
  46     my $f;
  47     unless (defined $$rfh) {
  48         for my $d (@INC) {
  49             use File::Spec;
  50             $f = File::Spec->catfile($d, "unicode", @path);
  51             last if open($$rfh, $f);
  52             undef $f;
  53         }
  54         croak __PACKAGE__, ": failed to find ",
  55               File::Spec->catfile(@path), " in @INC"
  56             unless defined $f;
  57     }
  58     return $f;
  59 }
  60
  61 =head2 charinfo
  62
  63     use Unicode::UCD 'charinfo';
  64
  65     my %charinfo = charinfo(0x41);
  66
  67 charinfo() returns a hash that has the following fields as defined
  68 by the Unicode standard:
  69
  70     key
  71
  72     code             code point with at least four hexdigits
  73     name             name of the character IN UPPER CASE
  74     category         general category of the character
  75     combining        classes used in the Canonical Ordering Algorithm
  76     bidi             bidirectional category
  77     decomposition    character decomposition mapping
  78     decimal          if decimal digit this is the integer numeric value
  79     digit            if digit this is the numeric value
  80     numeric          if numeric is the integer or rational numeric value
  81     mirrored         if mirrored in bidirectional text
  82     unicode10        Unicode 1.0 name if existed and different
  83     comment          ISO 10646 comment field
  84     upper            uppercase equivalent mapping
  85     lower            lowercase equivalent mapping
  86     title            titlecase equivalent mapping
  87
  88     block            block the character belongs to (used in \p{In...})
  89     script           script the character belongs to
  90
  91 If no match is found, an empty hash is returned.
  92
  93 The C<block> property is the same as as returned by charinfo().  It is
  94 not defined in the Unicode Character Database proper (Chapter 4 of the
  95 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
  96 of TUS3).  Similarly for the C<script> property.
  97
  98 Note that you cannot do (de)composition and casing based solely on the
  99 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 100 you will need also the I<Composition Exclusions>, I<Case Folding>, and
 101 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
 102 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
 103
 104 =cut
 105
 106 sub charinfo {
 107     my $code = shift;
 108     my $hexk = sprintf("%04X", $code);
 109
 110     openunicode(\$UNICODE, "Unicode.txt");
 111     if (defined $UNICODE) {
 112         use Search::Dict;
 113         if (look($UNICODE, "$hexk;") >= 0) {
 114             my $line = <$UNICODE>;
 115             chomp $line;
 116             my %prop;
 117             @prop{qw(
 118                      code name category
 119                      combining bidi decomposition
 120                      decimal digit numeric
 121                      mirrored unicode10 comment
 122                      upper lower title
 123                     )} = split(/;/, $line, -1);
 124             if ($prop{code} eq $hexk) {
 125                 $prop{block}  = charblock($code);
 126                 $prop{script} = charscript($code);
 127                 return %prop;
 128             }
 129         }
 130     }
 131     return;
 132 }
 133
 134 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 135     my ($table, $lo, $hi, $code) = @_;
 136
 137     return if $lo > $hi;
 138
 139     my $mid = int(($lo+$hi) / 2);
 140
 141     if ($table->[$mid]->[0] < $code) {
 142         if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
 143             return $table->[$mid]->[2];
 144         } else {
 145             _search($table, $mid + 1, $hi, $code);
 146         }
 147     } elsif ($table->[$mid]->[0] > $code) {
 148         _search($table, $lo, $mid - 1, $code);
 149     } else {
 150         return $table->[$mid]->[2];
 151     }
 152 }
 153
 154 =head2 charblock
 155
 156     use Unicode::UCD 'charblock';
 157
 158     my $charblock = charblock(0x41);
 159
 160 charblock() returns the block the character belongs to, e.g.
 161 C<Basic Latin>.  Note that not all the character positions within all
 162 blocks are defined.
 163
 164 The name is the same name that is used in the C<\p{In...}> construct,
 165 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
 166 away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
 167 instead of C<Latin Extended-A>.
 168
 169 =cut
 170
 171 my @BLOCKS;
 172
 173 sub charblock {
 174     my $code = shift;
 175
 176     unless (@BLOCKS) {
 177         if (openunicode(\$BLOCKS, "Blocks.txt")) {
 178             while (<$BLOCKS>) {
 179                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 180                     push @BLOCKS, [ hex($1), hex($2), $3 ];
 181                 }
 182             }
 183             close($BLOCKS);
 184         }
 185     }
 186
 187     _search(\@BLOCKS, 0, $#BLOCKS, $code);
 188 }
 189
 190 =head2 charscript
 191
 192     use Unicode::UCD 'charscript';
 193
 194     my $charscript = charscript(0x41);
 195
 196 charscript() returns the script the character belongs to, e.g.
 197 C<Latin>, C<Greek>, C<Han>.
 198
 199 Unfortunately, currently (Perl 5.8.0) there is no regular expression
 200 notation for matching scripts as there is for blocks (C<\p{In...}>.
 201
 202 =cut
 203
 204 my @SCRIPTS;
 205
 206 sub charscript {
 207     my $code = shift;
 208
 209     unless (@SCRIPTS) {
 210         if (openunicode(\$SCRIPTS, "Scripts.txt")) {
 211             while (<$SCRIPTS>) {
 212                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 213                     push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
 214                 }
 215             }
 216             close($SCRIPTS);
 217             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 218         }
 219     }
 220
 221     _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 222 }
 223
 224 =head2 charblock versus charscript
 225
 226 The difference between a character block and a script is that scripts
 227 are closer to the linguistic notion of a set of characters required to
 228 present languages, while block is more of an artifact of the Unicode
 229 character numbering.  For example the Latin B<script> is spread over
 230 several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>,
 231 C<Latin Extended-A>, and C<Latin Extended-B>.  On the other hand, the
 232 Latin script does not contain all the characters of the C<Basic Latin>
 233 block (also known as the ASCII): it includes only the letters, not for
 234 example the digits or the punctuation.
 235
 236 For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 237
 238 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 239
 240 Note also that the script names are all in uppercase, e.g. C<HEBREW>,
 241 while the block names are Capitalized and with intermixed spaces,
 242 e.g. C<Yi Syllables>.
 243
 244 Greek
 245 Cyrillic
 246 Armenian
 247 Hebrew
 248 Arabic
 249 Syriac
 250 Thaana
 251 Devanagari
 252 Bengali
 253 Gurmukhi
 254 Gujarati
 255 Oriya
 256 Tamil
 257 Telugu
 258 Kannada
 259 Malayalam
 260 Sinhala
 261 Thai
 262 Lao
 263 Tibetan
 264 Myanmar
 265 Georgian
 266 Ethiopic
 267 Cherokee
 268 Ogham
 269 Runic
 270 Khmer
 271 Hiragana
 272 Katakana
 273 Bopomofo
 274 OldItalic
 275 Gothic
 276 Deseret
 277
 278 =head1 IMPLEMENTATION NOTE
 279
 280 The first use of charinfo() opens a read-only filehandle to the Unicode
 281 Character Database (the database is included in the Perl distribution).
 282 The filehandle is then kept open for further queries.
 283
 284 =head1 AUTHOR
 285
 286 Jarkko Hietaniemi
 287
 288 =cut
 289
 290 1;