lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '3.1.0';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11 our @EXPORT_OK = qw(charinfo charblock charscript);
  12
  13 use Carp;
  14
  15 =head1 NAME
  16
  17 Unicode::UCD - Unicode character database
  18
  19 =head1 SYNOPSIS
  20
  21     use Unicode::UCD 3.1.0;
  22     # requires that level of the Unicode character database
  23
  24     use Unicode::UCD 'charinfo';
  25     my %charinfo   = charinfo($codepoint);
  26
  27     use Unicode::UCD 'charblock';
  28     my $charblock  = charblock($codepoint);
  29
  30     use Unicode::UCD 'charscript';
  31     my $charscript = charblock($codepoint);
  32
  33 =head1 DESCRIPTION
  34
  35 The Unicode module offers a simple interface to the Unicode Character
  36 Database.
  37
  38 =cut
  39
  40 my $UNICODE;
  41 my $BLOCKS;
  42 my $SCRIPTS;
  43
  44 sub openunicode {
  45     my ($rfh, @path) = @_;
  46     my $f;
  47     unless (defined $$rfh) {
  48         for my $d (@INC) {
  49             use File::Spec;
  50             $f = File::Spec->catfile($d, "unicode", @path);
  51             last if open($$rfh, $f);
  52             undef $f;
  53         }
  54         croak __PACKAGE__, ": failed to find ",
  55               File::Spec->catfile(@path), " in @INC"
  56             unless defined $f;
  57     }
  58     return $f;
  59 }
  60
  61 =head2 charinfo
  62
  63     use Unicode::UCD 'charinfo';
  64
  65     my %charinfo = charinfo(0x41);
  66
  67 charinfo() returns a hash that has the following fields as defined
  68 by the Unicode standard:
  69
  70     key
  71
  72     code             code point with at least four hexdigits
  73     name             name of the character IN UPPER CASE
  74     category         general category of the character
  75     combining        classes used in the Canonical Ordering Algorithm
  76     bidi             bidirectional category
  77     decomposition    character decomposition mapping
  78     decimal          if decimal digit this is the integer numeric value
  79     digit            if digit this is the numeric value
  80     numeric          if numeric is the integer or rational numeric value
  81     mirrored         if mirrored in bidirectional text
  82     unicode10        Unicode 1.0 name if existed and different
  83     comment          ISO 10646 comment field
  84     upper            uppercase equivalent mapping
  85     lower            lowercase equivalent mapping
  86     title            titlecase equivalent mapping
  87
  88     block            block the character belongs to (used in \p{In...})
  89     script           script the character belongs to
  90
  91 If no match is found, an empty hash is returned.
  92
  93 The C<block> property is the same as as returned by charinfo().  It is
  94 not defined in the Unicode Character Database proper (Chapter 4 of the
  95 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
  96 of TUS3).  Similarly for the C<script> property.
  97
  98 Note that you cannot do (de)composition and casing based solely on the
  99 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 100 you will need also the I<Composition Exclusions>, I<Case Folding>, and
 101 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
 102 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
 103
 104 =cut
 105
 106 sub charinfo {
 107     my $code = shift;
 108     my $hexk = sprintf("%04X", $code);
 109
 110     openunicode(\$UNICODE, "Unicode.txt");
 111     if (defined $UNICODE) {
 112         use Search::Dict;
 113         if (look($UNICODE, "$hexk;") >= 0) {
 114             my $line = <$UNICODE>;
 115             chomp $line;
 116             my %prop;
 117             @prop{qw(
 118                      code name category
 119                      combining bidi decomposition
 120                      decimal digit numeric
 121                      mirrored unicode10 comment
 122                      upper lower title
 123                     )} = split(/;/, $line, -1);
 124             if ($prop{code} eq $hexk) {
 125                 $prop{block} = charblock($code);
 126                 return %prop;
 127             }
 128         }
 129     }
 130     return;
 131 }
 132
 133 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 134     my ($table, $lo, $hi, $code) = @_;
 135
 136     return if $lo > $hi;
 137
 138     my $mid = int(($lo+$hi) / 2);
 139
 140     if ($table->[$mid]->[0] < $code) {
 141         if ($table->[$mid]->[1] >= $code) {
 142             return $table->[$mid]->[2];
 143         } else {
 144             _search($table, $mid + 1, $hi, $code);
 145         }
 146     } elsif ($table->[$mid]->[0] > $code) {
 147         _search($table, $lo, $mid - 1, $code);
 148     } else {
 149         return $table->[$mid]->[2];
 150     }
 151 }
 152
 153 =head2 charblock
 154
 155     use Unicode::UCD 'charblock';
 156
 157     my $charblock = charblock(0x41);
 158
 159 charblock() returns the block the character belongs to, e.g.
 160 C<Basic Latin>.  Note that not all the character positions within all
 161 blocks are defined.
 162
 163 The name is the same name that is used in the C<\p{In...}> construct,
 164 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
 165 away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
 166 instead of C<Latin Extended-A>.
 167
 168 =cut
 169
 170 my @BLOCKS;
 171
 172 sub charblock {
 173     my $code = shift;
 174
 175     unless (@BLOCKS) {
 176         if (openunicode(\$BLOCKS, "Blocks.pl")) {
 177             while (<$BLOCKS>) {
 178                 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
 179                     push @BLOCKS, [ hex($1), hex($2), $3 ];
 180                 }
 181             }
 182             close($BLOCKS);
 183         }
 184     }
 185
 186     _search(\@BLOCKS, 0, $#BLOCKS, $code);
 187 }
 188
 189 =head2 charscript
 190
 191     use Unicode::UCD 'charscript';
 192
 193     my $charscript = charscript(0x41);
 194
 195 charscript() returns the script the character belongs to, e.g.
 196 C<Latin>, C<Greek>, C<Han>.  Note that not all the character positions
 197 within all scripts are defined.
 198
 199 The difference between a character block and a script is that script
 200 names are closer to the linguistic notion of a set of characters,
 201 while block is more of an artifact of the Unicode character numbering.
 202 For example the Latin B<script> is spread over several B<blocks>.
 203
 204 Note also that the script names are all in uppercase, e.g. C<HEBREW>,
 205 while the block names are Capitalized and with intermixed spaces,
 206 e.g. C<Yi Syllables>.
 207
 208 Unfortunately, currently (Perl 5.8.0) there is no regular expression
 209 notation for matching scripts as there is for blocks (C<\p{In...}>.
 210
 211 =cut
 212
 213 my @SCRIPTS;
 214
 215 sub charscript {
 216     my $code = shift;
 217
 218     unless (@SCRIPTS) {
 219         if (openunicode(\$SCRIPTS, "Scripts.txt")) {
 220             while (<$SCRIPTS>) {
 221                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 222                     push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
 223                 }
 224             }
 225             close($SCRIPTS);
 226             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 227         }
 228     }
 229
 230     _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 231 }
 232
 233 =head1 IMPLEMENTATION NOTE
 234
 235 The first use of L<charinfo> opens a read-only filehandle to the Unicode
 236 Character Database.  The filehandle is kept open for further queries.
 237
 238 =head1 AUTHOR
 239
 240 Jarkko Hietaniemi
 241
 242 =cut
 243
 244 1;