lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '3.1.0';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11 our @EXPORT_OK = qw(charinfo charblock);
  12
  13 use Carp;
  14
  15 =head1 NAME
  16
  17 Unicode::UCD - Unicode character database
  18
  19 =head1 SYNOPSIS
  20
  21     use Unicode::UCD 3.1.0;
  22     # requires that level of the Unicode character database
  23
  24     use Unicode::UCD 'charinfo';
  25     my %charinfo  = charinfo($codepoint);
  26
  27     use Unicode::UCD 'charblock';
  28     my $charblock = charblock($codepoint);
  29
  30 =head1 DESCRIPTION
  31
  32 The Unicode module offers a simple interface to the Unicode Character
  33 Database.
  34
  35 =cut
  36
  37 my $UNICODE;
  38 my $BLOCKS;
  39
  40 sub openunicode {
  41     my ($rfh, @path) = @_;
  42     my $f;
  43     unless (defined $$rfh) {
  44         for my $d (@INC) {
  45             use File::Spec;
  46             $f = File::Spec->catfile($d, "unicode", @path);
  47            next unless -f $f;
  48             if (open($$rfh, $f)) {
  49                 last;
  50             } else {
  51                 croak __PACKAGE__, ": open '$f' failed: $!\n";
  52             }
  53         }
  54         croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
  55             unless defined $rfh;
  56     }
  57     return $f;
  58 }
  59
  60 =head2 charinfo
  61
  62     use Unicode::UCD 'charinfo';
  63
  64     my %charinfo = charinfo(0x41);
  65
  66 charinfo() returns a hash that has the following fields as defined
  67 by the Unicode standard:
  68
  69     key
  70
  71     code             code point with at least four hexdigits
  72     name             name of the character IN UPPER CASE
  73     category         general category of the character
  74     combining        classes used in the Canonical Ordering Algorithm
  75     bidi             bidirectional category
  76     decomposition    character decomposition mapping
  77     decimal          if decimal digit this is the integer numeric value
  78     digit            if digit this is the numeric value
  79     numeric          if numeric is the integer or rational numeric value
  80     mirrored         if mirrored in bidirectional text
  81     unicode10        Unicode 1.0 name if existed and different
  82     comment          ISO 10646 comment field
  83     upper            uppercase equivalent mapping
  84     lower            lowercase equivalent mapping
  85     title            titlecase equivalent mapping
  86     block            block the character belongs to (used in \p{In...})
  87
  88 If no match is found, an empty hash is returned.
  89
  90 The C<block> property is the same as as returned by charinfo().
  91 (It is not defined in the Unicode Character Database proper but
  92 instead in an auxiliary database.)
  93
  94 =cut
  95
  96 sub charinfo {
  97     my $code = shift;
  98     my $hexk = sprintf("%04X", $code);
  99
 100     openunicode(\$UNICODE, "Unicode.txt");
 101     if (defined $UNICODE) {
 102         use Search::Dict;
 103         if (look($UNICODE, "$hexk;") >= 0) {
 104             my $line = <$UNICODE>;
 105             chomp $line;
 106             my %prop;
 107             @prop{qw(
 108                      code name category
 109                      combining bidi decomposition
 110                      decimal digit numeric
 111                      mirrored unicode10 comment
 112                      upper lower title
 113                     )} = split(/;/, $line, -1);
 114             if ($prop{code} eq $hexk) {
 115                 $prop{block} = charblock($code);
 116                 return %prop;
 117             }
 118         }
 119     }
 120     return;
 121 }
 122
 123 =head2 charblock
 124
 125     use Unicode::UCD 'charblock';
 126
 127     my $charblock = charblock(0x41);
 128
 129 charblock() returns the block the character belongs to, e.g.
 130 C<Basic Latin>.  Note that not all the character positions within all
 131 block are defined.
 132
 133 The name is the same name that is used in the C<\p{In...}> construct,
 134 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
 135 away from the names for the C<\p{In...}>.
 136
 137 =cut
 138
 139 my @BLOCKS;
 140
 141 sub _charblock {
 142     my ($code, $lo, $hi) = @_;
 143
 144     return if $lo > $hi;
 145
 146     my $mid = int(($lo+$hi) / 2);
 147
 148     if ($BLOCKS[$mid]->[0] < $code) {
 149         if ($BLOCKS[$mid]->[1] >= $code) {
 150             return $BLOCKS[$mid]->[2];
 151         } else {
 152             _charblock($code, $mid + 1, $hi);
 153         }
 154     } elsif ($BLOCKS[$mid]->[0] > $code) {
 155         _charblock($code, $lo, $mid - 1);
 156     } else {
 157         return $BLOCKS[$mid]->[2];
 158     }
 159 }
 160
 161 sub charblock {
 162     my $code = shift;
 163
 164     unless (@BLOCKS) {
 165         if (openunicode(\$BLOCKS, "Blocks.pl")) {
 166             while (<$BLOCKS>) {
 167                 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
 168                     push @BLOCKS, [ hex($1), hex($2), $3 ];
 169                 }
 170             }
 171             close($BLOCKS);
 172         }
 173     }
 174
 175     _charblock($code, 0, $#BLOCKS);
 176 }
 177
 178 =head1 AUTHOR
 179
 180 Jarkko Hietaniemi
 181
 182 =cut
 183
 184 1;