lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = v3.1.0;
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11 our @EXPORT_OK = qw(charinfo charblock);
  12
  13 use Carp;
  14
  15 =head1 NAME
  16
  17 Unicode - Unicode character database
  18
  19 =head1 SYNOPSIS
  20
  21     use Unicode::UCD 3.1.0;
  22     # requires that level of the Unicode character database
  23
  24     use Unicode::UCD 'charinfo';
  25     my %charinfo  = charinfo($codepoint);
  26
  27     use Unicode::UCD 'charblock';
  28     my $charblock = charblock($codepoint);
  29
  30 =head1 DESCRIPTION
  31
  32 The Unicode module offers a simple interface to the Unicode Character
  33 Database.
  34
  35 =cut
  36
  37 my $UNICODE;
  38 my $BLOCKS;
  39
  40 sub openunicode {
  41     my ($rfh, @path) = @_;
  42     my $f;
  43     unless (defined $$rfh) {
  44         for my $d (@INC) {
  45             use File::Spec;
  46             $f = File::Spec->catfile($d, "unicode", @path);
  47             if (open($$rfh, $f)) {
  48                 last;
  49             } else {
  50                 croak __PACKAGE__, ": open '$f' failed: $!\n";
  51             }
  52         }
  53         croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
  54             unless defined $rfh;
  55     }
  56     return $f;
  57 }
  58
  59 =head2 charinfo
  60
  61     use Unicode::UCD 'charinfo';
  62
  63     my %charinfo = charinfo(0x41);
  64
  65 charinfo() returns a hash that has the following fields as defined
  66 by the Unicode standard:
  67
  68     key
  69
  70     code             code point with at least four hexdigits
  71     name             name of the character IN UPPER CASE
  72     category         general category of the character
  73     combining        classes used in the Canonical Ordering Algorithm
  74     bidi             bidirectional category
  75     decomposition    character decomposition mapping
  76     decimal          if decimal digit this is the integer numeric value
  77     digit            if digit this is the numeric value
  78     numeric          if numeric is the integer or rational numeric value
  79     mirrored         if mirrored in bidirectional text
  80     unicode10        Unicode 1.0 name if existed and different
  81     comment          ISO 10646 comment field
  82     upper            uppercase equivalent mapping
  83     lower            lowercase equivalent mapping
  84     title            titlecase equivalent mapping
  85     block            block the character belongs to (used in \p{In...})
  86
  87 If no match is found, an empty hash is returned.
  88
  89 The C<block> property is the same as as returned by charinfo().
  90 (It is not defined in the Unicode Character Database proper but
  91 instead in an auxiliary database.)
  92
  93 =cut
  94
  95 sub charinfo {
  96     my $code = shift;
  97     my $hexk = sprintf("%04X", $code);
  98
  99     openunicode(\$UNICODE, "Unicode.txt");
 100     if (defined $UNICODE) {
 101         use Search::Dict;
 102         if (look($UNICODE, "$hexk;") >= 0) {
 103             my $line = <$UNICODE>;
 104             chomp $line;
 105             my %prop;
 106             @prop{qw(
 107                      code name category
 108                      combining bidi decomposition
 109                      decimal digit numeric
 110                      mirrored unicode10 comment
 111                      upper lower title
 112                     )} = split(/;/, $line, -1);
 113             if ($prop{code} eq $hexk) {
 114                 $prop{block} = charblock($code);
 115                 return %prop;
 116             }
 117         }
 118     }
 119     return;
 120 }
 121
 122 =head2 charbloc
 123
 124     use Unicode::UCD 'charblock';
 125
 126     my $charblock = charblock(0x41);
 127
 128 charblock() returns the block the character belongs to, e.g.
 129 C<Basic Latin>.  Note that not all the character positions within all
 130 block are defined.
 131
 132 The name is the same name that is used in the C<\p{In...}> construct,
 133 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
 134 away from the names for the C<\p{In...}>.
 135
 136 =cut
 137
 138 my @BLOCKS;
 139
 140 sub _charblock {
 141     my ($code, $lo, $hi) = @_;
 142
 143     return if $lo > $hi;
 144
 145     my $mid = int(($lo+$hi) / 2);
 146
 147     if ($BLOCKS[$mid]->[0] < $code) {
 148         if ($BLOCKS[$mid]->[1] >= $code) {
 149             return $BLOCKS[$mid]->[2];
 150         } else {
 151             _charblock($code, $mid + 1, $hi);
 152         }
 153     } elsif ($BLOCKS[$mid]->[0] > $code) {
 154         _charblock($code, $lo, $mid - 1);
 155     } else {
 156         return $BLOCKS[$mid]->[2];
 157     }
 158 }
 159
 160 sub charblock {
 161     my $code = shift;
 162
 163     unless (@BLOCKS) {
 164         if (openunicode(\$BLOCKS, "Blocks.pl")) {
 165             while (<$BLOCKS>) {
 166                 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
 167                     push @BLOCKS, [ hex($1), hex($2), $3 ];
 168                 }
 169             }
 170             close($BLOCKS);
 171         }
 172     }
 173
 174     _charblock($code, 0, $#BLOCKS);
 175 }
 176
 177 =head1 AUTHOR
 178
 179 Jarkko Hietaniemi
 180
 181 =cut
 182
 183 1;