6 our $VERSION = '3.1.0';
10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo charblock);
17 Unicode::UCD - Unicode character database
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
32 The Unicode::UCD module offers a simple interface to the Unicode Character
41 my ($rfh, @path) = @_;
43 unless (defined $$rfh) {
46 $f = File::Spec->catfile($d, "unicode", @path);
47 if (open($$rfh, $f)) {
50 croak __PACKAGE__, ": open '$f' failed: $!\n";
53 croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
61 use Unicode::UCD 'charinfo';
63 my %charinfo = charinfo(0x41);
65 charinfo() returns a hash that has the following fields as defined
66 by the Unicode standard:
70 code code point with at least four hexdigits
71 name name of the character IN UPPER CASE
72 category general category of the character
73 combining classes used in the Canonical Ordering Algorithm
74 bidi bidirectional category
75 decomposition character decomposition mapping
76 decimal if decimal digit this is the integer numeric value
77 digit if digit this is the numeric value
78 numeric if numeric this is the integer or rational numeric value
79 mirrored if mirrored in bidirectional text
80 unicode10 Unicode 1.0 name if existed and different
81 comment ISO 10646 comment field
82 upper uppercase equivalent mapping
83 lower lowercase equivalent mapping
84 title titlecase equivalent mapping
85 block block the character belongs to (used in \p{In...})
87 If no match is found, an empty hash is returned.
89 The C<block> property is the same as returned by charblock().
90 (It is not defined in the Unicode Character Database proper but
91 instead in an auxiliary database.)
97 my $hexk = sprintf("%04X", $code);
99 openunicode(\$UNICODE, "Unicode.txt");
100 if (defined $UNICODE) {
102 if (look($UNICODE, "$hexk;") >= 0) {
103 my $line = <$UNICODE>;
108 combining bidi decomposition
109 decimal digit numeric
110 mirrored unicode10 comment
112 )} = split(/;/, $line, -1);
113 if ($prop{code} eq $hexk) {
114 $prop{block} = charblock($code);
124 use Unicode::UCD 'charblock';
126 my $charblock = charblock(0x41);
128 charblock() returns the block the character belongs to, e.g.
129 C<Basic Latin>. Note that not all the character positions within all
132 The name is the same name that is used in the C<\p{In...}> construct,
133 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are removed
134 from the names for the C<\p{In...}> construct).
141 my ($code, $lo, $hi) = @_;
145 my $mid = int(($lo+$hi) / 2);
147 if ($BLOCKS[$mid]->[0] < $code) {
148 if ($BLOCKS[$mid]->[1] >= $code) {
149 return $BLOCKS[$mid]->[2];
151 _charblock($code, $mid + 1, $hi);
153 } elsif ($BLOCKS[$mid]->[0] > $code) {
154 _charblock($code, $lo, $mid - 1);
156 return $BLOCKS[$mid]->[2];
164 if (openunicode(\$BLOCKS, "Blocks.pl")) {
166 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
167 push @BLOCKS, [ hex($1), hex($2), $3 ];
174 _charblock($code, 0, $#BLOCKS);