6 our $VERSION = '3.1.0';
10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo charblock);
17 Unicode::UCD - Unicode character database
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
32 The Unicode::UCD module offers a simple interface to the Unicode Character
41 my ($rfh, @path) = @_;
43 unless (defined $$rfh) {
46 $f = File::Spec->catfile($d, "unicode", @path);
48 if (open($$rfh, $f)) {
51 croak __PACKAGE__, ": open '$f' failed: $!\n";
54 croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
62 use Unicode::UCD 'charinfo';
64 my %charinfo = charinfo(0x41);
66 charinfo() returns a hash that has the following fields as defined
67 by the Unicode standard:
71 code code point with at least four hexdigits
72 name name of the character IN UPPER CASE
73 category general category of the character
74 combining classes used in the Canonical Ordering Algorithm
75 bidi bidirectional category
76 decomposition character decomposition mapping
77 decimal if decimal digit this is the integer numeric value
78 digit if digit this is the numeric value
79 numeric if numeric this is the integer or rational numeric value
80 mirrored if mirrored in bidirectional text
81 unicode10 Unicode 1.0 name if existed and different
82 comment ISO 10646 comment field
83 upper uppercase equivalent mapping
84 lower lowercase equivalent mapping
85 title titlecase equivalent mapping
86 block block the character belongs to (used in \p{In...})
88 If no match is found, an empty hash is returned.
90 The C<block> property is the same as returned by charblock().
91 (It is not defined in the Unicode Character Database proper but
92 instead in an auxiliary database.)
98 my $hexk = sprintf("%04X", $code);
100 openunicode(\$UNICODE, "Unicode.txt");
101 if (defined $UNICODE) {
103 if (look($UNICODE, "$hexk;") >= 0) {
104 my $line = <$UNICODE>;
109 combining bidi decomposition
110 decimal digit numeric
111 mirrored unicode10 comment
113 )} = split(/;/, $line, -1);
114 if ($prop{code} eq $hexk) {
115 $prop{block} = charblock($code);
125 use Unicode::UCD 'charblock';
127 my $charblock = charblock(0x41);
129 charblock() returns the block the character belongs to, e.g.
130 C<Basic Latin>. Note that not all the character positions within all
133 The name is the same name that is used in the C<\p{In...}> construct,
134 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
135 away from the names for the C<\p{In...}> construct).
142 my ($code, $lo, $hi) = @_;
146 my $mid = int(($lo+$hi) / 2);
148 if ($BLOCKS[$mid]->[0] < $code) {
149 if ($BLOCKS[$mid]->[1] >= $code) {
150 return $BLOCKS[$mid]->[2];
152 _charblock($code, $mid + 1, $hi);
154 } elsif ($BLOCKS[$mid]->[0] > $code) {
155 _charblock($code, $lo, $mid - 1);
157 return $BLOCKS[$mid]->[2];
165 if (openunicode(\$BLOCKS, "Blocks.pl")) {
167 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
168 push @BLOCKS, [ hex($1), hex($2), $3 ];
175 _charblock($code, 0, $#BLOCKS);