Commit | Line | Data |
561c79ed |
1 | package Unicode::UCD; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6f50a187 |
6 | our $VERSION = '3.1.0'; |
561c79ed |
7 | |
8 | require Exporter; |
9 | |
10 | our @ISA = qw(Exporter); |
11 | our @EXPORT_OK = qw(charinfo charblock); |
12 | |
13 | use Carp; |
14 | |
15 | =head1 NAME |
16 | |
00f2772c |
17 | Unicode::UCD - Unicode character database |
561c79ed |
18 | |
19 | =head1 SYNOPSIS |
20 | |
21 | use Unicode::UCD 3.1.0; |
22 | # requires that level of the Unicode character database |
23 | |
24 | use Unicode::UCD 'charinfo'; |
25 | my %charinfo = charinfo($codepoint); |
26 | |
27 | use Unicode::UCD 'charblock'; |
28 | my $charblock = charblock($codepoint); |
29 | |
30 | =head1 DESCRIPTION |
31 | |
32 | The Unicode module offers a simple interface to the Unicode Character |
33 | Database. |
34 | |
35 | =cut |
36 | |
37 | my $UNICODE; |
38 | my $BLOCKS; |
39 | |
40 | sub openunicode { |
41 | my ($rfh, @path) = @_; |
42 | my $f; |
43 | unless (defined $$rfh) { |
44 | for my $d (@INC) { |
45 | use File::Spec; |
46 | $f = File::Spec->catfile($d, "unicode", @path); |
14f14a10 |
47 | next unless -f $f; |
561c79ed |
48 | if (open($$rfh, $f)) { |
49 | last; |
50 | } else { |
51 | croak __PACKAGE__, ": open '$f' failed: $!\n"; |
52 | } |
53 | } |
54 | croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n" |
55 | unless defined $rfh; |
56 | } |
57 | return $f; |
58 | } |
59 | |
60 | =head2 charinfo |
61 | |
62 | use Unicode::UCD 'charinfo'; |
63 | |
64 | my %charinfo = charinfo(0x41); |
65 | |
66 | charinfo() returns a hash that has the following fields as defined |
67 | by the Unicode standard: |
68 | |
69 | key |
70 | |
71 | code code point with at least four hexdigits |
72 | name name of the character IN UPPER CASE |
73 | category general category of the character |
74 | combining classes used in the Canonical Ordering Algorithm |
75 | bidi bidirectional category |
76 | decomposition character decomposition mapping |
77 | decimal if decimal digit this is the integer numeric value |
78 | digit if digit this is the numeric value |
79 | numeric if numeric is the integer or rational numeric value |
80 | mirrored if mirrored in bidirectional text |
81 | unicode10 Unicode 1.0 name if existed and different |
82 | comment ISO 10646 comment field |
83 | upper uppercase equivalent mapping |
84 | lower lowercase equivalent mapping |
85 | title titlecase equivalent mapping |
86 | block block the character belongs to (used in \p{In...}) |
87 | |
88 | If no match is found, an empty hash is returned. |
89 | |
90 | The C<block> property is the same as as returned by charinfo(). |
91 | (It is not defined in the Unicode Character Database proper but |
92 | instead in an auxiliary database.) |
93 | |
94 | =cut |
95 | |
96 | sub charinfo { |
97 | my $code = shift; |
98 | my $hexk = sprintf("%04X", $code); |
99 | |
100 | openunicode(\$UNICODE, "Unicode.txt"); |
101 | if (defined $UNICODE) { |
102 | use Search::Dict; |
103 | if (look($UNICODE, "$hexk;") >= 0) { |
104 | my $line = <$UNICODE>; |
105 | chomp $line; |
106 | my %prop; |
107 | @prop{qw( |
108 | code name category |
109 | combining bidi decomposition |
110 | decimal digit numeric |
111 | mirrored unicode10 comment |
112 | upper lower title |
113 | )} = split(/;/, $line, -1); |
114 | if ($prop{code} eq $hexk) { |
115 | $prop{block} = charblock($code); |
116 | return %prop; |
117 | } |
118 | } |
119 | } |
120 | return; |
121 | } |
122 | |
354a27bf |
123 | =head2 charblock |
561c79ed |
124 | |
125 | use Unicode::UCD 'charblock'; |
126 | |
127 | my $charblock = charblock(0x41); |
128 | |
129 | charblock() returns the block the character belongs to, e.g. |
130 | C<Basic Latin>. Note that not all the character positions within all |
131 | block are defined. |
132 | |
133 | The name is the same name that is used in the C<\p{In...}> construct, |
134 | for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished |
135 | away from the names for the C<\p{In...}>. |
136 | |
137 | =cut |
138 | |
139 | my @BLOCKS; |
140 | |
141 | sub _charblock { |
142 | my ($code, $lo, $hi) = @_; |
143 | |
144 | return if $lo > $hi; |
145 | |
146 | my $mid = int(($lo+$hi) / 2); |
147 | |
148 | if ($BLOCKS[$mid]->[0] < $code) { |
149 | if ($BLOCKS[$mid]->[1] >= $code) { |
150 | return $BLOCKS[$mid]->[2]; |
151 | } else { |
152 | _charblock($code, $mid + 1, $hi); |
153 | } |
154 | } elsif ($BLOCKS[$mid]->[0] > $code) { |
155 | _charblock($code, $lo, $mid - 1); |
156 | } else { |
157 | return $BLOCKS[$mid]->[2]; |
158 | } |
159 | } |
160 | |
161 | sub charblock { |
162 | my $code = shift; |
163 | |
164 | unless (@BLOCKS) { |
165 | if (openunicode(\$BLOCKS, "Blocks.pl")) { |
166 | while (<$BLOCKS>) { |
167 | if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) { |
168 | push @BLOCKS, [ hex($1), hex($2), $3 ]; |
169 | } |
170 | } |
171 | close($BLOCKS); |
172 | } |
173 | } |
174 | |
175 | _charblock($code, 0, $#BLOCKS); |
176 | } |
177 | |
178 | =head1 AUTHOR |
179 | |
180 | Jarkko Hietaniemi |
181 | |
182 | =cut |
183 | |
184 | 1; |