Commit | Line | Data |
561c79ed |
1 | package Unicode::UCD; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6f50a187 |
6 | our $VERSION = '3.1.0'; |
561c79ed |
7 | |
8 | require Exporter; |
9 | |
10 | our @ISA = qw(Exporter); |
e882dd67 |
11 | our @EXPORT_OK = qw(charinfo charblock charscript); |
561c79ed |
12 | |
13 | use Carp; |
14 | |
15 | =head1 NAME |
16 | |
00f2772c |
17 | Unicode::UCD - Unicode character database |
561c79ed |
18 | |
19 | =head1 SYNOPSIS |
20 | |
21 | use Unicode::UCD 3.1.0; |
22 | # requires that level of the Unicode character database |
23 | |
24 | use Unicode::UCD 'charinfo'; |
e882dd67 |
25 | my %charinfo = charinfo($codepoint); |
561c79ed |
26 | |
27 | use Unicode::UCD 'charblock'; |
e882dd67 |
28 | my $charblock = charblock($codepoint); |
29 | |
30 | use Unicode::UCD 'charscript'; |
31 | my $charscript = charblock($codepoint); |
561c79ed |
32 | |
33 | =head1 DESCRIPTION |
34 | |
35 | The Unicode module offers a simple interface to the Unicode Character |
36 | Database. |
37 | |
38 | =cut |
39 | |
40 | my $UNICODE; |
41 | my $BLOCKS; |
e882dd67 |
42 | my $SCRIPTS; |
561c79ed |
43 | |
44 | sub openunicode { |
45 | my ($rfh, @path) = @_; |
46 | my $f; |
47 | unless (defined $$rfh) { |
48 | for my $d (@INC) { |
49 | use File::Spec; |
50 | $f = File::Spec->catfile($d, "unicode", @path); |
32c16050 |
51 | last if open($$rfh, $f); |
e882dd67 |
52 | undef $f; |
561c79ed |
53 | } |
e882dd67 |
54 | croak __PACKAGE__, ": failed to find ", |
55 | File::Spec->catfile(@path), " in @INC" |
56 | unless defined $f; |
561c79ed |
57 | } |
58 | return $f; |
59 | } |
60 | |
61 | =head2 charinfo |
62 | |
63 | use Unicode::UCD 'charinfo'; |
64 | |
65 | my %charinfo = charinfo(0x41); |
66 | |
67 | charinfo() returns a hash that has the following fields as defined |
68 | by the Unicode standard: |
69 | |
70 | key |
71 | |
72 | code code point with at least four hexdigits |
73 | name name of the character IN UPPER CASE |
74 | category general category of the character |
75 | combining classes used in the Canonical Ordering Algorithm |
76 | bidi bidirectional category |
77 | decomposition character decomposition mapping |
78 | decimal if decimal digit this is the integer numeric value |
79 | digit if digit this is the numeric value |
80 | numeric if numeric is the integer or rational numeric value |
81 | mirrored if mirrored in bidirectional text |
82 | unicode10 Unicode 1.0 name if existed and different |
83 | comment ISO 10646 comment field |
84 | upper uppercase equivalent mapping |
85 | lower lowercase equivalent mapping |
86 | title titlecase equivalent mapping |
e882dd67 |
87 | |
561c79ed |
88 | block block the character belongs to (used in \p{In...}) |
e882dd67 |
89 | script script the character belongs to |
561c79ed |
90 | |
91 | If no match is found, an empty hash is returned. |
92 | |
32c16050 |
93 | The C<block> property is the same as as returned by charinfo(). It is |
94 | not defined in the Unicode Character Database proper (Chapter 4 of the |
95 | Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14 |
e882dd67 |
96 | of TUS3). Similarly for the C<script> property. |
32c16050 |
97 | |
98 | Note that you cannot do (de)composition and casing based solely on the |
99 | above C<decomposition> and C<lower>, C<upper>, C<title>, properties, |
e882dd67 |
100 | you will need also the I<Composition Exclusions>, I<Case Folding>, and |
101 | I<SpecialCasing> tables, available as files F<CompExcl.txt>, |
102 | F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution. |
561c79ed |
103 | |
104 | =cut |
105 | |
106 | sub charinfo { |
107 | my $code = shift; |
108 | my $hexk = sprintf("%04X", $code); |
109 | |
110 | openunicode(\$UNICODE, "Unicode.txt"); |
111 | if (defined $UNICODE) { |
112 | use Search::Dict; |
113 | if (look($UNICODE, "$hexk;") >= 0) { |
114 | my $line = <$UNICODE>; |
115 | chomp $line; |
116 | my %prop; |
117 | @prop{qw( |
118 | code name category |
119 | combining bidi decomposition |
120 | decimal digit numeric |
121 | mirrored unicode10 comment |
122 | upper lower title |
123 | )} = split(/;/, $line, -1); |
124 | if ($prop{code} eq $hexk) { |
a196fbfd |
125 | $prop{block} = charblock($code); |
126 | $prop{script} = charscript($code); |
561c79ed |
127 | return %prop; |
128 | } |
129 | } |
130 | } |
131 | return; |
132 | } |
133 | |
e882dd67 |
134 | sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. |
135 | my ($table, $lo, $hi, $code) = @_; |
136 | |
137 | return if $lo > $hi; |
138 | |
139 | my $mid = int(($lo+$hi) / 2); |
140 | |
141 | if ($table->[$mid]->[0] < $code) { |
a196fbfd |
142 | if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) { |
e882dd67 |
143 | return $table->[$mid]->[2]; |
144 | } else { |
145 | _search($table, $mid + 1, $hi, $code); |
146 | } |
147 | } elsif ($table->[$mid]->[0] > $code) { |
148 | _search($table, $lo, $mid - 1, $code); |
149 | } else { |
150 | return $table->[$mid]->[2]; |
151 | } |
152 | } |
153 | |
354a27bf |
154 | =head2 charblock |
561c79ed |
155 | |
156 | use Unicode::UCD 'charblock'; |
157 | |
158 | my $charblock = charblock(0x41); |
159 | |
160 | charblock() returns the block the character belongs to, e.g. |
161 | C<Basic Latin>. Note that not all the character positions within all |
e882dd67 |
162 | blocks are defined. |
561c79ed |
163 | |
164 | The name is the same name that is used in the C<\p{In...}> construct, |
165 | for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished |
e882dd67 |
166 | away from the names for the C<\p{In...}>, for example C<LatinExtendedA> |
167 | instead of C<Latin Extended-A>. |
561c79ed |
168 | |
169 | =cut |
170 | |
171 | my @BLOCKS; |
172 | |
561c79ed |
173 | sub charblock { |
174 | my $code = shift; |
175 | |
176 | unless (@BLOCKS) { |
2796c109 |
177 | if (openunicode(\$BLOCKS, "Blocks.txt")) { |
561c79ed |
178 | while (<$BLOCKS>) { |
2796c109 |
179 | if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { |
561c79ed |
180 | push @BLOCKS, [ hex($1), hex($2), $3 ]; |
181 | } |
182 | } |
183 | close($BLOCKS); |
184 | } |
185 | } |
186 | |
e882dd67 |
187 | _search(\@BLOCKS, 0, $#BLOCKS, $code); |
188 | } |
189 | |
190 | =head2 charscript |
191 | |
192 | use Unicode::UCD 'charscript'; |
193 | |
194 | my $charscript = charscript(0x41); |
195 | |
196 | charscript() returns the script the character belongs to, e.g. |
ad9cab37 |
197 | C<Latin>, C<Greek>, C<Han>. |
e882dd67 |
198 | |
199 | Unfortunately, currently (Perl 5.8.0) there is no regular expression |
200 | notation for matching scripts as there is for blocks (C<\p{In...}>. |
201 | |
202 | =cut |
203 | |
204 | my @SCRIPTS; |
205 | |
206 | sub charscript { |
207 | my $code = shift; |
208 | |
209 | unless (@SCRIPTS) { |
210 | if (openunicode(\$SCRIPTS, "Scripts.txt")) { |
211 | while (<$SCRIPTS>) { |
212 | if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { |
213 | push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ]; |
214 | } |
215 | } |
216 | close($SCRIPTS); |
217 | @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; |
218 | } |
219 | } |
220 | |
221 | _search(\@SCRIPTS, 0, $#SCRIPTS, $code); |
561c79ed |
222 | } |
223 | |
ad9cab37 |
224 | =head2 charblock versus charscript |
225 | |
226 | The difference between a character block and a script is that scripts |
227 | are closer to the linguistic notion of a set of characters required to |
228 | present languages, while block is more of an artifact of the Unicode |
229 | character numbering. For example the Latin B<script> is spread over |
230 | several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>, |
231 | C<Latin Extended-A>, and C<Latin Extended-B>. On the other hand, the |
232 | Latin script does not contain all the characters of the C<Basic Latin> |
233 | block (also known as the ASCII): it includes only the letters, not for |
234 | example the digits or the punctuation. |
235 | |
236 | For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt |
237 | |
238 | For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ |
239 | |
240 | Note also that the script names are all in uppercase, e.g. C<HEBREW>, |
241 | while the block names are Capitalized and with intermixed spaces, |
242 | e.g. C<Yi Syllables>. |
243 | |
2796c109 |
244 | Greek |
245 | Cyrillic |
246 | Armenian |
247 | Hebrew |
248 | Arabic |
249 | Syriac |
250 | Thaana |
251 | Devanagari |
252 | Bengali |
253 | Gurmukhi |
254 | Gujarati |
255 | Oriya |
256 | Tamil |
257 | Telugu |
258 | Kannada |
259 | Malayalam |
260 | Sinhala |
261 | Thai |
262 | Lao |
263 | Tibetan |
264 | Myanmar |
265 | Georgian |
266 | Ethiopic |
267 | Cherokee |
268 | Ogham |
269 | Runic |
270 | Khmer |
271 | Hiragana |
272 | Katakana |
273 | Bopomofo |
274 | OldItalic |
275 | Gothic |
276 | Deseret |
277 | |
e882dd67 |
278 | =head1 IMPLEMENTATION NOTE |
32c16050 |
279 | |
ad9cab37 |
280 | The first use of charinfo() opens a read-only filehandle to the Unicode |
281 | Character Database (the database is included in the Perl distribution). |
282 | The filehandle is then kept open for further queries. |
32c16050 |
283 | |
561c79ed |
284 | =head1 AUTHOR |
285 | |
286 | Jarkko Hietaniemi |
287 | |
288 | =cut |
289 | |
290 | 1; |