Fixing extra -I's with PERL_CORE
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed 1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6f50a187 6our $VERSION = '3.1.0';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
e882dd67 11our @EXPORT_OK = qw(charinfo charblock charscript);
561c79ed 12
13use Carp;
14
15=head1 NAME
16
00f2772c 17Unicode::UCD - Unicode character database
561c79ed 18
19=head1 SYNOPSIS
20
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
23
24 use Unicode::UCD 'charinfo';
e882dd67 25 my %charinfo = charinfo($codepoint);
561c79ed 26
27 use Unicode::UCD 'charblock';
e882dd67 28 my $charblock = charblock($codepoint);
29
30 use Unicode::UCD 'charscript';
31 my $charscript = charblock($codepoint);
561c79ed 32
33=head1 DESCRIPTION
34
35The Unicode module offers a simple interface to the Unicode Character
36Database.
37
38=cut
39
40my $UNICODE;
41my $BLOCKS;
e882dd67 42my $SCRIPTS;
561c79ed 43
44sub openunicode {
45 my ($rfh, @path) = @_;
46 my $f;
47 unless (defined $$rfh) {
48 for my $d (@INC) {
49 use File::Spec;
50 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 51 last if open($$rfh, $f);
e882dd67 52 undef $f;
561c79ed 53 }
e882dd67 54 croak __PACKAGE__, ": failed to find ",
55 File::Spec->catfile(@path), " in @INC"
56 unless defined $f;
561c79ed 57 }
58 return $f;
59}
60
61=head2 charinfo
62
63 use Unicode::UCD 'charinfo';
64
65 my %charinfo = charinfo(0x41);
66
67charinfo() returns a hash that has the following fields as defined
68by the Unicode standard:
69
70 key
71
72 code code point with at least four hexdigits
73 name name of the character IN UPPER CASE
74 category general category of the character
75 combining classes used in the Canonical Ordering Algorithm
76 bidi bidirectional category
77 decomposition character decomposition mapping
78 decimal if decimal digit this is the integer numeric value
79 digit if digit this is the numeric value
80 numeric if numeric is the integer or rational numeric value
81 mirrored if mirrored in bidirectional text
82 unicode10 Unicode 1.0 name if existed and different
83 comment ISO 10646 comment field
84 upper uppercase equivalent mapping
85 lower lowercase equivalent mapping
86 title titlecase equivalent mapping
e882dd67 87
561c79ed 88 block block the character belongs to (used in \p{In...})
e882dd67 89 script script the character belongs to
561c79ed 90
91If no match is found, an empty hash is returned.
92
32c16050 93The C<block> property is the same as as returned by charinfo(). It is
94not defined in the Unicode Character Database proper (Chapter 4 of the
95Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 96of TUS3). Similarly for the C<script> property.
32c16050 97
98Note that you cannot do (de)composition and casing based solely on the
99above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
e882dd67 100you will need also the I<Composition Exclusions>, I<Case Folding>, and
101I<SpecialCasing> tables, available as files F<CompExcl.txt>,
102F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed 103
104=cut
105
106sub charinfo {
107 my $code = shift;
108 my $hexk = sprintf("%04X", $code);
109
110 openunicode(\$UNICODE, "Unicode.txt");
111 if (defined $UNICODE) {
112 use Search::Dict;
113 if (look($UNICODE, "$hexk;") >= 0) {
114 my $line = <$UNICODE>;
115 chomp $line;
116 my %prop;
117 @prop{qw(
118 code name category
119 combining bidi decomposition
120 decimal digit numeric
121 mirrored unicode10 comment
122 upper lower title
123 )} = split(/;/, $line, -1);
124 if ($prop{code} eq $hexk) {
a196fbfd 125 $prop{block} = charblock($code);
126 $prop{script} = charscript($code);
561c79ed 127 return %prop;
128 }
129 }
130 }
131 return;
132}
133
e882dd67 134sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
135 my ($table, $lo, $hi, $code) = @_;
136
137 return if $lo > $hi;
138
139 my $mid = int(($lo+$hi) / 2);
140
141 if ($table->[$mid]->[0] < $code) {
a196fbfd 142 if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
e882dd67 143 return $table->[$mid]->[2];
144 } else {
145 _search($table, $mid + 1, $hi, $code);
146 }
147 } elsif ($table->[$mid]->[0] > $code) {
148 _search($table, $lo, $mid - 1, $code);
149 } else {
150 return $table->[$mid]->[2];
151 }
152}
153
354a27bf 154=head2 charblock
561c79ed 155
156 use Unicode::UCD 'charblock';
157
158 my $charblock = charblock(0x41);
159
160charblock() returns the block the character belongs to, e.g.
161C<Basic Latin>. Note that not all the character positions within all
e882dd67 162blocks are defined.
561c79ed 163
164The name is the same name that is used in the C<\p{In...}> construct,
165for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
e882dd67 166away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
167instead of C<Latin Extended-A>.
561c79ed 168
169=cut
170
171my @BLOCKS;
172
561c79ed 173sub charblock {
174 my $code = shift;
175
176 unless (@BLOCKS) {
2796c109 177 if (openunicode(\$BLOCKS, "Blocks.txt")) {
561c79ed 178 while (<$BLOCKS>) {
2796c109 179 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
561c79ed 180 push @BLOCKS, [ hex($1), hex($2), $3 ];
181 }
182 }
183 close($BLOCKS);
184 }
185 }
186
e882dd67 187 _search(\@BLOCKS, 0, $#BLOCKS, $code);
188}
189
190=head2 charscript
191
192 use Unicode::UCD 'charscript';
193
194 my $charscript = charscript(0x41);
195
196charscript() returns the script the character belongs to, e.g.
ad9cab37 197C<Latin>, C<Greek>, C<Han>.
e882dd67 198
199Unfortunately, currently (Perl 5.8.0) there is no regular expression
200notation for matching scripts as there is for blocks (C<\p{In...}>.
201
202=cut
203
204my @SCRIPTS;
205
206sub charscript {
207 my $code = shift;
208
209 unless (@SCRIPTS) {
210 if (openunicode(\$SCRIPTS, "Scripts.txt")) {
211 while (<$SCRIPTS>) {
212 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
213 push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
214 }
215 }
216 close($SCRIPTS);
217 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
218 }
219 }
220
221 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
561c79ed 222}
223
ad9cab37 224=head2 charblock versus charscript
225
226The difference between a character block and a script is that scripts
227are closer to the linguistic notion of a set of characters required to
228present languages, while block is more of an artifact of the Unicode
229character numbering. For example the Latin B<script> is spread over
230several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>,
231C<Latin Extended-A>, and C<Latin Extended-B>. On the other hand, the
232Latin script does not contain all the characters of the C<Basic Latin>
233block (also known as the ASCII): it includes only the letters, not for
234example the digits or the punctuation.
235
236For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt
237
238For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
239
240Note also that the script names are all in uppercase, e.g. C<HEBREW>,
241while the block names are Capitalized and with intermixed spaces,
242e.g. C<Yi Syllables>.
243
2796c109 244Greek
245Cyrillic
246Armenian
247Hebrew
248Arabic
249Syriac
250Thaana
251Devanagari
252Bengali
253Gurmukhi
254Gujarati
255Oriya
256Tamil
257Telugu
258Kannada
259Malayalam
260Sinhala
261Thai
262Lao
263Tibetan
264Myanmar
265Georgian
266Ethiopic
267Cherokee
268Ogham
269Runic
270Khmer
271Hiragana
272Katakana
273Bopomofo
274OldItalic
275Gothic
276Deseret
277
e882dd67 278=head1 IMPLEMENTATION NOTE
32c16050 279
ad9cab37 280The first use of charinfo() opens a read-only filehandle to the Unicode
281Character Database (the database is included in the Perl distribution).
282The filehandle is then kept open for further queries.
32c16050 283
561c79ed 284=head1 AUTHOR
285
286Jarkko Hietaniemi
287
288=cut
289
2901;