Integrate mainline
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed 1package Unicode::UCD;
2
3use strict;
4use warnings;
5
afd1eb53 6our $VERSION = '0.1';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
afd1eb53 11our @EXPORT_OK = qw(charinfo
12 charblock charscript
13 charblocks charscripts
14 charinrange);
561c79ed 15
16use Carp;
17
18=head1 NAME
19
00f2772c 20Unicode::UCD - Unicode character database
561c79ed 21
22=head1 SYNOPSIS
23
561c79ed 24 use Unicode::UCD 'charinfo';
afd1eb53 25 my %charinfo = charinfo($codepoint);
561c79ed 26
27 use Unicode::UCD 'charblock';
afd1eb53 28 my $charblock = charblock($codepoint);
29
30 use Unicode::UCD 'charscript';
31 my $charscript = charblock($codepoint);
561c79ed 32
33=head1 DESCRIPTION
34
35The Unicode module offers a simple interface to the Unicode Character
36Database.
37
38=cut
39
afd1eb53 40my $UNICODEFH;
41my $BLOCKSFH;
42my $SCRIPTSFH;
43my $VERSIONFH;
561c79ed 44
45sub openunicode {
46 my ($rfh, @path) = @_;
47 my $f;
48 unless (defined $$rfh) {
49 for my $d (@INC) {
50 use File::Spec;
51 $f = File::Spec->catfile($d, "unicode", @path);
afd1eb53 52 last if open($$rfh, $f);
53 undef $f;
561c79ed 54 }
afd1eb53 55 croak __PACKAGE__, ": failed to find ",
56 File::Spec->catfile(@path), " in @INC"
57 unless defined $f;
561c79ed 58 }
59 return $f;
60}
61
62=head2 charinfo
63
64 use Unicode::UCD 'charinfo';
65
66 my %charinfo = charinfo(0x41);
67
68charinfo() returns a hash that has the following fields as defined
69by the Unicode standard:
70
71 key
72
73 code code point with at least four hexdigits
74 name name of the character IN UPPER CASE
75 category general category of the character
76 combining classes used in the Canonical Ordering Algorithm
77 bidi bidirectional category
78 decomposition character decomposition mapping
79 decimal if decimal digit this is the integer numeric value
80 digit if digit this is the numeric value
81 numeric if numeric is the integer or rational numeric value
82 mirrored if mirrored in bidirectional text
83 unicode10 Unicode 1.0 name if existed and different
84 comment ISO 10646 comment field
85 upper uppercase equivalent mapping
86 lower lowercase equivalent mapping
87 title titlecase equivalent mapping
afd1eb53 88
561c79ed 89 block block the character belongs to (used in \p{In...})
afd1eb53 90 script script the character belongs to
561c79ed 91
92If no match is found, an empty hash is returned.
93
afd1eb53 94The C<block> property is the same as as returned by charinfo(). It is
95not defined in the Unicode Character Database proper (Chapter 4 of the
96Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
97of TUS3). Similarly for the C<script> property.
98
99Note that you cannot do (de)composition and casing based solely on the
100above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
101you will need also the I<Composition Exclusions>, I<Case Folding>, and
102I<SpecialCasing> tables, available as files F<CompExcl.txt>,
103F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed 104
105=cut
106
afd1eb53 107sub _getcode {
108 my $arg = shift;
109
110 if ($arg =~ /^\d+$/) {
111 return $arg;
112 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
113 return hex($1);
114 }
115
116 return;
117}
118
561c79ed 119sub charinfo {
afd1eb53 120 my $arg = shift;
121 my $code = _getcode($arg);
122 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
123 unless defined $code;
561c79ed 124 my $hexk = sprintf("%04X", $code);
125
afd1eb53 126 openunicode(\$UNICODEFH, "Unicode.txt");
127 if (defined $UNICODEFH) {
561c79ed 128 use Search::Dict;
afd1eb53 129 if (look($UNICODEFH, "$hexk;") >= 0) {
130 my $line = <$UNICODEFH>;
561c79ed 131 chomp $line;
132 my %prop;
133 @prop{qw(
134 code name category
135 combining bidi decomposition
136 decimal digit numeric
137 mirrored unicode10 comment
138 upper lower title
139 )} = split(/;/, $line, -1);
140 if ($prop{code} eq $hexk) {
afd1eb53 141 $prop{block} = charblock($code);
142 $prop{script} = charscript($code);
561c79ed 143 return %prop;
144 }
145 }
146 }
147 return;
148}
149
afd1eb53 150sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
151 my ($table, $lo, $hi, $code) = @_;
152
153 return if $lo > $hi;
154
155 my $mid = int(($lo+$hi) / 2);
156
157 if ($table->[$mid]->[0] < $code) {
158 if ($table->[$mid]->[1] >= $code) {
159 return $table->[$mid]->[2];
160 } else {
161 _search($table, $mid + 1, $hi, $code);
162 }
163 } elsif ($table->[$mid]->[0] > $code) {
164 _search($table, $lo, $mid - 1, $code);
165 } else {
166 return $table->[$mid]->[2];
167 }
168}
169
170sub charinrange {
171 my ($range, $arg) = @_;
172 my $code = _getcode($arg);
173 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
174 unless defined $code;
175 _search($range, 0, $#$range, $code);
176}
177
354a27bf 178=head2 charblock
561c79ed 179
180 use Unicode::UCD 'charblock';
181
182 my $charblock = charblock(0x41);
afd1eb53 183 my $charblock = charblock(1234);
184 my $charblock = charblock("0x263a");
185 my $charblock = charblock("U+263a");
186
187 my $ranges = charblock('Armenian');
188
189With a B<code point argument> charblock() returns the block the character
190belongs to, e.g. C<Basic Latin>. Note that not all the character
191positions within all blocks are defined. A <code point argument>
192is either a decimal or a hexadecimal scalar, or "U+" followed
193by hexadecimals.
194
195If supplied with an argument that can't be a code point, charblock()
196tries to do the opposite and interpret the argument as a character
197block. The return value is a I<range>: an anonymous list that
198contains anonymous lists, which in turn contain I<start-of-range>,
199I<end-of-range> code point pairs. You can test whether a code point
200is in a range using the L</charinrange> function. If the argument is
201not a known charater block, C<undef> is returned.
561c79ed 202
203=cut
204
205my @BLOCKS;
afd1eb53 206my %BLOCKS;
207
208sub _charblocks {
209 unless (@BLOCKS) {
210 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
211 while (<$BLOCKSFH>) {
212 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
213 my ($lo, $hi) = (hex($1), hex($2));
214 my $subrange = [ $lo, $hi, $3 ];
215 push @BLOCKS, $subrange;
216 push @{$BLOCKS{$3}}, $subrange;
217 }
218 }
219 close($BLOCKSFH);
220 }
221 }
222}
561c79ed 223
afd1eb53 224sub charblock {
225 my $arg = shift;
561c79ed 226
afd1eb53 227 _charblocks() unless @BLOCKS;
561c79ed 228
afd1eb53 229 my $code = _getcode($arg);
561c79ed 230
afd1eb53 231 if (defined $code) {
232 _search(\@BLOCKS, 0, $#BLOCKS, $code);
233 } else {
234 if (exists $BLOCKS{$arg}) {
235 return $BLOCKS{$arg};
561c79ed 236 } else {
afd1eb53 237 return;
561c79ed 238 }
561c79ed 239 }
240}
241
afd1eb53 242=head2 charscript
561c79ed 243
afd1eb53 244 use Unicode::UCD 'charscript';
245
246 my $charscript = charscript(0x41);
247 my $charscript = charscript(1234);
248 my $charscript = charscript("U+263a");
249
250 my $ranges = charscript('Thai');
251
252With a B<code point argument> charscript() returns the script the
253character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A <code point
254argument> is either a decimal or a hexadecimal scalar, or "U+"
255followed by hexadecimals.
256
257If supplied with an argument that can't be a code point, charscript()
258tries to do the opposite and interpret the argument as a character
259script. The return value is a I<range>: an anonymous list that
260contains anonymous lists, which in turn contain I<start-of-range>,
261I<end-of-range> code point pairs. You can test whether a code point
262is in a range using the L</charinrange> function. If the argument is
263not a known charater script, C<undef> is returned.
264
265=cut
266
267my @SCRIPTS;
268my %SCRIPTS;
269
270sub _charscripts {
271 unless (@SCRIPTS) {
272 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
273 while (<$SCRIPTSFH>) {
274 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
275 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
276 my $script = lc($3);
277 $script =~ s/\b(\w)/uc($1)/ge;
278 my $subrange = [ $lo, $hi, $script ];
279 push @SCRIPTS, $subrange;
280 push @{$SCRIPTS{$script}}, $subrange;
561c79ed 281 }
282 }
afd1eb53 283 close($SCRIPTSFH);
284 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
285 }
286 }
287}
288
289sub charscript {
290 my $arg = shift;
291
292 _charscripts() unless @SCRIPTS;
293
294 my $code = _getcode($arg);
295
296 if (defined $code) {
297 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
298 } else {
299 if (exists $SCRIPTS{$arg}) {
300 return $SCRIPTS{$arg};
301 } else {
302 return;
561c79ed 303 }
304 }
afd1eb53 305}
306
307=head2 charblocks
308
309 use Unicode::UCD 'charblocks';
310
311 my %charblocks = charblocks();
312
313charblocks() returns a hash with the known block names as the keys,
314and the code point ranges (see L</charblock>) as the values.
315
316=cut
317
318sub charblocks {
319 _charblocks() unless @BLOCKS;
320 return %BLOCKS;
321}
322
323=head2 charscripts
324
325 use Unicode::UCD 'charscripts';
326
327 my %charscripts = charscripts();
328
329charscripts() returns a hash with the known script names as the keys,
330and the code point ranges (see L</charscript>) as the values.
331
332=cut
333
334sub charscripts {
335 _charscripts() unless @SCRIPTS;
336 return %SCRIPTS;
337}
338
339=head2 Blocks versus Scripts
340
341The difference between a block and a script is that scripts are closer
342to the linguistic notion of a set of characters required to present
343languages, while block is more of an artifact of the Unicode character
344numbering and separation into blocks of 256 characters.
345
346For example the Latin B<script> is spread over several B<blocks>, such
347as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
348C<Latin Extended-B>. On the other hand, the Latin script does not
349contain all the characters of the C<Basic Latin> block (also known as
350the ASCII): it includes only the letters, not for example the digits
351or the punctuation.
352
353For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
354
355For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
356
357=head2 Matching Scripts and Blocks
358
359Both scripts and blocks can be matched using the regular expression
360construct C<\p{In...}> and its negation C<\P{In...}>.
361
362The name of the script or the block comes after the C<In>, for example
363C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
364removed from the names for the C<\p{In...}>, for example
365C<LatinExtendedA> instead of C<Latin Extended-A>.
366
367There are a few cases where there exists both a script and a block by
368the same name, in these cases the block version has C<Block> appended:
369C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
370
371=head2 charinrange
372
373In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
374can also test whether a code point is in the I<range> as returned by
375L</charblock> and L</charscript> or as the values of the hash returned
376by L</charblocks> and </charscripts> by using charinrange():
561c79ed 377
afd1eb53 378 use Unicode::UCD qw(charscript charinrange);
379
380 $range = charscript('Hiragana');
381 print "looks like hiragana\n" if charinrange($range, $code);
382
383=cut
384
385=head2 Unicode::UCD::UnicodeVersion
386
387Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
388Database, in other words, the version of the Unicode standard the
389database implements.
390
391=cut
392
393my $UNICODEVERSION;
394
395sub UnicodeVersion {
396 unless (defined $UNICODEVERSION) {
397 openunicode(\$VERSIONFH, "version");
398 chomp($UNICODEVERSION = <$VERSIONFH>);
399 close($VERSIONFH);
400 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
401 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
402 }
403 return $UNICODEVERSION;
561c79ed 404}
405
afd1eb53 406=head2 Implementation Note
407
408The first use of charinfo() opens a read-only filehandle to the Unicode
409Character Database (the database is included in the Perl distribution).
410The filehandle is then kept open for further queries.
411
561c79ed 412=head1 AUTHOR
413
414Jarkko Hietaniemi
415
416=cut
417
4181;