10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo
13 charblocks charscripts
20 Unicode::UCD - Unicode character database
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
30 use Unicode::UCD 'charscript';
31 my $charscript = charscript($codepoint);
35 The Unicode::UCD module offers a simple interface to the Unicode Character
46 my ($rfh, @path) = @_;
48 unless (defined $$rfh) {
51 $f = File::Spec->catfile($d, "unicode", @path);
52 last if open($$rfh, $f);
55 croak __PACKAGE__, ": failed to find ",
56 File::Spec->catfile(@path), " in @INC"
64 use Unicode::UCD 'charinfo';
66 my %charinfo = charinfo(0x41);
68 charinfo() returns a hash that has the following fields as defined
69 by the Unicode standard:
73 code code point with at least four hexdigits
74 name name of the character IN UPPER CASE
75 category general category of the character
76 combining classes used in the Canonical Ordering Algorithm
77 bidi bidirectional category
78 decomposition character decomposition mapping
79 decimal if decimal digit this is the integer numeric value
80 digit if digit this is the numeric value
81 numeric if numeric this is the integer or rational numeric value
82 mirrored if mirrored in bidirectional text
83 unicode10 Unicode 1.0 name if existed and different
84 comment ISO 10646 comment field
85 upper uppercase equivalent mapping
86 lower lowercase equivalent mapping
87 title titlecase equivalent mapping
89 block block the character belongs to (used in \p{In...})
90 script script the character belongs to
92 If no match is found, an empty hash is returned.
94 The C<block> property is the same as returned by charblock(). It is
95 not defined in the Unicode Character Database proper (Chapter 4 of the
96 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
97 of TUS3). Similarly for the C<script> property.
99 Note that you cannot do (de)composition and casing based solely on the
100 above C<decomposition> and C<lower>, C<upper>, and C<title> properties;
101 you will also need the I<Composition Exclusions>, I<Case Folding>, and
102 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
103 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
110 if ($arg =~ /^\d+$/) {
112 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
121 my $code = _getcode($arg);
122 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
123 unless defined $code;
124 my $hexk = sprintf("%04X", $code);
126 openunicode(\$UNICODEFH, "Unicode.txt");
127 if (defined $UNICODEFH) {
129 if (look($UNICODEFH, "$hexk;") >= 0) {
130 my $line = <$UNICODEFH>;
135 combining bidi decomposition
136 decimal digit numeric
137 mirrored unicode10 comment
139 )} = split(/;/, $line, -1);
140 if ($prop{code} eq $hexk) {
141 $prop{block} = charblock($code);
142 $prop{script} = charscript($code);
150 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
151 my ($table, $lo, $hi, $code) = @_;
155 my $mid = int(($lo+$hi) / 2);
157 if ($table->[$mid]->[0] < $code) {
158 if ($table->[$mid]->[1] >= $code) {
159 return $table->[$mid]->[2];
161 _search($table, $mid + 1, $hi, $code);
163 } elsif ($table->[$mid]->[0] > $code) {
164 _search($table, $lo, $mid - 1, $code);
166 return $table->[$mid]->[2];
171 my ($range, $arg) = @_;
172 my $code = _getcode($arg);
173 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
174 unless defined $code;
175 _search($range, 0, $#$range, $code);
180 use Unicode::UCD 'charblock';
182 my $charblock = charblock(0x41);
183 my $charblock = charblock(1234);
184 my $charblock = charblock("0x263a");
185 my $charblock = charblock("U+263a");
187 my $ranges = charblock('Armenian');
189 With a B<code point argument> charblock() returns the block the character
190 belongs to, e.g. C<Basic Latin>. Note that not all the character
191 positions within all blocks are defined. A B<code point argument>
192 is either a decimal or a hexadecimal scalar, or "U+" followed
195 If supplied with an argument that can't be a code point, charblock()
196 tries to do the opposite and interpret the argument as a character
197 block. The return value is a I<range>: an anonymous list that
198 contains anonymous lists, which in turn contain I<start-of-range>,
199 I<end-of-range> code point pairs. You can test whether a code point
200 is in a range using the L</charinrange> function. If the argument is
201 not a known character block, C<undef> is returned.
210 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
211 while (<$BLOCKSFH>) {
212 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
213 my ($lo, $hi) = (hex($1), hex($2));
214 my $subrange = [ $lo, $hi, $3 ];
215 push @BLOCKS, $subrange;
216 push @{$BLOCKS{$3}}, $subrange;
227 _charblocks() unless @BLOCKS;
229 my $code = _getcode($arg);
232 _search(\@BLOCKS, 0, $#BLOCKS, $code);
234 if (exists $BLOCKS{$arg}) {
235 return $BLOCKS{$arg};
244 use Unicode::UCD 'charscript';
246 my $charscript = charscript(0x41);
247 my $charscript = charscript(1234);
248 my $charscript = charscript("U+263a");
250 my $ranges = charscript('Thai');
252 With a B<code point argument> charscript() returns the script the
253 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A B<code point
254 argument> is either a decimal or a hexadecimal scalar, or "U+"
255 followed by hexadecimals.
257 If supplied with an argument that can't be a code point, charscript()
258 tries to do the opposite and interpret the argument as a character
259 script. The return value is a I<range>: an anonymous list that
260 contains anonymous lists, which in turn contain I<start-of-range>,
261 I<end-of-range> code point pairs. You can test whether a code point
262 is in a range using the L</charinrange> function. If the argument is
263 not a known character script, C<undef> is returned.
272 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
273 while (<$SCRIPTSFH>) {
274 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
275 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
277 $script =~ s/\b(\w)/uc($1)/ge;
278 my $subrange = [ $lo, $hi, $script ];
279 push @SCRIPTS, $subrange;
280 push @{$SCRIPTS{$script}}, $subrange;
284 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
292 _charscripts() unless @SCRIPTS;
294 my $code = _getcode($arg);
297 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
299 if (exists $SCRIPTS{$arg}) {
300 return $SCRIPTS{$arg};
309 use Unicode::UCD 'charblocks';
311 my %charblocks = charblocks();
313 charblocks() returns a hash with the known block names as the keys,
314 and the code point ranges (see L</charblock>) as the values.
319 _charblocks() unless @BLOCKS;
325 use Unicode::UCD 'charscripts';
327 my %charscripts = charscripts();
329 charscripts() returns a hash with the known script names as the keys,
330 and the code point ranges (see L</charscript>) as the values.
335 _charscripts() unless @SCRIPTS;
339 =head2 Blocks versus Scripts
341 The difference between a block and a script is that scripts are closer
342 to the linguistic notion of a set of characters required to present
343 languages, while block is more of an artifact of the Unicode character
344 numbering and separation into blocks of 256 characters.
346 For example the Latin B<script> is spread over several B<blocks>, such
347 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
348 C<Latin Extended-B>. On the other hand, the Latin script does not
349 contain all the characters of the C<Basic Latin> block (also known as
350 the ASCII): it includes only the letters, not for example the digits
353 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
355 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
357 =head2 Matching Scripts and Blocks
359 Both scripts and blocks can be matched using the regular expression
360 construct C<\p{In...}> and its negation C<\P{In...}>.
362 The name of the script or the block comes after the C<In>, for example
363 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
364 removed from the names for the C<\p{In...}>, for example
365 C<LatinExtendedA> instead of C<Latin Extended-A>.
367 There are a few cases where there exists both a script and a block by
368 the same name; in these cases the block version has C<Block> appended:
369 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
373 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
374 can also test whether a code point is in the I<range> as returned by
375 L</charblock> and L</charscript> or as the values of the hash returned
376 by L</charblocks> and L</charscripts> by using charinrange():
378 use Unicode::UCD qw(charscript charinrange);
380 $range = charscript('Hiragana');
381 print "looks like hiragana\n" if charinrange($range, $code);
385 =head2 Unicode::UCD::UnicodeVersion
387 Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
388 Database, in other words, the version of the Unicode standard the
396 unless (defined $UNICODEVERSION) {
397 openunicode(\$VERSIONFH, "version");
398 chomp($UNICODEVERSION = <$VERSIONFH>);
400 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
401 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
403 return $UNICODEVERSION;
406 =head2 Implementation Note
408 The first use of charinfo() opens a read-only filehandle to the Unicode
409 Character Database (the database is included in the Perl distribution).
410 The filehandle is then kept open for further queries.