10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo
13 charblocks charscripts
22 UnicodeCD - Unicode character database
26 use UnicodeCD 'charinfo';
27 my $charinfo = charinfo($codepoint);
29 use UnicodeCD 'charblock';
30 my $charblock = charblock($codepoint);
32 use UnicodeCD 'charscript';
33 my $charscript = charscript($codepoint);
35 use UnicodeCD 'charblocks';
36 my $charblocks = charblocks();
38 use UnicodeCD 'charscripts';
39 my %charscripts = charscripts();
41 use UnicodeCD qw(charscript charinrange);
42 my $range = charscript($script);
43 print "looks like $script\n" if charinrange($range, $codepoint);
45 use UnicodeCD 'compexcl';
46 my $compexcl = compexcl($codepoint);
48 my $unicode_version = UnicodeCD::UnicodeVersion();
52 The UnicodeCD module offers a simple interface to the Unicode Character
66 my ($rfh, @path) = @_;
68 unless (defined $$rfh) {
71 $f = File::Spec->catfile($d, "unicode", @path);
72 last if open($$rfh, $f);
75 croak __PACKAGE__, ": failed to find ",
76 File::Spec->catfile(@path), " in @INC"
84 use UnicodeCD 'charinfo';
86 my $charinfo = charinfo(0x41);
88 charinfo() returns a reference to a hash that has the following fields
89 as defined by the Unicode standard:
93 code code point with at least four hexdigits
94 name name of the character IN UPPER CASE
95 category general category of the character
96 combining classes used in the Canonical Ordering Algorithm
97 bidi bidirectional category
98 decomposition character decomposition mapping
99 decimal if decimal digit this is the integer numeric value
100 digit if digit this is the numeric value
101 numeric if numeric this is the integer or rational numeric value
102 mirrored if mirrored in bidirectional text
103 unicode10 Unicode 1.0 name if existed and different
104 comment ISO 10646 comment field
105 upper uppercase equivalent mapping
106 lower lowercase equivalent mapping
107 title titlecase equivalent mapping
109 block block the character belongs to (used in \p{In...})
110 script script the character belongs to
112 If no match is found, a reference to an empty hash is returned.
114 The C<block> property is the same as returned by charblock(). It is
115 not defined in the Unicode Character Database proper (Chapter 4 of the
116 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
117 of TUS3). Similarly for the C<script> property.
119 Note that you cannot do (de)composition and casing based solely on the
120 above C<decomposition> and C<lower>, C<upper>, C<title> properties;
121 you will also need the compexcl(), casefold(), and casespec() functions.
128 if ($arg =~ /^\d+$/) {
130 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
139 my $code = _getcode($arg);
140 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
141 unless defined $code;
142 my $hexk = sprintf("%04X", $code);
144 openunicode(\$UNICODEFH, "Unicode.txt");
145 if (defined $UNICODEFH) {
147 if (look($UNICODEFH, "$hexk;") >= 0) {
148 my $line = <$UNICODEFH>;
153 combining bidi decomposition
154 decimal digit numeric
155 mirrored unicode10 comment
157 )} = split(/;/, $line, -1);
158 if ($prop{code} eq $hexk) {
159 $prop{block} = charblock($code);
160 $prop{script} = charscript($code);
168 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
169 my ($table, $lo, $hi, $code) = @_;
173 my $mid = int(($lo+$hi) / 2);
175 if ($table->[$mid]->[0] < $code) {
176 if ($table->[$mid]->[1] >= $code) {
177 return $table->[$mid]->[2];
179 _search($table, $mid + 1, $hi, $code);
181 } elsif ($table->[$mid]->[0] > $code) {
182 _search($table, $lo, $mid - 1, $code);
184 return $table->[$mid]->[2];
189 my ($range, $arg) = @_;
190 my $code = _getcode($arg);
191 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
192 unless defined $code;
193 _search($range, 0, $#$range, $code);
198 use UnicodeCD 'charblock';
200 my $charblock = charblock(0x41);
201 my $charblock = charblock(1234);
202 my $charblock = charblock("0x263a");
203 my $charblock = charblock("U+263a");
205 my $ranges = charblock('Armenian');
207 With a B<code point argument> charblock() returns the block the character
208 belongs to, e.g. C<Basic Latin>. Note that not all the character
209 positions within all blocks are defined.
211 If supplied with an argument that can't be a code point, charblock()
212 tries to do the opposite and interpret the argument as a character
213 block. The return value is a I<range>: an anonymous list that
214 contains anonymous lists, which in turn contain I<start-of-range>,
215 I<end-of-range> code point pairs. You can test whether a code point
216 is in a range using the L</charinrange> function. If the argument is
217 not a known character block, C<undef> is returned.
226 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
227 while (<$BLOCKSFH>) {
228 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
229 my ($lo, $hi) = (hex($1), hex($2));
230 my $subrange = [ $lo, $hi, $3 ];
231 push @BLOCKS, $subrange;
232 push @{$BLOCKS{$3}}, $subrange;
243 _charblocks() unless @BLOCKS;
245 my $code = _getcode($arg);
248 _search(\@BLOCKS, 0, $#BLOCKS, $code);
250 if (exists $BLOCKS{$arg}) {
251 return $BLOCKS{$arg};
260 use UnicodeCD 'charscript';
262 my $charscript = charscript(0x41);
263 my $charscript = charscript(1234);
264 my $charscript = charscript("U+263a");
266 my $ranges = charscript('Thai');
268 With a B<code point argument> charscript() returns the script the
269 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
271 If supplied with an argument that can't be a code point, charscript()
272 tries to do the opposite and interpret the argument as a character
273 script. The return value is a I<range>: an anonymous list that
274 contains anonymous lists, which in turn contain I<start-of-range>,
275 I<end-of-range> code point pairs. You can test whether a code point
276 is in a range using the L</charinrange> function. If the argument is
277 not a known character script, C<undef> is returned.
286 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
287 while (<$SCRIPTSFH>) {
288 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
289 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
291 $script =~ s/\b(\w)/uc($1)/ge;
292 my $subrange = [ $lo, $hi, $script ];
293 push @SCRIPTS, $subrange;
294 push @{$SCRIPTS{$script}}, $subrange;
298 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
306 _charscripts() unless @SCRIPTS;
308 my $code = _getcode($arg);
311 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
313 if (exists $SCRIPTS{$arg}) {
314 return $SCRIPTS{$arg};
323 use UnicodeCD 'charblocks';
325 my $charblocks = charblocks();
327 charblocks() returns a reference to a hash with the known block names
328 as the keys, and the code point ranges (see L</charblock>) as the values.
333 _charblocks() unless %BLOCKS;
339 use UnicodeCD 'charscripts';
341 my %charscripts = charscripts();
343 charscripts() returns a hash with the known script names as the keys,
344 and the code point ranges (see L</charscript>) as the values.
349 _charscripts() unless %SCRIPTS;
353 =head2 Blocks versus Scripts
355 The difference between a block and a script is that scripts are closer
356 to the linguistic notion of a set of characters required to present
357 languages, while a block is more of an artifact of the Unicode character
358 numbering and separation into blocks of 256 characters.
360 For example the Latin B<script> is spread over several B<blocks>, such
361 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
362 C<Latin Extended-B>. On the other hand, the Latin script does not
363 contain all the characters of the C<Basic Latin> block (also known as
364 the ASCII): it includes only the letters, not for example the digits
367 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
369 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
371 =head2 Matching Scripts and Blocks
373 Both scripts and blocks can be matched using the regular expression
374 construct C<\p{In...}> and its negation C<\P{In...}>.
376 The name of the script or the block comes after the C<In>, for example
377 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
378 removed from the names for the C<\p{In...}>, for example
379 C<LatinExtendedA> instead of C<Latin Extended-A>.
381 There are a few cases where there exist both a script and a block by
382 the same name; in these cases the block version has C<Block> appended:
383 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
385 =head2 Code Point Arguments
387 A B<code point argument> is either a decimal or a hexadecimal scalar,
388 or "U+" followed by hexadecimals.
392 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
393 can also test whether a code point is in the I<range> as returned by
394 L</charblock> and L</charscript> or as the values of the hash returned
395 by L</charblocks> and L</charscripts> by using charinrange():
397 use UnicodeCD qw(charscript charinrange);
399 $range = charscript('Hiragana');
400 print "looks like hiragana\n" if charinrange($range, $codepoint);
406 use UnicodeCD 'compexcl';
408 my $compexcl = compexcl("09dc");
410 The compexcl() returns the composition exclusion (that is, if the
411 character should not be produced during a precomposition) of the
412 character specified by a B<code point argument>.
414 If there is a composition exclusion for the character, true is
415 returned. Otherwise, false is returned.
423 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
424 while (<$COMPEXCLFH>) {
425 if (/^([0-9A-F]+) \# /) {
427 $COMPEXCL{$code} = undef;
437 my $code = _getcode($arg);
439 _compexcl() unless %COMPEXCL;
441 return exists $COMPEXCL{$code};
446 use UnicodeCD 'casefold';
448 my $casefold = casefold("09dc");
450 The casefold() returns the locale-independent case folding of the
451 character specified by a B<code point argument>.
453 If there is a case folding for that character, a reference to a hash
454 with the following fields is returned:
458 code code point with at least four hexdigits
459 status "C", "F", "S", or "I"
460 mapping one or more codes separated by spaces
462 The meaning of the I<status> is as follows:
464 C common case folding, common mappings shared
465 by both simple and full mappings
466 F full case folding, mappings that cause strings
467 to grow in length. Multiple characters are separated
469 S simple case folding, mappings to single characters
470 where different from F
471 I special case for dotted uppercase I and
473 - If this mapping is included, the result is
474 case-insensitive, but dotless and dotted I's
475 are not distinguished
476 - If this mapping is excluded, the result is not
477 fully case-insensitive, but dotless and dotted
478 I's are distinguished
480 If there is no case folding for that character, C<undef> is returned.
482 For more information about case mappings see
483 http://www.unicode.org/unicode/reports/tr21/
491 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
492 while (<$CASEFOLDFH>) {
493 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
495 $CASEFOLD{$code} = { code => $1,
507 my $code = _getcode($arg);
509 _casefold() unless %CASEFOLD;
511 return $CASEFOLD{$code};
516 use UnicodeCD 'casespec';
518 my $casespec = casespec("09dc");
520 The casespec() returns the potentially locale-dependent case mapping
521 of the character specified by a B<code point argument>. The mapping
522 may change the length of the string (which the basic Unicode case
523 mappings as returned by charinfo() never do).
525 If there is a special case mapping for that character, a reference to a hash
526 with the following fields is returned:
530 code code point with at least four hexdigits
534 condition condition list (may be undef)
536 The C<condition> is optional. Where present, it consists of one or
537 more I<locales> or I<contexts>, separated by spaces (other than as
538 used to separate elements, spaces are to be ignored). A condition
539 list overrides the normal behavior if all of the listed conditions are
540 true. Case distinctions in the condition list are not significant.
541 Conditions preceded by "NON_" represent the negation of the condition
543 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
544 followed by a "_" and a 2-letter ISO language code (possibly followed
545 by a "_" and a variant code). For the lists of those codes,
546 see L<Locale::Country> and L<Locale::Language>.
548 A I<context> is one of the following choices:
550 FINAL The letter is not followed by a letter of
551 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
552 MODERN The mapping is only used for modern text
553 AFTER_i The last base character was "i" (U+0069)
555 For more information about case mappings see
556 http://www.unicode.org/unicode/reports/tr21/
564 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
565 while (<$CASESPECFH>) {
566 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
568 $CASESPEC{$code} = { code => $1,
582 my $code = _getcode($arg);
584 _casespec() unless %CASESPEC;
586 return $CASESPEC{$code};
589 =head2 UnicodeCD::UnicodeVersion
591 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
592 Database, in other words, the version of the Unicode standard the
600 unless (defined $UNICODEVERSION) {
601 openunicode(\$VERSIONFH, "version");
602 chomp($UNICODEVERSION = <$VERSIONFH>);
604 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
605 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
607 return $UNICODEVERSION;
610 =head2 Implementation Note
612 The first use of charinfo() opens a read-only filehandle to the Unicode
613 Character Database (the database is included in the Perl distribution).
614 The filehandle is then kept open for further queries.