10 our @ISA = qw(Exporter);
12 our @EXPORT_OK = qw(charinfo
14 charblocks charscripts
23 Unicode::UCD - Unicode character database
27 use Unicode::UCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
30 use Unicode::UCD 'charblock';
31 my $charblock = charblock($codepoint);
33 use Unicode::UCD 'charscript';
34 my $charscript = charscript($codepoint);
36 use Unicode::UCD 'charblocks';
37 my $charblocks = charblocks();
39 use Unicode::UCD 'charscripts';
40 my %charscripts = charscripts();
42 use Unicode::UCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
46 use Unicode::UCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
49 my $unicode_version = Unicode::UCD::UnicodeVersion();
53 The Unicode::UCD module offers a simple interface to the Unicode
67 my ($rfh, @path) = @_;
69 unless (defined $$rfh) {
72 $f = File::Spec->catfile($d, "unicore", @path);
73 last if open($$rfh, $f);
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
85 use Unicode::UCD 'charinfo';
87 my $charinfo = charinfo(0x41);
89 charinfo() returns a reference to a hash that has the following fields
90 as defined by the Unicode standard:
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric this is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
113 If no match is found, a reference to an empty hash is returned.
115 The C<block> property is the same as returned by charblock(). It is
116 not defined in the Unicode Character Database proper (Chapter 4 of the
117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118 (Chapter 14 of TUS3). Similarly for the C<script> property.
120 Note that you cannot do (de)composition and casing based solely on the
121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
122 you will need also the compexcl(), casefold(), and casespec() functions.
129 if ($arg =~ /^\d+$/) {
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
138 use Lingua::KO::Hangul::Util;
140 sub hangul_decomp { # internal: called from charinfo
141 my @tmp = decomposeHangul(shift);
143 @tmp == 2 ? sprintf("%04X %04X", @tmp) :
144 @tmp == 3 ? sprintf("%04X %04X %04X", @tmp) :
148 sub han_charname { # internal: called from charinfo
149 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
152 my @CharinfoRanges = (
154 # [ first, last, coderef to name, coderef to decompose ],
155 # CJK Ideographs Extension A
156 [ 0x3400, 0x4DB5, \&han_charname, undef ],
158 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
160 [ 0xAC00, 0xD7A3, \&getHangulName, \&hangul_decomp ],
161 # Non-Private Use High Surrogates
162 [ 0xD800, 0xDB7F, undef, undef ],
163 # Private Use High Surrogates
164 [ 0xDB80, 0xDBFF, undef, undef ],
166 [ 0xDC00, 0xDFFF, undef, undef ],
167 # The Private Use Area
168 [ 0xE000, 0xF8FF, undef, undef ],
169 # CJK Ideographs Extension B
170 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
171 # Plane 15 Private Use Area
172 [ 0xF0000, 0xFFFFD, undef, undef ],
173 # Plane 16 Private Use Area
174 [ 0x100000, 0x10FFFD, undef, undef ],
179 my $code = _getcode($arg);
180 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
181 unless defined $code;
182 my $hexk = sprintf("%06X", $code);
183 my($rcode,$rname,$rdec);
184 foreach my $range (@CharinfoRanges){
185 if ($range->[0] <= $code && $code <= $range->[1]) {
188 $rcode = sprintf("%04X", hex($rcode));
189 $rname = $range->[2] ? $range->[2]->($code) : '';
190 $rdec = $range->[3] ? $range->[3]->($code) : '';
191 $hexk = sprintf("%06X", $range->[0]); # replace by the first
195 openunicode(\$UNICODEFH, "Unicode.txt");
196 if (defined $UNICODEFH) {
197 use Search::Dict 1.02;
198 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
199 my $line = <$UNICODEFH>;
204 combining bidi decomposition
205 decimal digit numeric
206 mirrored unicode10 comment
208 )} = split(/;/, $line, -1);
210 $hexk = sprintf("%04X", hex($hexk));
211 if ($prop{code} eq $hexk) {
212 $prop{block} = charblock($code);
213 $prop{script} = charscript($code);
215 $prop{code} = $rcode;
216 $prop{name} = $rname;
217 $prop{decomposition} = $rdec;
226 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
227 my ($table, $lo, $hi, $code) = @_;
231 my $mid = int(($lo+$hi) / 2);
233 if ($table->[$mid]->[0] < $code) {
234 if ($table->[$mid]->[1] >= $code) {
235 return $table->[$mid]->[2];
237 _search($table, $mid + 1, $hi, $code);
239 } elsif ($table->[$mid]->[0] > $code) {
240 _search($table, $lo, $mid - 1, $code);
242 return $table->[$mid]->[2];
247 my ($range, $arg) = @_;
248 my $code = _getcode($arg);
249 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
250 unless defined $code;
251 _search($range, 0, $#$range, $code);
256 use Unicode::UCD 'charblock';
258 my $charblock = charblock(0x41);
259 my $charblock = charblock(1234);
260 my $charblock = charblock("0x263a");
261 my $charblock = charblock("U+263a");
263 my $range = charblock('Armenian');
265 With a B<code point argument> charblock() returns the I<block> the character
266 belongs to, e.g. C<Basic Latin>. Note that not all the character
267 positions within all blocks are defined.
269 See also L</Blocks versus Scripts>.
271 If supplied with an argument that can't be a code point, charblock()
272 tries to do the opposite and interpret the argument as a character
273 block. The return value is a I<range>: an anonymous list that
274 contains anonymous lists, which in turn contain I<start-of-range>,
275 I<end-of-range> code point pairs. You can test whether a code point
276 is in a range using the L</charinrange> function. If the argument is
277 not a known character block, C<undef> is returned.
286 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
287 while (<$BLOCKSFH>) {
288 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
289 my ($lo, $hi) = (hex($1), hex($2));
290 my $subrange = [ $lo, $hi, $3 ];
291 push @BLOCKS, $subrange;
292 push @{$BLOCKS{$3}}, $subrange;
303 _charblocks() unless @BLOCKS;
305 my $code = _getcode($arg);
308 _search(\@BLOCKS, 0, $#BLOCKS, $code);
310 if (exists $BLOCKS{$arg}) {
311 return $BLOCKS{$arg};
320 use Unicode::UCD 'charscript';
322 my $charscript = charscript(0x41);
323 my $charscript = charscript(1234);
324 my $charscript = charscript("U+263a");
326 my $range = charscript('Thai');
328 With a B<code point argument> charscript() returns the I<script> the
329 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
331 See also L</Blocks versus Scripts>.
333 If supplied with an argument that can't be a code point, charscript()
334 tries to do the opposite and interpret the argument as a character
335 script. The return value is a I<range>: an anonymous list that
336 contains anonymous lists, which in turn contain I<start-of-range>,
337 I<end-of-range> code point pairs. You can test whether a code point
338 is in a range using the L</charinrange> function. If the argument is
339 not a known character script, C<undef> is returned.
348 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
349 while (<$SCRIPTSFH>) {
350 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
351 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
353 $script =~ s/\b(\w)/uc($1)/ge;
354 my $subrange = [ $lo, $hi, $script ];
355 push @SCRIPTS, $subrange;
356 push @{$SCRIPTS{$script}}, $subrange;
360 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
368 _charscripts() unless @SCRIPTS;
370 my $code = _getcode($arg);
373 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
375 if (exists $SCRIPTS{$arg}) {
376 return $SCRIPTS{$arg};
385 use Unicode::UCD 'charblocks';
387 my $charblocks = charblocks();
389 charblocks() returns a reference to a hash with the known block names
390 as the keys, and the code point ranges (see L</charblock>) as the values.
392 See also L</Blocks versus Scripts>.
397 _charblocks() unless %BLOCKS;
403 use Unicode::UCD 'charscripts';
405 my %charscripts = charscripts();
407 charscripts() returns a hash with the known script names as the keys,
408 and the code point ranges (see L</charscript>) as the values.
410 See also L</Blocks versus Scripts>.
415 _charscripts() unless %SCRIPTS;
419 =head2 Blocks versus Scripts
421 The difference between a block and a script is that scripts are closer
422 to the linguistic notion of a set of characters required to present
423 languages, while block is more of an artifact of the Unicode character
424 numbering and separation into blocks of 256 characters.
426 For example the Latin B<script> is spread over several B<blocks>, such
427 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
428 C<Latin Extended-B>. On the other hand, the Latin script does not
429 contain all the characters of the C<Basic Latin> block (also known as
430 ASCII): it includes only the letters, not for example the digits
433 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
435 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
437 =head2 Matching Scripts and Blocks
439 Both scripts and blocks can be matched using the regular expression
440 construct C<\p{In...}> and its negation C<\P{In...}>.
442 The name of the script or the block comes after the C<In>, for example
443 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
444 removed from the names for the C<\p{In...}>, for example
445 C<LatinExtendedA> instead of C<Latin Extended-A>.
447 There are a few cases where there is both a script and a block by the
448 same name, in these cases the block version has C<Block> appended to
449 its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
452 =head2 Code Point Arguments
454 A B<code point argument> is either a decimal or a hexadecimal scalar
455 designating a Unicode character, or "U+" followed by hexadecimals
456 designating a Unicode character. Note that Unicode is B<not> limited
457 to 16 bits (the number of Unicode characters is open-ended, in theory
458 unlimited): you may have more than 4 hexdigits.
462 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
463 can also test whether a code point is in the I<range> as returned by
464 L</charblock> and L</charscript> or as the values of the hash returned
465 by L</charblocks> and L</charscripts> by using charinrange():
467 use Unicode::UCD qw(charscript charinrange);
469 $range = charscript('Hiragana');
470 print "looks like hiragana\n" if charinrange($range, $codepoint);
476 use Unicode::UCD 'compexcl';
478 my $compexcl = compexcl("09dc");
480 The compexcl() returns the composition exclusion (that is, if the
481 character should not be produced during a precomposition) of the
482 character specified by a B<code point argument>.
484 If there is a composition exclusion for the character, true is
485 returned. Otherwise, false is returned.
493 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
494 while (<$COMPEXCLFH>) {
495 if (/^([0-9A-F]+) \# /) {
497 $COMPEXCL{$code} = undef;
507 my $code = _getcode($arg);
508 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
509 unless defined $code;
511 _compexcl() unless %COMPEXCL;
513 return exists $COMPEXCL{$code};
518 use Unicode::UCD 'casefold';
520 my $casefold = casefold("09dc");
522 The casefold() returns the locale-independent case folding of the
523 character specified by a B<code point argument>.
525 If there is a case folding for that character, a reference to a hash
526 with the following fields is returned:
530 code code point with at least four hexdigits
531 status "C", "F", "S", or "I"
532 mapping one or more codes separated by spaces
534 The meaning of the I<status> is as follows:
536 C common case folding, common mappings shared
537 by both simple and full mappings
538 F full case folding, mappings that cause strings
539 to grow in length. Multiple characters are separated
541 S simple case folding, mappings to single characters
542 where different from F
543 I special case for dotted uppercase I and
545 - If this mapping is included, the result is
546 case-insensitive, but dotless and dotted I's
547 are not distinguished
548 - If this mapping is excluded, the result is not
549 fully case-insensitive, but dotless and dotted
550 I's are distinguished
552 If there is no case folding for that character, C<undef> is returned.
554 For more information about case mappings see
555 http://www.unicode.org/unicode/reports/tr21/
563 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
564 while (<$CASEFOLDFH>) {
565 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
567 $CASEFOLD{$code} = { code => $1,
579 my $code = _getcode($arg);
580 croak __PACKAGE__, "::casefold: unknown code '$arg'"
581 unless defined $code;
583 _casefold() unless %CASEFOLD;
585 return $CASEFOLD{$code};
590 use Unicode::UCD 'casespec';
592 my $casespec = casespec("09dc");
594 The casespec() returns the potentially locale-dependent case mapping
595 of the character specified by a B<code point argument>. The mapping
596 may change the length of the string (which the basic Unicode case
597 mappings as returned by charinfo() never do).
599 If there is a case mapping for that character, a reference to a hash
600 with the following fields is returned:
604 code code point with at least four hexdigits
608 condition condition list (may be undef)
610 The C<condition> is optional. Where present, it consists of one or
611 more I<locales> or I<contexts>, separated by spaces (other than as
612 used to separate elements, spaces are to be ignored). A condition
613 list overrides the normal behavior if all of the listed conditions are
614 true. Case distinctions in the condition list are not significant.
615 Conditions preceded by "NON_" represent the negation of the condition
617 Note that when there are multiple case mapping definitions for a
618 single code point because of different locales, the value returned by
619 casespec() is a hash reference which has the locales as the keys and
620 hash references as described above as the values.
622 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
623 followed by a "_" and a 2-letter ISO language code (possibly followed
624 by a "_" and a variant code). For the lists of those codes,
625 see L<Locale::Country> and L<Locale::Language>.
627 A I<context> is one of the following choices:
629 FINAL The letter is not followed by a letter of
630 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
631 MODERN The mapping is only used for modern text
632 AFTER_i The last base character was "i" (U+0069)
634 For more information about case mappings see
635 http://www.unicode.org/unicode/reports/tr21/
643 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
644 while (<$CASESPECFH>) {
645 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
646 my ($hexcode, $lower, $title, $upper, $condition) =
647 ($1, $2, $3, $4, $5);
648 my $code = hex($hexcode);
649 if (exists $CASESPEC{$code}) {
650 if (exists $CASESPEC{$code}->{code}) {
655 @{$CASESPEC{$code}}{qw(lower
660 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
661 if (defined $oldlocale) {
662 delete $CASESPEC{$code};
663 $CASESPEC{$code}->{$oldlocale} =
668 condition => $oldcondition };
670 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
674 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
675 $CASESPEC{$code}->{$locale} =
680 condition => $condition };
687 condition => $condition };
698 my $code = _getcode($arg);
699 croak __PACKAGE__, "::casespec: unknown code '$arg'"
700 unless defined $code;
702 _casespec() unless %CASESPEC;
704 return $CASESPEC{$code};
707 =head2 Unicode::UCD::UnicodeVersion
709 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
710 Character Database, in other words, the version of the Unicode
711 standard the database implements. The version is a string
712 of numbers delimited by dots (C<'.'>).
719 unless (defined $UNICODEVERSION) {
720 openunicode(\$VERSIONFH, "version");
721 chomp($UNICODEVERSION = <$VERSIONFH>);
723 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
724 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
726 return $UNICODEVERSION;
729 =head2 Implementation Note
731 The first use of charinfo() opens a read-only filehandle to the Unicode
732 Character Database (the database is included in the Perl distribution).
733 The filehandle is then kept open for further queries. In other words,
734 if you are wondering where one of your filehandles went, that's where.
738 Does not yet support EBCDIC platforms.