10 our @ISA = qw(Exporter);
12 our @EXPORT_OK = qw(charinfo
14 charblocks charscripts
23 Unicode::UCD - Unicode character database
27 use Unicode::UCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
30 use Unicode::UCD 'charblock';
31 my $charblock = charblock($codepoint);
33 use Unicode::UCD 'charscript';
34 my $charscript = charblock($codepoint);
36 use Unicode::UCD 'charblocks';
37 my $charblocks = charblocks();
39 use Unicode::UCD 'charscripts';
40 my %charscripts = charscripts();
42 use Unicode::UCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
46 use Unicode::UCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
49 my $unicode_version = Unicode::UCD::UnicodeVersion();
53 The Unicode::UCD module offers a simple interface to the Unicode
67 my ($rfh, @path) = @_;
69 unless (defined $$rfh) {
72 $f = File::Spec->catfile($d, "unicore", @path);
73 last if open($$rfh, $f);
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
85 use Unicode::UCD 'charinfo';
87 my $charinfo = charinfo(0x41);
89 charinfo() returns a reference to a hash that has the following fields
90 as defined by the Unicode standard:
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
113 If no match is found, a reference to an empty hash is returned.
115 The C<block> property is the same as as returned by charinfo(). It is
116 not defined in the Unicode Character Database proper (Chapter 4 of the
117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118 (Chapter 14 of TUS3). Similarly for the C<script> property.
120 Note that you cannot do (de)composition and casing based solely on the
121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
122 you will need also the compexcl(), casefold(), and casespec() functions.
129 if ($arg =~ /^\d+$/) {
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
138 # Lingua::KO::Hangul::Util not part of the standard distribution
139 # but it will be used if available.
141 eval { require Lingua::KO::Hangul::Util };
142 my $hasHangulUtil = ! $@;
143 if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
147 sub hangul_decomp { # internal: called from charinfo
148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
156 sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
160 sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
164 my @CharinfoRanges = (
166 # [ first, last, coderef to name, coderef to decompose ],
167 # CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
173 # Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175 # Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
178 [ 0xDC00, 0xDFFF, undef, undef ],
179 # The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181 # CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183 # Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185 # Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
194 my $hexk = sprintf("%06X", $code);
195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
197 if ($range->[0] <= $code && $code <= $range->[1]) {
200 $rcode = sprintf("%04X", hex($rcode));
201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
207 openunicode(\$UNICODEFH, "Unicode.txt");
208 if (defined $UNICODEFH) {
209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
211 my $line = <$UNICODEFH>;
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
220 )} = split(/;/, $line, -1);
222 $hexk = sprintf("%04X", hex($hexk));
223 if ($prop{code} eq $hexk) {
224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
238 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
243 my $mid = int(($lo+$hi) / 2);
245 if ($table->[$mid]->[0] < $code) {
246 if ($table->[$mid]->[1] >= $code) {
247 return $table->[$mid]->[2];
249 _search($table, $mid + 1, $hi, $code);
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
254 return $table->[$mid]->[2];
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
268 use Unicode::UCD 'charblock';
270 my $charblock = charblock(0x41);
271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
275 my $range = charblock('Armenian');
277 With a B<code point argument> charblock() returns the I<block> the character
278 belongs to, e.g. C<Basic Latin>. Note that not all the character
279 positions within all blocks are defined.
281 See also L</Blocks versus Scripts>.
283 If supplied with an argument that can't be a code point, charblock()
284 tries to do the opposite and interpret the argument as a character
285 block. The return value is a I<range>: an anonymous list that
286 contains anonymous lists, which in turn contain I<start-of-range>,
287 I<end-of-range> code point pairs. You can test whether a code point
288 is in a range using the L</charinrange> function. If the argument is
289 not a known charater block, C<undef> is returned.
298 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
299 while (<$BLOCKSFH>) {
300 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
301 my ($lo, $hi) = (hex($1), hex($2));
302 my $subrange = [ $lo, $hi, $3 ];
303 push @BLOCKS, $subrange;
304 push @{$BLOCKS{$3}}, $subrange;
315 _charblocks() unless @BLOCKS;
317 my $code = _getcode($arg);
320 _search(\@BLOCKS, 0, $#BLOCKS, $code);
322 if (exists $BLOCKS{$arg}) {
323 return $BLOCKS{$arg};
332 use Unicode::UCD 'charscript';
334 my $charscript = charscript(0x41);
335 my $charscript = charscript(1234);
336 my $charscript = charscript("U+263a");
338 my $range = charscript('Thai');
340 With a B<code point argument> charscript() returns the I<script> the
341 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
343 See also L</Blocks versus Scripts>.
345 If supplied with an argument that can't be a code point, charscript()
346 tries to do the opposite and interpret the argument as a character
347 script. The return value is a I<range>: an anonymous list that
348 contains anonymous lists, which in turn contain I<start-of-range>,
349 I<end-of-range> code point pairs. You can test whether a code point
350 is in a range using the L</charinrange> function. If the argument is
351 not a known charater script, C<undef> is returned.
360 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
361 while (<$SCRIPTSFH>) {
362 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
363 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
365 $script =~ s/\b(\w)/uc($1)/ge;
366 my $subrange = [ $lo, $hi, $script ];
367 push @SCRIPTS, $subrange;
368 push @{$SCRIPTS{$script}}, $subrange;
372 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
380 _charscripts() unless @SCRIPTS;
382 my $code = _getcode($arg);
385 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
387 if (exists $SCRIPTS{$arg}) {
388 return $SCRIPTS{$arg};
397 use Unicode::UCD 'charblocks';
399 my $charblocks = charblocks();
401 charblocks() returns a reference to a hash with the known block names
402 as the keys, and the code point ranges (see L</charblock>) as the values.
404 See also L</Blocks versus Scripts>.
409 _charblocks() unless %BLOCKS;
415 use Unicode::UCD 'charscripts';
417 my %charscripts = charscripts();
419 charscripts() returns a hash with the known script names as the keys,
420 and the code point ranges (see L</charscript>) as the values.
422 See also L</Blocks versus Scripts>.
427 _charscripts() unless %SCRIPTS;
431 =head2 Blocks versus Scripts
433 The difference between a block and a script is that scripts are closer
434 to the linguistic notion of a set of characters required to present
435 languages, while block is more of an artifact of the Unicode character
436 numbering and separation into blocks of 256 characters.
438 For example the Latin B<script> is spread over several B<blocks>, such
439 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
440 C<Latin Extended-B>. On the other hand, the Latin script does not
441 contain all the characters of the C<Basic Latin> block (also known as
442 the ASCII): it includes only the letters, not for example the digits
445 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
447 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
449 =head2 Matching Scripts and Blocks
451 Both scripts and blocks can be matched using the regular expression
452 construct C<\p{In...}> and its negation C<\P{In...}>.
454 The name of the script or the block comes after the C<In>, for example
455 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
456 removed from the names for the C<\p{In...}>, for example
457 C<LatinExtendedA> instead of C<Latin Extended-A>.
459 There are a few cases where there is both a script and a block by the
460 same name, in these cases the block version has C<Block> appended to
461 its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
464 =head2 Code Point Arguments
466 A <code point argument> is either a decimal or a hexadecimal scalar
467 designating a Unicode character, or "U+" followed by hexadecimals
468 designating a Unicode character. Note that Unicode is B<not> limited
469 to 16 bits (the number of Unicode characters is open-ended, in theory
470 unlimited): you may have more than 4 hexdigits.
474 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
475 can also test whether a code point is in the I<range> as returned by
476 L</charblock> and L</charscript> or as the values of the hash returned
477 by L</charblocks> and L</charscripts> by using charinrange():
479 use Unicode::UCD qw(charscript charinrange);
481 $range = charscript('Hiragana');
482 print "looks like hiragana\n" if charinrange($range, $codepoint);
488 use Unicode::UCD 'compexcl';
490 my $compexcl = compexcl("09dc");
492 The compexcl() returns the composition exclusion (that is, if the
493 character should not be produced during a precomposition) of the
494 character specified by a B<code point argument>.
496 If there is a composition exclusion for the character, true is
497 returned. Otherwise, false is returned.
505 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
506 while (<$COMPEXCLFH>) {
507 if (/^([0-9A-F]+) \# /) {
509 $COMPEXCL{$code} = undef;
519 my $code = _getcode($arg);
520 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
521 unless defined $code;
523 _compexcl() unless %COMPEXCL;
525 return exists $COMPEXCL{$code};
530 use Unicode::UCD 'casefold';
532 my %casefold = casefold("09dc");
534 The casefold() returns the locale-independent case folding of the
535 character specified by a B<code point argument>.
537 If there is a case folding for that character, a reference to a hash
538 with the following fields is returned:
542 code code point with at least four hexdigits
543 status "C", "F", "S", or "I"
544 mapping one or more codes separated by spaces
546 The meaning of the I<status> is as follows:
548 C common case folding, common mappings shared
549 by both simple and full mappings
550 F full case folding, mappings that cause strings
551 to grow in length. Multiple characters are separated
553 S simple case folding, mappings to single characters
554 where different from F
555 I special case for dotted uppercase I and
557 - If this mapping is included, the result is
558 case-insensitive, but dotless and dotted I's
559 are not distinguished
560 - If this mapping is excluded, the result is not
561 fully case-insensitive, but dotless and dotted
562 I's are distinguished
564 If there is no case folding for that character, C<undef> is returned.
566 For more information about case mappings see
567 http://www.unicode.org/unicode/reports/tr21/
575 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
576 while (<$CASEFOLDFH>) {
577 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
579 $CASEFOLD{$code} = { code => $1,
591 my $code = _getcode($arg);
592 croak __PACKAGE__, "::casefold: unknown code '$arg'"
593 unless defined $code;
595 _casefold() unless %CASEFOLD;
597 return $CASEFOLD{$code};
602 use Unicode::UCD 'casespec';
604 my %casespec = casespec("09dc");
606 The casespec() returns the potentially locale-dependent case mapping
607 of the character specified by a B<code point argument>. The mapping
608 may change the length of the string (which the basic Unicode case
609 mappings as returned by charinfo() never do).
611 If there is a case folding for that character, a reference to a hash
612 with the following fields is returned:
616 code code point with at least four hexdigits
620 condition condition list (may be undef)
622 The C<condition> is optional. Where present, it consists of one or
623 more I<locales> or I<contexts>, separated by spaces (other than as
624 used to separate elements, spaces are to be ignored). A condition
625 list overrides the normal behavior if all of the listed conditions are
626 true. Case distinctions in the condition list are not significant.
627 Conditions preceded by "NON_" represent the negation of the condition
629 Note that when there are multiple case folding definitions for a
630 single code point because of different locales, the value returned by
631 casespec() is a hash reference which has the locales as the keys and
632 hash references as described above as the values.
634 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
635 followed by a "_" and a 2-letter ISO language code (possibly followed
636 by a "_" and a variant code). You can find the lists of those codes,
637 see L<Locale::Country> and L<Locale::Language>.
639 A I<context> is one of the following choices:
641 FINAL The letter is not followed by a letter of
642 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
643 MODERN The mapping is only used for modern text
644 AFTER_i The last base character was "i" (U+0069)
646 For more information about case mappings see
647 http://www.unicode.org/unicode/reports/tr21/
655 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
656 while (<$CASESPECFH>) {
657 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
658 my ($hexcode, $lower, $title, $upper, $condition) =
659 ($1, $2, $3, $4, $5);
660 my $code = hex($hexcode);
661 if (exists $CASESPEC{$code}) {
662 if (exists $CASESPEC{$code}->{code}) {
667 @{$CASESPEC{$code}}{qw(lower
672 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
673 if (defined $oldlocale) {
674 delete $CASESPEC{$code};
675 $CASESPEC{$code}->{$oldlocale} =
680 condition => $oldcondition };
682 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
686 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
687 $CASESPEC{$code}->{$locale} =
692 condition => $condition };
699 condition => $condition };
710 my $code = _getcode($arg);
711 croak __PACKAGE__, "::casespec: unknown code '$arg'"
712 unless defined $code;
714 _casespec() unless %CASESPEC;
716 return $CASESPEC{$code};
719 =head2 Unicode::UCD::UnicodeVersion
721 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
722 Character Database, in other words, the version of the Unicode
723 standard the database implements. The version is a string
724 of numbers delimited by dots (C<'.'>).
731 unless (defined $UNICODEVERSION) {
732 openunicode(\$VERSIONFH, "version");
733 chomp($UNICODEVERSION = <$VERSIONFH>);
735 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
736 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
738 return $UNICODEVERSION;
741 =head2 Implementation Note
743 The first use of charinfo() opens a read-only filehandle to the Unicode
744 Character Database (the database is included in the Perl distribution).
745 The filehandle is then kept open for further queries. In other words,
746 if you are wondering where one of your filehandles went, that's where.
750 Does not yet support EBCDIC platforms.