10 our @ISA = qw(Exporter);
12 our @EXPORT_OK = qw(charinfo
14 charblocks charscripts
23 Unicode::UCD - Unicode character database
27 use Unicode::UCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
30 use Unicode::UCD 'charblock';
31 my $charblock = charblock($codepoint);
33 use Unicode::UCD 'charscript';
34 my $charscript = charscript($codepoint);
36 use Unicode::UCD 'charblocks';
37 my $charblocks = charblocks();
39 use Unicode::UCD 'charscripts';
40 my %charscripts = charscripts();
42 use Unicode::UCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
46 use Unicode::UCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
49 my $unicode_version = Unicode::UCD::UnicodeVersion();
53 The Unicode::UCD module offers a simple interface to the Unicode
67 my ($rfh, @path) = @_;
69 unless (defined $$rfh) {
72 $f = File::Spec->catfile($d, "unicore", @path);
73 last if open($$rfh, $f);
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
85 use Unicode::UCD 'charinfo';
87 my $charinfo = charinfo(0x41);
89 charinfo() returns a reference to a hash that has the following fields
90 as defined by the Unicode standard:
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric this is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
113 If no match is found, a reference to an empty hash is returned.
115 The C<block> property is the same as returned by charblock(). It is
116 not defined in the Unicode Character Database proper (Chapter 4 of the
117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118 (Chapter 14 of TUS3). Similarly for the C<script> property.
120 Note that you cannot do (de)composition and casing based solely on the
121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
122 you will need also the compexcl(), casefold(), and casespec() functions.
129 if ($arg =~ /^[1-9]\d*$/) {
131 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
138 # Lingua::KO::Hangul::Util is not part of the standard distribution
139 # but it will be used if available.
141 eval { require Lingua::KO::Hangul::Util };
142 my $hasHangulUtil = ! $@;
143 if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
147 sub hangul_decomp { # internal: called from charinfo
148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
156 sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
160 sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
164 my @CharinfoRanges = (
166 # [ first, last, coderef to name, coderef to decompose ],
167 # CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
173 # Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175 # Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
178 [ 0xDC00, 0xDFFF, undef, undef ],
179 # The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181 # CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183 # Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185 # Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
194 my $hexk = sprintf("%06X", $code);
195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
197 if ($range->[0] <= $code && $code <= $range->[1]) {
200 $rcode = sprintf("%04X", hex($rcode));
201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
207 openunicode(\$UNICODEFH, "UnicodeData.txt");
208 if (defined $UNICODEFH) {
209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
211 my $line = <$UNICODEFH>;
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
220 )} = split(/;/, $line, -1);
222 $hexk = sprintf("%04X", hex($hexk));
223 if ($prop{code} eq $hexk) {
224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
238 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
243 my $mid = int(($lo+$hi) / 2);
245 if ($table->[$mid]->[0] < $code) {
246 if ($table->[$mid]->[1] >= $code) {
247 return $table->[$mid]->[2];
249 _search($table, $mid + 1, $hi, $code);
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
254 return $table->[$mid]->[2];
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
268 use Unicode::UCD 'charblock';
270 my $charblock = charblock(0x41);
271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
275 my $range = charblock('Armenian');
277 With a B<code point argument> charblock() returns the I<block> the character
278 belongs to, e.g. C<Basic Latin>. Note that not all the character
279 positions within all blocks are defined.
281 See also L</Blocks versus Scripts>.
283 If supplied with an argument that can't be a code point, charblock() tries
284 to do the opposite and interpret the argument as a character block. The
285 return value is a I<range>: an anonymous list of lists that contain
286 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287 code point is in a range using the L</charinrange> function. If the
288 argument is not a known character block, C<undef> is returned.
297 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
299 while (<$BLOCKSFH>) {
300 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
301 my ($lo, $hi) = (hex($1), hex($2));
302 my $subrange = [ $lo, $hi, $3 ];
303 push @BLOCKS, $subrange;
304 push @{$BLOCKS{$3}}, $subrange;
315 _charblocks() unless @BLOCKS;
317 my $code = _getcode($arg);
320 _search(\@BLOCKS, 0, $#BLOCKS, $code);
322 if (exists $BLOCKS{$arg}) {
323 return $BLOCKS{$arg};
332 use Unicode::UCD 'charscript';
334 my $charscript = charscript(0x41);
335 my $charscript = charscript(1234);
336 my $charscript = charscript("U+263a");
338 my $range = charscript('Thai');
340 With a B<code point argument> charscript() returns the I<script> the
341 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
343 See also L</Blocks versus Scripts>.
345 If supplied with an argument that can't be a code point, charscript() tries
346 to do the opposite and interpret the argument as a character script. The
347 return value is a I<range>: an anonymous list of lists that contain
348 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
349 code point is in a range using the L</charinrange> function. If the
350 argument is not a known character script, C<undef> is returned.
359 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
361 while (<$SCRIPTSFH>) {
362 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
363 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
365 $script =~ s/\b(\w)/uc($1)/ge;
366 my $subrange = [ $lo, $hi, $script ];
367 push @SCRIPTS, $subrange;
368 push @{$SCRIPTS{$script}}, $subrange;
372 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
380 _charscripts() unless @SCRIPTS;
382 my $code = _getcode($arg);
385 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
387 if (exists $SCRIPTS{$arg}) {
388 return $SCRIPTS{$arg};
397 use Unicode::UCD 'charblocks';
399 my $charblocks = charblocks();
401 charblocks() returns a reference to a hash with the known block names
402 as the keys, and the code point ranges (see L</charblock>) as the values.
404 See also L</Blocks versus Scripts>.
409 _charblocks() unless %BLOCKS;
415 use Unicode::UCD 'charscripts';
417 my %charscripts = charscripts();
419 charscripts() returns a hash with the known script names as the keys,
420 and the code point ranges (see L</charscript>) as the values.
422 See also L</Blocks versus Scripts>.
427 _charscripts() unless %SCRIPTS;
431 =head2 Blocks versus Scripts
433 The difference between a block and a script is that scripts are closer
434 to the linguistic notion of a set of characters required to present
435 languages, while block is more of an artifact of the Unicode character
436 numbering and separation into blocks of (mostly) 256 characters.
438 For example the Latin B<script> is spread over several B<blocks>, such
439 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
440 C<Latin Extended-B>. On the other hand, the Latin script does not
441 contain all the characters of the C<Basic Latin> block (also known as
442 ASCII): it includes only the letters, and not, for example, the digits
445 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
447 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
449 =head2 Matching Scripts and Blocks
451 Scripts are matched with the regular-expression construct
452 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
453 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
454 any of the 256 code points in the Tibetan block).
456 =head2 Code Point Arguments
458 A I<code point argument> is either a decimal or a hexadecimal scalar
459 designating a Unicode character, or C<U+> followed by hexadecimals
460 designating a Unicode character. In other words, if you want a code
461 point to be interpreted as a hexadecimal number, you must prefix it
462 with either C<0x> or C<U+>, because a string like e.g. C<123> will
463 be interpreted as a decimal code point. Also note that Unicode is
464 B<not> limited to 16 bits (the number of Unicode characters is
465 open-ended, in theory unlimited): you may have more than 4 hexdigits.
469 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
470 can also test whether a code point is in the I<range> as returned by
471 L</charblock> and L</charscript> or as the values of the hash returned
472 by L</charblocks> and L</charscripts> by using charinrange():
474 use Unicode::UCD qw(charscript charinrange);
476 $range = charscript('Hiragana');
477 print "looks like hiragana\n" if charinrange($range, $codepoint);
483 use Unicode::UCD 'compexcl';
485 my $compexcl = compexcl("09dc");
487 The compexcl() returns the composition exclusion (that is, if the
488 character should not be produced during a precomposition) of the
489 character specified by a B<code point argument>.
491 If there is a composition exclusion for the character, true is
492 returned. Otherwise, false is returned.
500 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
502 while (<$COMPEXCLFH>) {
503 if (/^([0-9A-F]+)\s+\#\s+/) {
505 $COMPEXCL{$code} = undef;
515 my $code = _getcode($arg);
516 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
517 unless defined $code;
519 _compexcl() unless %COMPEXCL;
521 return exists $COMPEXCL{$code};
526 use Unicode::UCD 'casefold';
528 my $casefold = casefold("00DF");
530 The casefold() returns the locale-independent case folding of the
531 character specified by a B<code point argument>.
533 If there is a case folding for that character, a reference to a hash
534 with the following fields is returned:
538 code code point with at least four hexdigits
539 status "C", "F", "S", or "I"
540 mapping one or more codes separated by spaces
542 The meaning of the I<status> is as follows:
544 C common case folding, common mappings shared
545 by both simple and full mappings
546 F full case folding, mappings that cause strings
547 to grow in length. Multiple characters are separated
549 S simple case folding, mappings to single characters
550 where different from F
551 I special case for dotted uppercase I and
553 - If this mapping is included, the result is
554 case-insensitive, but dotless and dotted I's
555 are not distinguished
556 - If this mapping is excluded, the result is not
557 fully case-insensitive, but dotless and dotted
558 I's are distinguished
560 If there is no case folding for that character, C<undef> is returned.
562 For more information about case mappings see
563 http://www.unicode.org/unicode/reports/tr21/
571 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
573 while (<$CASEFOLDFH>) {
574 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
576 $CASEFOLD{$code} = { code => $1,
588 my $code = _getcode($arg);
589 croak __PACKAGE__, "::casefold: unknown code '$arg'"
590 unless defined $code;
592 _casefold() unless %CASEFOLD;
594 return $CASEFOLD{$code};
599 use Unicode::UCD 'casespec';
601 my $casespec = casespec("FB00");
603 The casespec() returns the potentially locale-dependent case mapping
604 of the character specified by a B<code point argument>. The mapping
605 may change the length of the string (which the basic Unicode case
606 mappings as returned by charinfo() never do).
608 If there is a case mapping for that character, a reference to a hash
609 with the following fields is returned:
613 code code point with at least four hexdigits
617 condition condition list (may be undef)
619 The C<condition> is optional. Where present, it consists of one or
620 more I<locales> or I<contexts>, separated by spaces (other than as
621 used to separate elements, spaces are to be ignored). A condition
622 list overrides the normal behavior if all of the listed conditions are
623 true. Case distinctions in the condition list are not significant.
624 Conditions preceded by "NON_" represent the negation of the condition.
626 Note that when there are multiple case mapping definitions for a
627 single code point because of different locales, the value returned by
628 casespec() is a hash reference which has the locales as the keys and
629 hash references as described above as the values.
631 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
632 followed by a "_" and a 2-letter ISO language code (possibly followed
633 by a "_" and a variant code). For the lists of those codes,
634 see L<Locale::Country> and L<Locale::Language>.
636 A I<context> is one of the following choices:
638 FINAL The letter is not followed by a letter of
639 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
640 MODERN The mapping is only used for modern text
641 AFTER_i The last base character was "i" (U+0069)
643 For more information about case mappings see
644 http://www.unicode.org/unicode/reports/tr21/
652 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
654 while (<$CASESPECFH>) {
655 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
656 my ($hexcode, $lower, $title, $upper, $condition) =
657 ($1, $2, $3, $4, $5);
658 my $code = hex($hexcode);
659 if (exists $CASESPEC{$code}) {
660 if (exists $CASESPEC{$code}->{code}) {
665 @{$CASESPEC{$code}}{qw(lower
669 if (defined $oldcondition) {
671 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
672 delete $CASESPEC{$code};
673 $CASESPEC{$code}->{$oldlocale} =
678 condition => $oldcondition };
682 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
683 $CASESPEC{$code}->{$locale} =
688 condition => $condition };
695 condition => $condition };
706 my $code = _getcode($arg);
707 croak __PACKAGE__, "::casespec: unknown code '$arg'"
708 unless defined $code;
710 _casespec() unless %CASESPEC;
712 return $CASESPEC{$code};
715 =head2 Unicode::UCD::UnicodeVersion
717 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
718 Character Database, in other words, the version of the Unicode
719 standard the database implements. The version is a string
720 of numbers delimited by dots (C<'.'>).
727 unless (defined $UNICODEVERSION) {
728 openunicode(\$VERSIONFH, "version");
729 chomp($UNICODEVERSION = <$VERSIONFH>);
731 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
732 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
734 return $UNICODEVERSION;
737 =head2 Implementation Note
739 The first use of charinfo() opens a read-only filehandle to the Unicode
740 Character Database (the database is included in the Perl distribution).
741 The filehandle is then kept open for further queries. In other words,
742 if you are wondering where one of your filehandles went, that's where.
746 Does not yet support EBCDIC platforms.