8 use Storable qw(dclone);
12 our @ISA = qw(Exporter);
14 our @EXPORT_OK = qw(charinfo
16 charblocks charscripts
25 Unicode::UCD - Unicode character database
29 use Unicode::UCD 'charinfo';
30 my $charinfo = charinfo($codepoint);
32 use Unicode::UCD 'charblock';
33 my $charblock = charblock($codepoint);
35 use Unicode::UCD 'charscript';
36 my $charscript = charscript($codepoint);
38 use Unicode::UCD 'charblocks';
39 my $charblocks = charblocks();
41 use Unicode::UCD 'charscripts';
42 my %charscripts = charscripts();
44 use Unicode::UCD qw(charscript charinrange);
45 my $range = charscript($script);
46 print "looks like $script\n" if charinrange($range, $codepoint);
48 use Unicode::UCD 'compexcl';
49 my $compexcl = compexcl($codepoint);
51 my $unicode_version = Unicode::UCD::UnicodeVersion();
55 The Unicode::UCD module offers a simple interface to the Unicode
69 my ($rfh, @path) = @_;
71 unless (defined $$rfh) {
74 $f = File::Spec->catfile($d, "unicore", @path);
75 last if open($$rfh, $f);
78 croak __PACKAGE__, ": failed to find ",
79 File::Spec->catfile(@path), " in @INC"
87 use Unicode::UCD 'charinfo';
89 my $charinfo = charinfo(0x41);
91 charinfo() returns a reference to a hash that has the following fields
92 as defined by the Unicode standard:
96 code code point with at least four hexdigits
97 name name of the character IN UPPER CASE
98 category general category of the character
99 combining classes used in the Canonical Ordering Algorithm
100 bidi bidirectional category
101 decomposition character decomposition mapping
102 decimal if decimal digit this is the integer numeric value
103 digit if digit this is the numeric value
104 numeric if numeric is the integer or rational numeric value
105 mirrored if mirrored in bidirectional text
106 unicode10 Unicode 1.0 name if existed and different
107 comment ISO 10646 comment field
108 upper uppercase equivalent mapping
109 lower lowercase equivalent mapping
110 title titlecase equivalent mapping
112 block block the character belongs to (used in \p{In...})
113 script script the character belongs to
115 If no match is found, a reference to an empty hash is returned.
117 The C<block> property is the same as returned by charinfo(). It is
118 not defined in the Unicode Character Database proper (Chapter 4 of the
119 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
120 (Chapter 14 of TUS3). Similarly for the C<script> property.
122 Note that you cannot do (de)composition and casing based solely on the
123 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
124 you will need also the compexcl(), casefold(), and casespec() functions.
128 # NB: This function is duplicated in charnames.pm
132 if ($arg =~ /^[1-9]\d*$/) {
134 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
141 # Lingua::KO::Hangul::Util not part of the standard distribution
142 # but it will be used if available.
144 eval { require Lingua::KO::Hangul::Util };
145 my $hasHangulUtil = ! $@;
146 if ($hasHangulUtil) {
147 Lingua::KO::Hangul::Util->import();
150 sub hangul_decomp { # internal: called from charinfo
151 if ($hasHangulUtil) {
152 my @tmp = decomposeHangul(shift);
153 return sprintf("%04X %04X", @tmp) if @tmp == 2;
154 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
159 sub hangul_charname { # internal: called from charinfo
160 return sprintf("HANGUL SYLLABLE-%04X", shift);
163 sub han_charname { # internal: called from charinfo
164 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
167 my @CharinfoRanges = (
169 # [ first, last, coderef to name, coderef to decompose ],
170 # CJK Ideographs Extension A
171 [ 0x3400, 0x4DB5, \&han_charname, undef ],
173 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
175 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
176 # Non-Private Use High Surrogates
177 [ 0xD800, 0xDB7F, undef, undef ],
178 # Private Use High Surrogates
179 [ 0xDB80, 0xDBFF, undef, undef ],
181 [ 0xDC00, 0xDFFF, undef, undef ],
182 # The Private Use Area
183 [ 0xE000, 0xF8FF, undef, undef ],
184 # CJK Ideographs Extension B
185 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
186 # Plane 15 Private Use Area
187 [ 0xF0000, 0xFFFFD, undef, undef ],
188 # Plane 16 Private Use Area
189 [ 0x100000, 0x10FFFD, undef, undef ],
194 my $code = _getcode($arg);
195 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
196 unless defined $code;
197 my $hexk = sprintf("%06X", $code);
198 my($rcode,$rname,$rdec);
199 foreach my $range (@CharinfoRanges){
200 if ($range->[0] <= $code && $code <= $range->[1]) {
203 $rcode = sprintf("%04X", hex($rcode));
204 $rname = $range->[2] ? $range->[2]->($code) : '';
205 $rdec = $range->[3] ? $range->[3]->($code) : '';
206 $hexk = sprintf("%06X", $range->[0]); # replace by the first
210 openunicode(\$UNICODEFH, "UnicodeData.txt");
211 if (defined $UNICODEFH) {
212 use Search::Dict 1.02;
213 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
214 my $line = <$UNICODEFH>;
215 return unless defined $line;
220 combining bidi decomposition
221 decimal digit numeric
222 mirrored unicode10 comment
224 )} = split(/;/, $line, -1);
226 $hexk = sprintf("%04X", hex($hexk));
227 if ($prop{code} eq $hexk) {
228 $prop{block} = charblock($code);
229 $prop{script} = charscript($code);
231 $prop{code} = $rcode;
232 $prop{name} = $rname;
233 $prop{decomposition} = $rdec;
242 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
243 my ($table, $lo, $hi, $code) = @_;
247 my $mid = int(($lo+$hi) / 2);
249 if ($table->[$mid]->[0] < $code) {
250 if ($table->[$mid]->[1] >= $code) {
251 return $table->[$mid]->[2];
253 _search($table, $mid + 1, $hi, $code);
255 } elsif ($table->[$mid]->[0] > $code) {
256 _search($table, $lo, $mid - 1, $code);
258 return $table->[$mid]->[2];
263 my ($range, $arg) = @_;
264 my $code = _getcode($arg);
265 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
266 unless defined $code;
267 _search($range, 0, $#$range, $code);
272 use Unicode::UCD 'charblock';
274 my $charblock = charblock(0x41);
275 my $charblock = charblock(1234);
276 my $charblock = charblock("0x263a");
277 my $charblock = charblock("U+263a");
279 my $range = charblock('Armenian');
281 With a B<code point argument> charblock() returns the I<block> the character
282 belongs to, e.g. C<Basic Latin>. Note that not all the character
283 positions within all blocks are defined.
285 See also L</Blocks versus Scripts>.
287 If supplied with an argument that can't be a code point, charblock() tries
288 to do the opposite and interpret the argument as a character block. The
289 return value is a I<range>: an anonymous list of lists that contain
290 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
291 code point is in a range using the L</charinrange> function. If the
292 argument is not a known charater block, C<undef> is returned.
301 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
303 while (<$BLOCKSFH>) {
304 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
305 my ($lo, $hi) = (hex($1), hex($2));
306 my $subrange = [ $lo, $hi, $3 ];
307 push @BLOCKS, $subrange;
308 push @{$BLOCKS{$3}}, $subrange;
319 _charblocks() unless @BLOCKS;
321 my $code = _getcode($arg);
324 _search(\@BLOCKS, 0, $#BLOCKS, $code);
326 if (exists $BLOCKS{$arg}) {
327 return dclone $BLOCKS{$arg};
336 use Unicode::UCD 'charscript';
338 my $charscript = charscript(0x41);
339 my $charscript = charscript(1234);
340 my $charscript = charscript("U+263a");
342 my $range = charscript('Thai');
344 With a B<code point argument> charscript() returns the I<script> the
345 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
347 See also L</Blocks versus Scripts>.
349 If supplied with an argument that can't be a code point, charscript() tries
350 to do the opposite and interpret the argument as a character script. The
351 return value is a I<range>: an anonymous list of lists that contain
352 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
353 code point is in a range using the L</charinrange> function. If the
354 argument is not a known charater script, C<undef> is returned.
363 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
365 while (<$SCRIPTSFH>) {
366 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
367 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
369 $script =~ s/\b(\w)/uc($1)/ge;
370 my $subrange = [ $lo, $hi, $script ];
371 push @SCRIPTS, $subrange;
372 push @{$SCRIPTS{$script}}, $subrange;
376 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
384 _charscripts() unless @SCRIPTS;
386 my $code = _getcode($arg);
389 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
391 if (exists $SCRIPTS{$arg}) {
392 return dclone $SCRIPTS{$arg};
401 use Unicode::UCD 'charblocks';
403 my $charblocks = charblocks();
405 charblocks() returns a reference to a hash with the known block names
406 as the keys, and the code point ranges (see L</charblock>) as the values.
408 See also L</Blocks versus Scripts>.
413 _charblocks() unless %BLOCKS;
414 return dclone \%BLOCKS;
419 use Unicode::UCD 'charscripts';
421 my %charscripts = charscripts();
423 charscripts() returns a hash with the known script names as the keys,
424 and the code point ranges (see L</charscript>) as the values.
426 See also L</Blocks versus Scripts>.
431 _charscripts() unless %SCRIPTS;
432 return dclone \%SCRIPTS;
435 =head2 Blocks versus Scripts
437 The difference between a block and a script is that scripts are closer
438 to the linguistic notion of a set of characters required to present
439 languages, while block is more of an artifact of the Unicode character
440 numbering and separation into blocks of (mostly) 256 characters.
442 For example the Latin B<script> is spread over several B<blocks>, such
443 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
444 C<Latin Extended-B>. On the other hand, the Latin script does not
445 contain all the characters of the C<Basic Latin> block (also known as
446 the ASCII): it includes only the letters, and not, for example, the digits
449 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
451 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
453 =head2 Matching Scripts and Blocks
455 Scripts are matched with the regular-expression construct
456 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
457 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
458 any of the 256 code points in the Tibetan block).
460 =head2 Code Point Arguments
462 A I<code point argument> is either a decimal or a hexadecimal scalar
463 designating a Unicode character, or C<U+> followed by hexadecimals
464 designating a Unicode character. In other words, if you want a code
465 point to be interpreted as a hexadecimal number, you must prefix it
466 with either C<0x> or C<U+>, because a string like e.g. C<123> will
467 be interpreted as a decimal code point. Also note that Unicode is
468 B<not> limited to 16 bits (the number of Unicode characters is
469 open-ended, in theory unlimited): you may have more than 4 hexdigits.
473 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
474 can also test whether a code point is in the I<range> as returned by
475 L</charblock> and L</charscript> or as the values of the hash returned
476 by L</charblocks> and L</charscripts> by using charinrange():
478 use Unicode::UCD qw(charscript charinrange);
480 $range = charscript('Hiragana');
481 print "looks like hiragana\n" if charinrange($range, $codepoint);
487 use Unicode::UCD 'compexcl';
489 my $compexcl = compexcl("09dc");
491 The compexcl() returns the composition exclusion (that is, if the
492 character should not be produced during a precomposition) of the
493 character specified by a B<code point argument>.
495 If there is a composition exclusion for the character, true is
496 returned. Otherwise, false is returned.
504 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
506 while (<$COMPEXCLFH>) {
507 if (/^([0-9A-F]+)\s+\#\s+/) {
509 $COMPEXCL{$code} = undef;
519 my $code = _getcode($arg);
520 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
521 unless defined $code;
523 _compexcl() unless %COMPEXCL;
525 return exists $COMPEXCL{$code};
530 use Unicode::UCD 'casefold';
532 my $casefold = casefold("00DF");
534 The casefold() returns the locale-independent case folding of the
535 character specified by a B<code point argument>.
537 If there is a case folding for that character, a reference to a hash
538 with the following fields is returned:
542 code code point with at least four hexdigits
543 status "C", "F", "S", or "I"
544 mapping one or more codes separated by spaces
546 The meaning of the I<status> is as follows:
548 C common case folding, common mappings shared
549 by both simple and full mappings
550 F full case folding, mappings that cause strings
551 to grow in length. Multiple characters are separated
553 S simple case folding, mappings to single characters
554 where different from F
555 I special case for dotted uppercase I and
557 - If this mapping is included, the result is
558 case-insensitive, but dotless and dotted I's
559 are not distinguished
560 - If this mapping is excluded, the result is not
561 fully case-insensitive, but dotless and dotted
562 I's are distinguished
564 If there is no case folding for that character, C<undef> is returned.
566 For more information about case mappings see
567 http://www.unicode.org/unicode/reports/tr21/
575 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
577 while (<$CASEFOLDFH>) {
578 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
580 $CASEFOLD{$code} = { code => $1,
592 my $code = _getcode($arg);
593 croak __PACKAGE__, "::casefold: unknown code '$arg'"
594 unless defined $code;
596 _casefold() unless %CASEFOLD;
598 return $CASEFOLD{$code};
603 use Unicode::UCD 'casespec';
605 my $casespec = casespec("FB00");
607 The casespec() returns the potentially locale-dependent case mapping
608 of the character specified by a B<code point argument>. The mapping
609 may change the length of the string (which the basic Unicode case
610 mappings as returned by charinfo() never do).
612 If there is a case folding for that character, a reference to a hash
613 with the following fields is returned:
617 code code point with at least four hexdigits
621 condition condition list (may be undef)
623 The C<condition> is optional. Where present, it consists of one or
624 more I<locales> or I<contexts>, separated by spaces (other than as
625 used to separate elements, spaces are to be ignored). A condition
626 list overrides the normal behavior if all of the listed conditions are
627 true. Case distinctions in the condition list are not significant.
628 Conditions preceded by "NON_" represent the negation of the condition.
630 Note that when there are multiple case folding definitions for a
631 single code point because of different locales, the value returned by
632 casespec() is a hash reference which has the locales as the keys and
633 hash references as described above as the values.
635 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
636 followed by a "_" and a 2-letter ISO language code (possibly followed
637 by a "_" and a variant code). You can find the lists of those codes,
638 see L<Locale::Country> and L<Locale::Language>.
640 A I<context> is one of the following choices:
642 FINAL The letter is not followed by a letter of
643 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
644 MODERN The mapping is only used for modern text
645 AFTER_i The last base character was "i" (U+0069)
647 For more information about case mappings see
648 http://www.unicode.org/unicode/reports/tr21/
656 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
658 while (<$CASESPECFH>) {
659 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
660 my ($hexcode, $lower, $title, $upper, $condition) =
661 ($1, $2, $3, $4, $5);
662 my $code = hex($hexcode);
663 if (exists $CASESPEC{$code}) {
664 if (exists $CASESPEC{$code}->{code}) {
669 @{$CASESPEC{$code}}{qw(lower
673 if (defined $oldcondition) {
675 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
676 delete $CASESPEC{$code};
677 $CASESPEC{$code}->{$oldlocale} =
682 condition => $oldcondition };
686 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
687 $CASESPEC{$code}->{$locale} =
692 condition => $condition };
699 condition => $condition };
710 my $code = _getcode($arg);
711 croak __PACKAGE__, "::casespec: unknown code '$arg'"
712 unless defined $code;
714 _casespec() unless %CASESPEC;
716 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
719 =head2 Unicode::UCD::UnicodeVersion
721 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
722 Character Database, in other words, the version of the Unicode
723 standard the database implements. The version is a string
724 of numbers delimited by dots (C<'.'>).
731 unless (defined $UNICODEVERSION) {
732 openunicode(\$VERSIONFH, "version");
733 chomp($UNICODEVERSION = <$VERSIONFH>);
735 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
736 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
738 return $UNICODEVERSION;
741 =head2 Implementation Note
743 The first use of charinfo() opens a read-only filehandle to the Unicode
744 Character Database (the database is included in the Perl distribution).
745 The filehandle is then kept open for further queries. In other words,
746 if you are wondering where one of your filehandles went, that's where.
750 Does not yet support EBCDIC platforms.