8 use Storable qw(dclone);
12 our @ISA = qw(Exporter);
14 our @EXPORT_OK = qw(charinfo
16 charblocks charscripts
25 Unicode::UCD - Unicode character database
29 use Unicode::UCD 'charinfo';
30 my $charinfo = charinfo($codepoint);
32 use Unicode::UCD 'charblock';
33 my $charblock = charblock($codepoint);
35 use Unicode::UCD 'charscript';
36 my $charscript = charscript($codepoint);
38 use Unicode::UCD 'charblocks';
39 my $charblocks = charblocks();
41 use Unicode::UCD 'charscripts';
42 my %charscripts = charscripts();
44 use Unicode::UCD qw(charscript charinrange);
45 my $range = charscript($script);
46 print "looks like $script\n" if charinrange($range, $codepoint);
48 use Unicode::UCD 'compexcl';
49 my $compexcl = compexcl($codepoint);
51 my $unicode_version = Unicode::UCD::UnicodeVersion();
55 The Unicode::UCD module offers a simple interface to the Unicode
69 my ($rfh, @path) = @_;
71 unless (defined $$rfh) {
74 $f = File::Spec->catfile($d, "unicore", @path);
75 last if open($$rfh, $f);
78 croak __PACKAGE__, ": failed to find ",
79 File::Spec->catfile(@path), " in @INC"
87 use Unicode::UCD 'charinfo';
89 my $charinfo = charinfo(0x41);
91 charinfo() returns a reference to a hash that has the following fields
92 as defined by the Unicode standard:
96 code code point with at least four hexdigits
97 name name of the character IN UPPER CASE
98 category general category of the character
99 combining classes used in the Canonical Ordering Algorithm
100 bidi bidirectional category
101 decomposition character decomposition mapping
102 decimal if decimal digit this is the integer numeric value
103 digit if digit this is the numeric value
104 numeric if numeric is the integer or rational numeric value
105 mirrored if mirrored in bidirectional text
106 unicode10 Unicode 1.0 name if existed and different
107 comment ISO 10646 comment field
108 upper uppercase equivalent mapping
109 lower lowercase equivalent mapping
110 title titlecase equivalent mapping
112 block block the character belongs to (used in \p{In...})
113 script script the character belongs to
115 If no match is found, a reference to an empty hash is returned.
117 The C<block> property is the same as returned by charinfo(). It is
118 not defined in the Unicode Character Database proper (Chapter 4 of the
119 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
120 (Chapter 14 of TUS3). Similarly for the C<script> property.
122 Note that you cannot do (de)composition and casing based solely on the
123 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
124 you will need also the compexcl(), casefold(), and casespec() functions.
131 if ($arg =~ /^[1-9]\d*$/) {
133 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
140 # Lingua::KO::Hangul::Util not part of the standard distribution
141 # but it will be used if available.
143 eval { require Lingua::KO::Hangul::Util };
144 my $hasHangulUtil = ! $@;
145 if ($hasHangulUtil) {
146 Lingua::KO::Hangul::Util->import();
149 sub hangul_decomp { # internal: called from charinfo
150 if ($hasHangulUtil) {
151 my @tmp = decomposeHangul(shift);
152 return sprintf("%04X %04X", @tmp) if @tmp == 2;
153 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
158 sub hangul_charname { # internal: called from charinfo
159 return sprintf("HANGUL SYLLABLE-%04X", shift);
162 sub han_charname { # internal: called from charinfo
163 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
166 my @CharinfoRanges = (
168 # [ first, last, coderef to name, coderef to decompose ],
169 # CJK Ideographs Extension A
170 [ 0x3400, 0x4DB5, \&han_charname, undef ],
172 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
174 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
175 # Non-Private Use High Surrogates
176 [ 0xD800, 0xDB7F, undef, undef ],
177 # Private Use High Surrogates
178 [ 0xDB80, 0xDBFF, undef, undef ],
180 [ 0xDC00, 0xDFFF, undef, undef ],
181 # The Private Use Area
182 [ 0xE000, 0xF8FF, undef, undef ],
183 # CJK Ideographs Extension B
184 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
185 # Plane 15 Private Use Area
186 [ 0xF0000, 0xFFFFD, undef, undef ],
187 # Plane 16 Private Use Area
188 [ 0x100000, 0x10FFFD, undef, undef ],
193 my $code = _getcode($arg);
194 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
195 unless defined $code;
196 my $hexk = sprintf("%06X", $code);
197 my($rcode,$rname,$rdec);
198 foreach my $range (@CharinfoRanges){
199 if ($range->[0] <= $code && $code <= $range->[1]) {
202 $rcode = sprintf("%04X", hex($rcode));
203 $rname = $range->[2] ? $range->[2]->($code) : '';
204 $rdec = $range->[3] ? $range->[3]->($code) : '';
205 $hexk = sprintf("%06X", $range->[0]); # replace by the first
209 openunicode(\$UNICODEFH, "UnicodeData.txt");
210 if (defined $UNICODEFH) {
211 use Search::Dict 1.02;
212 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
213 my $line = <$UNICODEFH>;
218 combining bidi decomposition
219 decimal digit numeric
220 mirrored unicode10 comment
222 )} = split(/;/, $line, -1);
224 $hexk = sprintf("%04X", hex($hexk));
225 if ($prop{code} eq $hexk) {
226 $prop{block} = charblock($code);
227 $prop{script} = charscript($code);
229 $prop{code} = $rcode;
230 $prop{name} = $rname;
231 $prop{decomposition} = $rdec;
240 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
241 my ($table, $lo, $hi, $code) = @_;
245 my $mid = int(($lo+$hi) / 2);
247 if ($table->[$mid]->[0] < $code) {
248 if ($table->[$mid]->[1] >= $code) {
249 return $table->[$mid]->[2];
251 _search($table, $mid + 1, $hi, $code);
253 } elsif ($table->[$mid]->[0] > $code) {
254 _search($table, $lo, $mid - 1, $code);
256 return $table->[$mid]->[2];
261 my ($range, $arg) = @_;
262 my $code = _getcode($arg);
263 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
264 unless defined $code;
265 _search($range, 0, $#$range, $code);
270 use Unicode::UCD 'charblock';
272 my $charblock = charblock(0x41);
273 my $charblock = charblock(1234);
274 my $charblock = charblock("0x263a");
275 my $charblock = charblock("U+263a");
277 my $range = charblock('Armenian');
279 With a B<code point argument> charblock() returns the I<block> the character
280 belongs to, e.g. C<Basic Latin>. Note that not all the character
281 positions within all blocks are defined.
283 See also L</Blocks versus Scripts>.
285 If supplied with an argument that can't be a code point, charblock() tries
286 to do the opposite and interpret the argument as a character block. The
287 return value is a I<range>: an anonymous list of lists that contain
288 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
289 code point is in a range using the L</charinrange> function. If the
290 argument is not a known charater block, C<undef> is returned.
299 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
301 while (<$BLOCKSFH>) {
302 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
303 my ($lo, $hi) = (hex($1), hex($2));
304 my $subrange = [ $lo, $hi, $3 ];
305 push @BLOCKS, $subrange;
306 push @{$BLOCKS{$3}}, $subrange;
317 _charblocks() unless @BLOCKS;
319 my $code = _getcode($arg);
322 _search(\@BLOCKS, 0, $#BLOCKS, $code);
324 if (exists $BLOCKS{$arg}) {
325 return dclone $BLOCKS{$arg};
334 use Unicode::UCD 'charscript';
336 my $charscript = charscript(0x41);
337 my $charscript = charscript(1234);
338 my $charscript = charscript("U+263a");
340 my $range = charscript('Thai');
342 With a B<code point argument> charscript() returns the I<script> the
343 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
345 See also L</Blocks versus Scripts>.
347 If supplied with an argument that can't be a code point, charscript() tries
348 to do the opposite and interpret the argument as a character script. The
349 return value is a I<range>: an anonymous list of lists that contain
350 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
351 code point is in a range using the L</charinrange> function. If the
352 argument is not a known charater script, C<undef> is returned.
361 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
363 while (<$SCRIPTSFH>) {
364 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
365 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
367 $script =~ s/\b(\w)/uc($1)/ge;
368 my $subrange = [ $lo, $hi, $script ];
369 push @SCRIPTS, $subrange;
370 push @{$SCRIPTS{$script}}, $subrange;
374 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
382 _charscripts() unless @SCRIPTS;
384 my $code = _getcode($arg);
387 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
389 if (exists $SCRIPTS{$arg}) {
390 return dclone $SCRIPTS{$arg};
399 use Unicode::UCD 'charblocks';
401 my $charblocks = charblocks();
403 charblocks() returns a reference to a hash with the known block names
404 as the keys, and the code point ranges (see L</charblock>) as the values.
406 See also L</Blocks versus Scripts>.
411 _charblocks() unless %BLOCKS;
412 return dclone \%BLOCKS;
417 use Unicode::UCD 'charscripts';
419 my %charscripts = charscripts();
421 charscripts() returns a hash with the known script names as the keys,
422 and the code point ranges (see L</charscript>) as the values.
424 See also L</Blocks versus Scripts>.
429 _charscripts() unless %SCRIPTS;
430 return dclone \%SCRIPTS;
433 =head2 Blocks versus Scripts
435 The difference between a block and a script is that scripts are closer
436 to the linguistic notion of a set of characters required to present
437 languages, while block is more of an artifact of the Unicode character
438 numbering and separation into blocks of (mostly) 256 characters.
440 For example the Latin B<script> is spread over several B<blocks>, such
441 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
442 C<Latin Extended-B>. On the other hand, the Latin script does not
443 contain all the characters of the C<Basic Latin> block (also known as
444 the ASCII): it includes only the letters, and not, for example, the digits
447 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
449 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
451 =head2 Matching Scripts and Blocks
453 Scripts are matched with the regular-expression construct
454 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
455 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
456 any of the 256 code points in the Tibetan block).
458 =head2 Code Point Arguments
460 A I<code point argument> is either a decimal or a hexadecimal scalar
461 designating a Unicode character, or C<U+> followed by hexadecimals
462 designating a Unicode character. In other words, if you want a code
463 point to be interpreted as a hexadecimal number, you must prefix it
464 with either C<0x> or C<U+>, because a string like e.g. C<123> will
465 be interpreted as a decimal code point. Also note that Unicode is
466 B<not> limited to 16 bits (the number of Unicode characters is
467 open-ended, in theory unlimited): you may have more than 4 hexdigits.
471 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
472 can also test whether a code point is in the I<range> as returned by
473 L</charblock> and L</charscript> or as the values of the hash returned
474 by L</charblocks> and L</charscripts> by using charinrange():
476 use Unicode::UCD qw(charscript charinrange);
478 $range = charscript('Hiragana');
479 print "looks like hiragana\n" if charinrange($range, $codepoint);
485 use Unicode::UCD 'compexcl';
487 my $compexcl = compexcl("09dc");
489 The compexcl() returns the composition exclusion (that is, if the
490 character should not be produced during a precomposition) of the
491 character specified by a B<code point argument>.
493 If there is a composition exclusion for the character, true is
494 returned. Otherwise, false is returned.
502 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
504 while (<$COMPEXCLFH>) {
505 if (/^([0-9A-F]+)\s+\#\s+/) {
507 $COMPEXCL{$code} = undef;
517 my $code = _getcode($arg);
518 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
519 unless defined $code;
521 _compexcl() unless %COMPEXCL;
523 return exists $COMPEXCL{$code};
528 use Unicode::UCD 'casefold';
530 my $casefold = casefold("00DF");
532 The casefold() returns the locale-independent case folding of the
533 character specified by a B<code point argument>.
535 If there is a case folding for that character, a reference to a hash
536 with the following fields is returned:
540 code code point with at least four hexdigits
541 status "C", "F", "S", or "I"
542 mapping one or more codes separated by spaces
544 The meaning of the I<status> is as follows:
546 C common case folding, common mappings shared
547 by both simple and full mappings
548 F full case folding, mappings that cause strings
549 to grow in length. Multiple characters are separated
551 S simple case folding, mappings to single characters
552 where different from F
553 I special case for dotted uppercase I and
555 - If this mapping is included, the result is
556 case-insensitive, but dotless and dotted I's
557 are not distinguished
558 - If this mapping is excluded, the result is not
559 fully case-insensitive, but dotless and dotted
560 I's are distinguished
562 If there is no case folding for that character, C<undef> is returned.
564 For more information about case mappings see
565 http://www.unicode.org/unicode/reports/tr21/
573 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
575 while (<$CASEFOLDFH>) {
576 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
578 $CASEFOLD{$code} = { code => $1,
590 my $code = _getcode($arg);
591 croak __PACKAGE__, "::casefold: unknown code '$arg'"
592 unless defined $code;
594 _casefold() unless %CASEFOLD;
596 return $CASEFOLD{$code};
601 use Unicode::UCD 'casespec';
603 my $casespec = casespec("FB00");
605 The casespec() returns the potentially locale-dependent case mapping
606 of the character specified by a B<code point argument>. The mapping
607 may change the length of the string (which the basic Unicode case
608 mappings as returned by charinfo() never do).
610 If there is a case folding for that character, a reference to a hash
611 with the following fields is returned:
615 code code point with at least four hexdigits
619 condition condition list (may be undef)
621 The C<condition> is optional. Where present, it consists of one or
622 more I<locales> or I<contexts>, separated by spaces (other than as
623 used to separate elements, spaces are to be ignored). A condition
624 list overrides the normal behavior if all of the listed conditions are
625 true. Case distinctions in the condition list are not significant.
626 Conditions preceded by "NON_" represent the negation of the condition.
628 Note that when there are multiple case folding definitions for a
629 single code point because of different locales, the value returned by
630 casespec() is a hash reference which has the locales as the keys and
631 hash references as described above as the values.
633 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
634 followed by a "_" and a 2-letter ISO language code (possibly followed
635 by a "_" and a variant code). You can find the lists of those codes,
636 see L<Locale::Country> and L<Locale::Language>.
638 A I<context> is one of the following choices:
640 FINAL The letter is not followed by a letter of
641 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
642 MODERN The mapping is only used for modern text
643 AFTER_i The last base character was "i" (U+0069)
645 For more information about case mappings see
646 http://www.unicode.org/unicode/reports/tr21/
654 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
656 while (<$CASESPECFH>) {
657 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
658 my ($hexcode, $lower, $title, $upper, $condition) =
659 ($1, $2, $3, $4, $5);
660 my $code = hex($hexcode);
661 if (exists $CASESPEC{$code}) {
662 if (exists $CASESPEC{$code}->{code}) {
667 @{$CASESPEC{$code}}{qw(lower
671 if (defined $oldcondition) {
673 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
674 delete $CASESPEC{$code};
675 $CASESPEC{$code}->{$oldlocale} =
680 condition => $oldcondition };
684 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
685 $CASESPEC{$code}->{$locale} =
690 condition => $condition };
697 condition => $condition };
708 my $code = _getcode($arg);
709 croak __PACKAGE__, "::casespec: unknown code '$arg'"
710 unless defined $code;
712 _casespec() unless %CASESPEC;
714 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
717 =head2 Unicode::UCD::UnicodeVersion
719 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
720 Character Database, in other words, the version of the Unicode
721 standard the database implements. The version is a string
722 of numbers delimited by dots (C<'.'>).
729 unless (defined $UNICODEVERSION) {
730 openunicode(\$VERSIONFH, "version");
731 chomp($UNICODEVERSION = <$VERSIONFH>);
733 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
734 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
736 return $UNICODEVERSION;
739 =head2 Implementation Note
741 The first use of charinfo() opens a read-only filehandle to the Unicode
742 Character Database (the database is included in the Perl distribution).
743 The filehandle is then kept open for further queries. In other words,
744 if you are wondering where one of your filehandles went, that's where.
748 Does not yet support EBCDIC platforms.