8 use Storable qw(dclone);
12 our @ISA = qw(Exporter);
14 our @EXPORT_OK = qw(charinfo
16 charblocks charscripts
26 Unicode::UCD - Unicode character database
30 use Unicode::UCD 'charinfo';
31 my $charinfo = charinfo($codepoint);
33 use Unicode::UCD 'charblock';
34 my $charblock = charblock($codepoint);
36 use Unicode::UCD 'charscript';
37 my $charscript = charscript($codepoint);
39 use Unicode::UCD 'charblocks';
40 my $charblocks = charblocks();
42 use Unicode::UCD 'charscripts';
43 my %charscripts = charscripts();
45 use Unicode::UCD qw(charscript charinrange);
46 my $range = charscript($script);
47 print "looks like $script\n" if charinrange($range, $codepoint);
49 use Unicode::UCD 'compexcl';
50 my $compexcl = compexcl($codepoint);
52 use Unicode::UCD 'namedseq';
53 my $namedseq = namedseq($named_sequence_name);
55 my $unicode_version = Unicode::UCD::UnicodeVersion();
59 The Unicode::UCD module offers a simple interface to the Unicode
74 my ($rfh, @path) = @_;
76 unless (defined $$rfh) {
79 $f = File::Spec->catfile($d, "unicore", @path);
80 last if open($$rfh, $f);
83 croak __PACKAGE__, ": failed to find ",
84 File::Spec->catfile(@path), " in @INC"
92 use Unicode::UCD 'charinfo';
94 my $charinfo = charinfo(0x41);
96 charinfo() returns a reference to a hash that has the following fields
97 as defined by the Unicode standard:
101 code code point with at least four hexdigits
102 name name of the character IN UPPER CASE
103 category general category of the character
104 combining classes used in the Canonical Ordering Algorithm
105 bidi bidirectional category
106 decomposition character decomposition mapping
107 decimal if decimal digit this is the integer numeric value
108 digit if digit this is the numeric value
109 numeric if numeric is the integer or rational numeric value
110 mirrored if mirrored in bidirectional text
111 unicode10 Unicode 1.0 name if existed and different
112 comment ISO 10646 comment field
113 upper uppercase equivalent mapping
114 lower lowercase equivalent mapping
115 title titlecase equivalent mapping
117 block block the character belongs to (used in \p{In...})
118 script script the character belongs to
120 If no match is found, a reference to an empty hash is returned.
122 The C<block> property is the same as returned by charinfo(). It is
123 not defined in the Unicode Character Database proper (Chapter 4 of the
124 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
125 (Chapter 14 of TUS3). Similarly for the C<script> property.
127 Note that you cannot do (de)composition and casing based solely on the
128 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
129 you will need also the compexcl(), casefold(), and casespec() functions.
133 # NB: This function is duplicated in charnames.pm
137 if ($arg =~ /^[1-9]\d*$/) {
139 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
146 # Lingua::KO::Hangul::Util not part of the standard distribution
147 # but it will be used if available.
149 eval { require Lingua::KO::Hangul::Util };
150 my $hasHangulUtil = ! $@;
151 if ($hasHangulUtil) {
152 Lingua::KO::Hangul::Util->import();
155 sub hangul_decomp { # internal: called from charinfo
156 if ($hasHangulUtil) {
157 my @tmp = decomposeHangul(shift);
158 return sprintf("%04X %04X", @tmp) if @tmp == 2;
159 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
164 sub hangul_charname { # internal: called from charinfo
165 return sprintf("HANGUL SYLLABLE-%04X", shift);
168 sub han_charname { # internal: called from charinfo
169 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
172 my @CharinfoRanges = (
174 # [ first, last, coderef to name, coderef to decompose ],
175 # CJK Ideographs Extension A
176 [ 0x3400, 0x4DB5, \&han_charname, undef ],
178 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
180 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
181 # Non-Private Use High Surrogates
182 [ 0xD800, 0xDB7F, undef, undef ],
183 # Private Use High Surrogates
184 [ 0xDB80, 0xDBFF, undef, undef ],
186 [ 0xDC00, 0xDFFF, undef, undef ],
187 # The Private Use Area
188 [ 0xE000, 0xF8FF, undef, undef ],
189 # CJK Ideographs Extension B
190 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
191 # Plane 15 Private Use Area
192 [ 0xF0000, 0xFFFFD, undef, undef ],
193 # Plane 16 Private Use Area
194 [ 0x100000, 0x10FFFD, undef, undef ],
199 my $code = _getcode($arg);
200 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
201 unless defined $code;
202 my $hexk = sprintf("%06X", $code);
203 my($rcode,$rname,$rdec);
204 foreach my $range (@CharinfoRanges){
205 if ($range->[0] <= $code && $code <= $range->[1]) {
208 $rcode = sprintf("%04X", hex($rcode));
209 $rname = $range->[2] ? $range->[2]->($code) : '';
210 $rdec = $range->[3] ? $range->[3]->($code) : '';
211 $hexk = sprintf("%06X", $range->[0]); # replace by the first
215 openunicode(\$UNICODEFH, "UnicodeData.txt");
216 if (defined $UNICODEFH) {
217 use Search::Dict 1.02;
218 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
219 my $line = <$UNICODEFH>;
220 return unless defined $line;
225 combining bidi decomposition
226 decimal digit numeric
227 mirrored unicode10 comment
229 )} = split(/;/, $line, -1);
231 $hexk = sprintf("%04X", hex($hexk));
232 if ($prop{code} eq $hexk) {
233 $prop{block} = charblock($code);
234 $prop{script} = charscript($code);
236 $prop{code} = $rcode;
237 $prop{name} = $rname;
238 $prop{decomposition} = $rdec;
247 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
248 my ($table, $lo, $hi, $code) = @_;
252 my $mid = int(($lo+$hi) / 2);
254 if ($table->[$mid]->[0] < $code) {
255 if ($table->[$mid]->[1] >= $code) {
256 return $table->[$mid]->[2];
258 _search($table, $mid + 1, $hi, $code);
260 } elsif ($table->[$mid]->[0] > $code) {
261 _search($table, $lo, $mid - 1, $code);
263 return $table->[$mid]->[2];
268 my ($range, $arg) = @_;
269 my $code = _getcode($arg);
270 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
271 unless defined $code;
272 _search($range, 0, $#$range, $code);
277 use Unicode::UCD 'charblock';
279 my $charblock = charblock(0x41);
280 my $charblock = charblock(1234);
281 my $charblock = charblock("0x263a");
282 my $charblock = charblock("U+263a");
284 my $range = charblock('Armenian');
286 With a B<code point argument> charblock() returns the I<block> the character
287 belongs to, e.g. C<Basic Latin>. Note that not all the character
288 positions within all blocks are defined.
290 See also L</Blocks versus Scripts>.
292 If supplied with an argument that can't be a code point, charblock() tries
293 to do the opposite and interpret the argument as a character block. The
294 return value is a I<range>: an anonymous list of lists that contain
295 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
296 a code point is in a range using the L</charinrange> function. If the
297 argument is not a known character block, C<undef> is returned.
306 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
308 while (<$BLOCKSFH>) {
309 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
310 my ($lo, $hi) = (hex($1), hex($2));
311 my $subrange = [ $lo, $hi, $3 ];
312 push @BLOCKS, $subrange;
313 push @{$BLOCKS{$3}}, $subrange;
324 _charblocks() unless @BLOCKS;
326 my $code = _getcode($arg);
329 _search(\@BLOCKS, 0, $#BLOCKS, $code);
331 if (exists $BLOCKS{$arg}) {
332 return dclone $BLOCKS{$arg};
341 use Unicode::UCD 'charscript';
343 my $charscript = charscript(0x41);
344 my $charscript = charscript(1234);
345 my $charscript = charscript("U+263a");
347 my $range = charscript('Thai');
349 With a B<code point argument> charscript() returns the I<script> the
350 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
352 See also L</Blocks versus Scripts>.
354 If supplied with an argument that can't be a code point, charscript() tries
355 to do the opposite and interpret the argument as a character script. The
356 return value is a I<range>: an anonymous list of lists that contain
357 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
358 code point is in a range using the L</charinrange> function. If the
359 argument is not a known character script, C<undef> is returned.
368 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
370 while (<$SCRIPTSFH>) {
371 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
372 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
374 $script =~ s/\b(\w)/uc($1)/ge;
375 my $subrange = [ $lo, $hi, $script ];
376 push @SCRIPTS, $subrange;
377 push @{$SCRIPTS{$script}}, $subrange;
381 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
389 _charscripts() unless @SCRIPTS;
391 my $code = _getcode($arg);
394 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
396 if (exists $SCRIPTS{$arg}) {
397 return dclone $SCRIPTS{$arg};
406 use Unicode::UCD 'charblocks';
408 my $charblocks = charblocks();
410 charblocks() returns a reference to a hash with the known block names
411 as the keys, and the code point ranges (see L</charblock>) as the values.
413 See also L</Blocks versus Scripts>.
418 _charblocks() unless %BLOCKS;
419 return dclone \%BLOCKS;
424 use Unicode::UCD 'charscripts';
426 my %charscripts = charscripts();
428 charscripts() returns a hash with the known script names as the keys,
429 and the code point ranges (see L</charscript>) as the values.
431 See also L</Blocks versus Scripts>.
436 _charscripts() unless %SCRIPTS;
437 return dclone \%SCRIPTS;
440 =head2 Blocks versus Scripts
442 The difference between a block and a script is that scripts are closer
443 to the linguistic notion of a set of characters required to present
444 languages, while block is more of an artifact of the Unicode character
445 numbering and separation into blocks of (mostly) 256 characters.
447 For example the Latin B<script> is spread over several B<blocks>, such
448 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
449 C<Latin Extended-B>. On the other hand, the Latin script does not
450 contain all the characters of the C<Basic Latin> block (also known as
451 the ASCII): it includes only the letters, and not, for example, the digits
454 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
456 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
458 =head2 Matching Scripts and Blocks
460 Scripts are matched with the regular-expression construct
461 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
462 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
463 any of the 256 code points in the Tibetan block).
465 =head2 Code Point Arguments
467 A I<code point argument> is either a decimal or a hexadecimal scalar
468 designating a Unicode character, or C<U+> followed by hexadecimals
469 designating a Unicode character. In other words, if you want a code
470 point to be interpreted as a hexadecimal number, you must prefix it
471 with either C<0x> or C<U+>, because a string like e.g. C<123> will
472 be interpreted as a decimal code point. Also note that Unicode is
473 B<not> limited to 16 bits (the number of Unicode characters is
474 open-ended, in theory unlimited): you may have more than 4 hexdigits.
478 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
479 can also test whether a code point is in the I<range> as returned by
480 L</charblock> and L</charscript> or as the values of the hash returned
481 by L</charblocks> and L</charscripts> by using charinrange():
483 use Unicode::UCD qw(charscript charinrange);
485 $range = charscript('Hiragana');
486 print "looks like hiragana\n" if charinrange($range, $codepoint);
492 use Unicode::UCD 'compexcl';
494 my $compexcl = compexcl("09dc");
496 The compexcl() returns the composition exclusion (that is, if the
497 character should not be produced during a precomposition) of the
498 character specified by a B<code point argument>.
500 If there is a composition exclusion for the character, true is
501 returned. Otherwise, false is returned.
509 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
511 while (<$COMPEXCLFH>) {
512 if (/^([0-9A-F]+)\s+\#\s+/) {
514 $COMPEXCL{$code} = undef;
524 my $code = _getcode($arg);
525 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
526 unless defined $code;
528 _compexcl() unless %COMPEXCL;
530 return exists $COMPEXCL{$code};
535 use Unicode::UCD 'casefold';
537 my $casefold = casefold("00DF");
539 The casefold() returns the locale-independent case folding of the
540 character specified by a B<code point argument>.
542 If there is a case folding for that character, a reference to a hash
543 with the following fields is returned:
547 code code point with at least four hexdigits
548 status "C", "F", "S", or "I"
549 mapping one or more codes separated by spaces
551 The meaning of the I<status> is as follows:
553 C common case folding, common mappings shared
554 by both simple and full mappings
555 F full case folding, mappings that cause strings
556 to grow in length. Multiple characters are separated
558 S simple case folding, mappings to single characters
559 where different from F
560 I special case for dotted uppercase I and
562 - If this mapping is included, the result is
563 case-insensitive, but dotless and dotted I's
564 are not distinguished
565 - If this mapping is excluded, the result is not
566 fully case-insensitive, but dotless and dotted
567 I's are distinguished
569 If there is no case folding for that character, C<undef> is returned.
571 For more information about case mappings see
572 http://www.unicode.org/unicode/reports/tr21/
580 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
582 while (<$CASEFOLDFH>) {
583 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
585 $CASEFOLD{$code} = { code => $1,
597 my $code = _getcode($arg);
598 croak __PACKAGE__, "::casefold: unknown code '$arg'"
599 unless defined $code;
601 _casefold() unless %CASEFOLD;
603 return $CASEFOLD{$code};
608 use Unicode::UCD 'casespec';
610 my $casespec = casespec("FB00");
612 The casespec() returns the potentially locale-dependent case mapping
613 of the character specified by a B<code point argument>. The mapping
614 may change the length of the string (which the basic Unicode case
615 mappings as returned by charinfo() never do).
617 If there is a case folding for that character, a reference to a hash
618 with the following fields is returned:
622 code code point with at least four hexdigits
626 condition condition list (may be undef)
628 The C<condition> is optional. Where present, it consists of one or
629 more I<locales> or I<contexts>, separated by spaces (other than as
630 used to separate elements, spaces are to be ignored). A condition
631 list overrides the normal behavior if all of the listed conditions are
632 true. Case distinctions in the condition list are not significant.
633 Conditions preceded by "NON_" represent the negation of the condition.
635 Note that when there are multiple case folding definitions for a
636 single code point because of different locales, the value returned by
637 casespec() is a hash reference which has the locales as the keys and
638 hash references as described above as the values.
640 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
641 followed by a "_" and a 2-letter ISO language code (possibly followed
642 by a "_" and a variant code). You can find the lists of those codes,
643 see L<Locale::Country> and L<Locale::Language>.
645 A I<context> is one of the following choices:
647 FINAL The letter is not followed by a letter of
648 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
649 MODERN The mapping is only used for modern text
650 AFTER_i The last base character was "i" (U+0069)
652 For more information about case mappings see
653 http://www.unicode.org/unicode/reports/tr21/
661 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
663 while (<$CASESPECFH>) {
664 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
665 my ($hexcode, $lower, $title, $upper, $condition) =
666 ($1, $2, $3, $4, $5);
667 my $code = hex($hexcode);
668 if (exists $CASESPEC{$code}) {
669 if (exists $CASESPEC{$code}->{code}) {
674 @{$CASESPEC{$code}}{qw(lower
678 if (defined $oldcondition) {
680 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
681 delete $CASESPEC{$code};
682 $CASESPEC{$code}->{$oldlocale} =
687 condition => $oldcondition };
691 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
692 $CASESPEC{$code}->{$locale} =
697 condition => $condition };
704 condition => $condition };
715 my $code = _getcode($arg);
716 croak __PACKAGE__, "::casespec: unknown code '$arg'"
717 unless defined $code;
719 _casespec() unless %CASESPEC;
721 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
726 use Unicode::UCD 'namedseq';
728 my $namedseq = namedseq("KATAKANA LETTER AINU P");
729 my @namedseq = namedseq("KATAKANA LETTER AINU P");
730 my %namedseq = namedseq();
732 If used with a single argument in a scalar context, returns the string
733 consisting of the code points of the named sequence, or C<undef> if no
734 named sequence by that name exists. If used with a single argument in
735 a list context, returns list of the code points. If used with no
736 arguments in a list context, returns a hash with the names of the
737 named sequences as the keys and the named sequences as strings as
738 the values. Otherwise, returns C<undef> or empty list depending
741 (New from Unicode 4.1.0)
749 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
751 while (<$NAMEDSEQFH>) {
752 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
753 my ($n, $s) = ($1, $2);
754 my @s = map { chr(hex($_)) } split(' ', $s);
755 $NAMEDSEQ{$n} = join("", @s);
764 _namedseq() unless %NAMEDSEQ;
765 my $wantarray = wantarray();
766 if (defined $wantarray) {
771 my $s = $NAMEDSEQ{ $_[0] };
772 return defined $s ? map { ord($_) } split('', $s) : ();
775 return $NAMEDSEQ{ $_[0] };
781 =head2 Unicode::UCD::UnicodeVersion
783 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
784 Character Database, in other words, the version of the Unicode
785 standard the database implements. The version is a string
786 of numbers delimited by dots (C<'.'>).
793 unless (defined $UNICODEVERSION) {
794 openunicode(\$VERSIONFH, "version");
795 chomp($UNICODEVERSION = <$VERSIONFH>);
797 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
798 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
800 return $UNICODEVERSION;
803 =head2 Implementation Note
805 The first use of charinfo() opens a read-only filehandle to the Unicode
806 Character Database (the database is included in the Perl distribution).
807 The filehandle is then kept open for further queries. In other words,
808 if you are wondering where one of your filehandles went, that's where.
812 Does not yet support EBCDIC platforms.