8 use Storable qw(dclone);
12 our @ISA = qw(Exporter);
14 our @EXPORT_OK = qw(charinfo
16 charblocks charscripts
25 Unicode::UCD - Unicode character database
29 use Unicode::UCD 'charinfo';
30 my $charinfo = charinfo($codepoint);
32 use Unicode::UCD 'charblock';
33 my $charblock = charblock($codepoint);
35 use Unicode::UCD 'charscript';
36 my $charscript = charscript($codepoint);
38 use Unicode::UCD 'charblocks';
39 my $charblocks = charblocks();
41 use Unicode::UCD 'charscripts';
42 my %charscripts = charscripts();
44 use Unicode::UCD qw(charscript charinrange);
45 my $range = charscript($script);
46 print "looks like $script\n" if charinrange($range, $codepoint);
48 use Unicode::UCD 'compexcl';
49 my $compexcl = compexcl($codepoint);
51 my $unicode_version = Unicode::UCD::UnicodeVersion();
55 The Unicode::UCD module offers a simple interface to the Unicode
69 my ($rfh, @path) = @_;
71 unless (defined $$rfh) {
74 $f = File::Spec->catfile($d, "unicore", @path);
75 last if open($$rfh, $f);
78 croak __PACKAGE__, ": failed to find ",
79 File::Spec->catfile(@path), " in @INC"
87 use Unicode::UCD 'charinfo';
89 my $charinfo = charinfo(0x41);
91 charinfo() returns a reference to a hash that has the following fields
92 as defined by the Unicode standard:
96 code code point with at least four hexdigits
97 name name of the character IN UPPER CASE
98 category general category of the character
99 combining classes used in the Canonical Ordering Algorithm
100 bidi bidirectional category
101 decomposition character decomposition mapping
102 decimal if decimal digit this is the integer numeric value
103 digit if digit this is the numeric value
104 numeric if numeric is the integer or rational numeric value
105 mirrored if mirrored in bidirectional text
106 unicode10 Unicode 1.0 name if existed and different
107 comment ISO 10646 comment field
108 upper uppercase equivalent mapping
109 lower lowercase equivalent mapping
110 title titlecase equivalent mapping
112 block block the character belongs to (used in \p{In...})
113 script script the character belongs to
115 If no match is found, a reference to an empty hash is returned.
117 The C<block> property is the same as returned by charinfo(). It is
118 not defined in the Unicode Character Database proper (Chapter 4 of the
119 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
120 (Chapter 14 of TUS3). Similarly for the C<script> property.
122 Note that you cannot do (de)composition and casing based solely on the
123 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
124 you will need also the compexcl(), casefold(), and casespec() functions.
131 if ($arg =~ /^[1-9]\d*$/) {
133 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
140 # Lingua::KO::Hangul::Util not part of the standard distribution
141 # but it will be used if available.
143 eval { require Lingua::KO::Hangul::Util };
144 my $hasHangulUtil = ! $@;
145 if ($hasHangulUtil) {
146 Lingua::KO::Hangul::Util->import();
149 sub hangul_decomp { # internal: called from charinfo
150 if ($hasHangulUtil) {
151 my @tmp = decomposeHangul(shift);
152 return sprintf("%04X %04X", @tmp) if @tmp == 2;
153 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
158 sub hangul_charname { # internal: called from charinfo
159 return sprintf("HANGUL SYLLABLE-%04X", shift);
162 sub han_charname { # internal: called from charinfo
163 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
166 my @CharinfoRanges = (
168 # [ first, last, coderef to name, coderef to decompose ],
169 # CJK Ideographs Extension A
170 [ 0x3400, 0x4DB5, \&han_charname, undef ],
172 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
174 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
175 # Non-Private Use High Surrogates
176 [ 0xD800, 0xDB7F, undef, undef ],
177 # Private Use High Surrogates
178 [ 0xDB80, 0xDBFF, undef, undef ],
180 [ 0xDC00, 0xDFFF, undef, undef ],
181 # The Private Use Area
182 [ 0xE000, 0xF8FF, undef, undef ],
183 # CJK Ideographs Extension B
184 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
185 # Plane 15 Private Use Area
186 [ 0xF0000, 0xFFFFD, undef, undef ],
187 # Plane 16 Private Use Area
188 [ 0x100000, 0x10FFFD, undef, undef ],
193 my $code = _getcode($arg);
194 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
195 unless defined $code;
196 my $hexk = sprintf("%06X", $code);
197 my($rcode,$rname,$rdec);
198 foreach my $range (@CharinfoRanges){
199 if ($range->[0] <= $code && $code <= $range->[1]) {
202 $rcode = sprintf("%04X", hex($rcode));
203 $rname = $range->[2] ? $range->[2]->($code) : '';
204 $rdec = $range->[3] ? $range->[3]->($code) : '';
205 $hexk = sprintf("%06X", $range->[0]); # replace by the first
209 openunicode(\$UNICODEFH, "UnicodeData.txt");
210 if (defined $UNICODEFH) {
211 use Search::Dict 1.02;
212 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
213 my $line = <$UNICODEFH>;
214 return unless defined $line;
219 combining bidi decomposition
220 decimal digit numeric
221 mirrored unicode10 comment
223 )} = split(/;/, $line, -1);
225 $hexk = sprintf("%04X", hex($hexk));
226 if ($prop{code} eq $hexk) {
227 $prop{block} = charblock($code);
228 $prop{script} = charscript($code);
230 $prop{code} = $rcode;
231 $prop{name} = $rname;
232 $prop{decomposition} = $rdec;
241 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
242 my ($table, $lo, $hi, $code) = @_;
246 my $mid = int(($lo+$hi) / 2);
248 if ($table->[$mid]->[0] < $code) {
249 if ($table->[$mid]->[1] >= $code) {
250 return $table->[$mid]->[2];
252 _search($table, $mid + 1, $hi, $code);
254 } elsif ($table->[$mid]->[0] > $code) {
255 _search($table, $lo, $mid - 1, $code);
257 return $table->[$mid]->[2];
262 my ($range, $arg) = @_;
263 my $code = _getcode($arg);
264 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
265 unless defined $code;
266 _search($range, 0, $#$range, $code);
271 use Unicode::UCD 'charblock';
273 my $charblock = charblock(0x41);
274 my $charblock = charblock(1234);
275 my $charblock = charblock("0x263a");
276 my $charblock = charblock("U+263a");
278 my $range = charblock('Armenian');
280 With a B<code point argument> charblock() returns the I<block> the character
281 belongs to, e.g. C<Basic Latin>. Note that not all the character
282 positions within all blocks are defined.
284 See also L</Blocks versus Scripts>.
286 If supplied with an argument that can't be a code point, charblock() tries
287 to do the opposite and interpret the argument as a character block. The
288 return value is a I<range>: an anonymous list of lists that contain
289 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
290 code point is in a range using the L</charinrange> function. If the
291 argument is not a known charater block, C<undef> is returned.
300 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
302 while (<$BLOCKSFH>) {
303 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
304 my ($lo, $hi) = (hex($1), hex($2));
305 my $subrange = [ $lo, $hi, $3 ];
306 push @BLOCKS, $subrange;
307 push @{$BLOCKS{$3}}, $subrange;
318 _charblocks() unless @BLOCKS;
320 my $code = _getcode($arg);
323 _search(\@BLOCKS, 0, $#BLOCKS, $code);
325 if (exists $BLOCKS{$arg}) {
326 return dclone $BLOCKS{$arg};
335 use Unicode::UCD 'charscript';
337 my $charscript = charscript(0x41);
338 my $charscript = charscript(1234);
339 my $charscript = charscript("U+263a");
341 my $range = charscript('Thai');
343 With a B<code point argument> charscript() returns the I<script> the
344 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
346 See also L</Blocks versus Scripts>.
348 If supplied with an argument that can't be a code point, charscript() tries
349 to do the opposite and interpret the argument as a character script. The
350 return value is a I<range>: an anonymous list of lists that contain
351 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
352 code point is in a range using the L</charinrange> function. If the
353 argument is not a known charater script, C<undef> is returned.
362 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
364 while (<$SCRIPTSFH>) {
365 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
366 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
368 $script =~ s/\b(\w)/uc($1)/ge;
369 my $subrange = [ $lo, $hi, $script ];
370 push @SCRIPTS, $subrange;
371 push @{$SCRIPTS{$script}}, $subrange;
375 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
383 _charscripts() unless @SCRIPTS;
385 my $code = _getcode($arg);
388 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
390 if (exists $SCRIPTS{$arg}) {
391 return dclone $SCRIPTS{$arg};
400 use Unicode::UCD 'charblocks';
402 my $charblocks = charblocks();
404 charblocks() returns a reference to a hash with the known block names
405 as the keys, and the code point ranges (see L</charblock>) as the values.
407 See also L</Blocks versus Scripts>.
412 _charblocks() unless %BLOCKS;
413 return dclone \%BLOCKS;
418 use Unicode::UCD 'charscripts';
420 my %charscripts = charscripts();
422 charscripts() returns a hash with the known script names as the keys,
423 and the code point ranges (see L</charscript>) as the values.
425 See also L</Blocks versus Scripts>.
430 _charscripts() unless %SCRIPTS;
431 return dclone \%SCRIPTS;
434 =head2 Blocks versus Scripts
436 The difference between a block and a script is that scripts are closer
437 to the linguistic notion of a set of characters required to present
438 languages, while block is more of an artifact of the Unicode character
439 numbering and separation into blocks of (mostly) 256 characters.
441 For example the Latin B<script> is spread over several B<blocks>, such
442 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
443 C<Latin Extended-B>. On the other hand, the Latin script does not
444 contain all the characters of the C<Basic Latin> block (also known as
445 the ASCII): it includes only the letters, and not, for example, the digits
448 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
450 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
452 =head2 Matching Scripts and Blocks
454 Scripts are matched with the regular-expression construct
455 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
456 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
457 any of the 256 code points in the Tibetan block).
459 =head2 Code Point Arguments
461 A I<code point argument> is either a decimal or a hexadecimal scalar
462 designating a Unicode character, or C<U+> followed by hexadecimals
463 designating a Unicode character. In other words, if you want a code
464 point to be interpreted as a hexadecimal number, you must prefix it
465 with either C<0x> or C<U+>, because a string like e.g. C<123> will
466 be interpreted as a decimal code point. Also note that Unicode is
467 B<not> limited to 16 bits (the number of Unicode characters is
468 open-ended, in theory unlimited): you may have more than 4 hexdigits.
472 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
473 can also test whether a code point is in the I<range> as returned by
474 L</charblock> and L</charscript> or as the values of the hash returned
475 by L</charblocks> and L</charscripts> by using charinrange():
477 use Unicode::UCD qw(charscript charinrange);
479 $range = charscript('Hiragana');
480 print "looks like hiragana\n" if charinrange($range, $codepoint);
486 use Unicode::UCD 'compexcl';
488 my $compexcl = compexcl("09dc");
490 The compexcl() returns the composition exclusion (that is, if the
491 character should not be produced during a precomposition) of the
492 character specified by a B<code point argument>.
494 If there is a composition exclusion for the character, true is
495 returned. Otherwise, false is returned.
503 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
505 while (<$COMPEXCLFH>) {
506 if (/^([0-9A-F]+)\s+\#\s+/) {
508 $COMPEXCL{$code} = undef;
518 my $code = _getcode($arg);
519 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
520 unless defined $code;
522 _compexcl() unless %COMPEXCL;
524 return exists $COMPEXCL{$code};
529 use Unicode::UCD 'casefold';
531 my $casefold = casefold("00DF");
533 The casefold() returns the locale-independent case folding of the
534 character specified by a B<code point argument>.
536 If there is a case folding for that character, a reference to a hash
537 with the following fields is returned:
541 code code point with at least four hexdigits
542 status "C", "F", "S", or "I"
543 mapping one or more codes separated by spaces
545 The meaning of the I<status> is as follows:
547 C common case folding, common mappings shared
548 by both simple and full mappings
549 F full case folding, mappings that cause strings
550 to grow in length. Multiple characters are separated
552 S simple case folding, mappings to single characters
553 where different from F
554 I special case for dotted uppercase I and
556 - If this mapping is included, the result is
557 case-insensitive, but dotless and dotted I's
558 are not distinguished
559 - If this mapping is excluded, the result is not
560 fully case-insensitive, but dotless and dotted
561 I's are distinguished
563 If there is no case folding for that character, C<undef> is returned.
565 For more information about case mappings see
566 http://www.unicode.org/unicode/reports/tr21/
574 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
576 while (<$CASEFOLDFH>) {
577 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
579 $CASEFOLD{$code} = { code => $1,
591 my $code = _getcode($arg);
592 croak __PACKAGE__, "::casefold: unknown code '$arg'"
593 unless defined $code;
595 _casefold() unless %CASEFOLD;
597 return $CASEFOLD{$code};
602 use Unicode::UCD 'casespec';
604 my $casespec = casespec("FB00");
606 The casespec() returns the potentially locale-dependent case mapping
607 of the character specified by a B<code point argument>. The mapping
608 may change the length of the string (which the basic Unicode case
609 mappings as returned by charinfo() never do).
611 If there is a case folding for that character, a reference to a hash
612 with the following fields is returned:
616 code code point with at least four hexdigits
620 condition condition list (may be undef)
622 The C<condition> is optional. Where present, it consists of one or
623 more I<locales> or I<contexts>, separated by spaces (other than as
624 used to separate elements, spaces are to be ignored). A condition
625 list overrides the normal behavior if all of the listed conditions are
626 true. Case distinctions in the condition list are not significant.
627 Conditions preceded by "NON_" represent the negation of the condition.
629 Note that when there are multiple case folding definitions for a
630 single code point because of different locales, the value returned by
631 casespec() is a hash reference which has the locales as the keys and
632 hash references as described above as the values.
634 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
635 followed by a "_" and a 2-letter ISO language code (possibly followed
636 by a "_" and a variant code). You can find the lists of those codes,
637 see L<Locale::Country> and L<Locale::Language>.
639 A I<context> is one of the following choices:
641 FINAL The letter is not followed by a letter of
642 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
643 MODERN The mapping is only used for modern text
644 AFTER_i The last base character was "i" (U+0069)
646 For more information about case mappings see
647 http://www.unicode.org/unicode/reports/tr21/
655 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
657 while (<$CASESPECFH>) {
658 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
659 my ($hexcode, $lower, $title, $upper, $condition) =
660 ($1, $2, $3, $4, $5);
661 my $code = hex($hexcode);
662 if (exists $CASESPEC{$code}) {
663 if (exists $CASESPEC{$code}->{code}) {
668 @{$CASESPEC{$code}}{qw(lower
672 if (defined $oldcondition) {
674 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
675 delete $CASESPEC{$code};
676 $CASESPEC{$code}->{$oldlocale} =
681 condition => $oldcondition };
685 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
686 $CASESPEC{$code}->{$locale} =
691 condition => $condition };
698 condition => $condition };
709 my $code = _getcode($arg);
710 croak __PACKAGE__, "::casespec: unknown code '$arg'"
711 unless defined $code;
713 _casespec() unless %CASESPEC;
715 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
718 =head2 Unicode::UCD::UnicodeVersion
720 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
721 Character Database, in other words, the version of the Unicode
722 standard the database implements. The version is a string
723 of numbers delimited by dots (C<'.'>).
730 unless (defined $UNICODEVERSION) {
731 openunicode(\$VERSIONFH, "version");
732 chomp($UNICODEVERSION = <$VERSIONFH>);
734 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
735 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
737 return $UNICODEVERSION;
740 =head2 Implementation Note
742 The first use of charinfo() opens a read-only filehandle to the Unicode
743 Character Database (the database is included in the Perl distribution).
744 The filehandle is then kept open for further queries. In other words,
745 if you are wondering where one of your filehandles went, that's where.
749 Does not yet support EBCDIC platforms.