10 our @ISA = qw(Exporter);
12 our @EXPORT_OK = qw(charinfo
14 charblocks charscripts
23 Unicode::UCD - Unicode character database
27 use Unicode::UCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
30 use Unicode::UCD 'charblock';
31 my $charblock = charblock($codepoint);
33 use Unicode::UCD 'charscript';
34 my $charscript = charscript($codepoint);
36 use Unicode::UCD 'charblocks';
37 my $charblocks = charblocks();
39 use Unicode::UCD 'charscripts';
40 my %charscripts = charscripts();
42 use Unicode::UCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
46 use Unicode::UCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
49 my $unicode_version = Unicode::UCD::UnicodeVersion();
53 The Unicode::UCD module offers a simple interface to the Unicode
67 my ($rfh, @path) = @_;
69 unless (defined $$rfh) {
72 $f = File::Spec->catfile($d, "unicore", @path);
73 last if open($$rfh, $f);
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
85 use Unicode::UCD 'charinfo';
87 my $charinfo = charinfo(0x41);
89 charinfo() returns a reference to a hash that has the following fields
90 as defined by the Unicode standard:
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric this is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
113 If no match is found, a reference to an empty hash is returned.
115 The C<block> property is the same as returned by charblock(). It is
116 not defined in the Unicode Character Database proper (Chapter 4 of the
117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118 (Chapter 14 of TUS3). Similarly for the C<script> property.
120 Note that you cannot do (de)composition and casing based solely on the
121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
122 you will need also the compexcl(), casefold(), and casespec() functions.
129 if ($arg =~ /^\d+$/) {
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
138 # Lingua::KO::Hangul::Util not part of the standard distribution
139 # but it will be used if available.
141 eval { require Lingua::KO::Hangul::Util };
142 my $hasHangulUtil = ! $@;
143 if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
147 sub hangul_decomp { # internal: called from charinfo
148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
156 sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
160 sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
164 my @CharinfoRanges = (
166 # [ first, last, coderef to name, coderef to decompose ],
167 # CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
173 # Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175 # Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
178 [ 0xDC00, 0xDFFF, undef, undef ],
179 # The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181 # CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183 # Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185 # Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
194 my $hexk = sprintf("%06X", $code);
195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
197 if ($range->[0] <= $code && $code <= $range->[1]) {
200 $rcode = sprintf("%04X", hex($rcode));
201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
207 openunicode(\$UNICODEFH, "Unicode.txt");
208 if (defined $UNICODEFH) {
209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
211 my $line = <$UNICODEFH>;
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
220 )} = split(/;/, $line, -1);
222 $hexk = sprintf("%04X", hex($hexk));
223 if ($prop{code} eq $hexk) {
224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
238 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
243 my $mid = int(($lo+$hi) / 2);
245 if ($table->[$mid]->[0] < $code) {
246 if ($table->[$mid]->[1] >= $code) {
247 return $table->[$mid]->[2];
249 _search($table, $mid + 1, $hi, $code);
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
254 return $table->[$mid]->[2];
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
268 use Unicode::UCD 'charblock';
270 my $charblock = charblock(0x41);
271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
275 my $range = charblock('Armenian');
277 With a B<code point argument> charblock() returns the I<block> the character
278 belongs to, e.g. C<Basic Latin>. Note that not all the character
279 positions within all blocks are defined.
281 See also L</Blocks versus Scripts>.
283 If supplied with an argument that can't be a code point, charblock() tries
284 to do the opposite and interpret the argument as a character block. The
285 return value is a I<range>: an anonymous list of lists that contain
286 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287 code point is in a range using the L</charinrange> function. If the
288 argument is not a known character block, C<undef> is returned.
297 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
298 while (<$BLOCKSFH>) {
299 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
300 my ($lo, $hi) = (hex($1), hex($2));
301 my $subrange = [ $lo, $hi, $3 ];
302 push @BLOCKS, $subrange;
303 push @{$BLOCKS{$3}}, $subrange;
314 _charblocks() unless @BLOCKS;
316 my $code = _getcode($arg);
319 _search(\@BLOCKS, 0, $#BLOCKS, $code);
321 if (exists $BLOCKS{$arg}) {
322 return $BLOCKS{$arg};
331 use Unicode::UCD 'charscript';
333 my $charscript = charscript(0x41);
334 my $charscript = charscript(1234);
335 my $charscript = charscript("U+263a");
337 my $range = charscript('Thai');
339 With a B<code point argument> charscript() returns the I<script> the
340 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
342 See also L</Blocks versus Scripts>.
344 If supplied with an argument that can't be a code point, charscript() tries
345 to do the opposite and interpret the argument as a character script. The
346 return value is a I<range>: an anonymous list of lists that contain
347 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
348 code point is in a range using the L</charinrange> function. If the
349 argument is not a known character script, C<undef> is returned.
358 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
359 while (<$SCRIPTSFH>) {
360 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
361 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
363 $script =~ s/\b(\w)/uc($1)/ge;
364 my $subrange = [ $lo, $hi, $script ];
365 push @SCRIPTS, $subrange;
366 push @{$SCRIPTS{$script}}, $subrange;
370 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
378 _charscripts() unless @SCRIPTS;
380 my $code = _getcode($arg);
383 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
385 if (exists $SCRIPTS{$arg}) {
386 return $SCRIPTS{$arg};
395 use Unicode::UCD 'charblocks';
397 my $charblocks = charblocks();
399 charblocks() returns a reference to a hash with the known block names
400 as the keys, and the code point ranges (see L</charblock>) as the values.
402 See also L</Blocks versus Scripts>.
407 _charblocks() unless %BLOCKS;
413 use Unicode::UCD 'charscripts';
415 my %charscripts = charscripts();
417 charscripts() returns a hash with the known script names as the keys,
418 and the code point ranges (see L</charscript>) as the values.
420 See also L</Blocks versus Scripts>.
425 _charscripts() unless %SCRIPTS;
429 =head2 Blocks versus Scripts
431 The difference between a block and a script is that scripts are closer
432 to the linguistic notion of a set of characters required to present
433 languages, while block is more of an artifact of the Unicode character
434 numbering and separation into blocks of (mostly) 256 characters.
436 For example the Latin B<script> is spread over several B<blocks>, such
437 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
438 C<Latin Extended-B>. On the other hand, the Latin script does not
439 contain all the characters of the C<Basic Latin> block (also known as
440 the ASCII): it includes only the letters, and not, for example, the digits
443 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
445 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
447 =head2 Matching Scripts and Blocks
449 Scripts are matched with the regular-expression construct
450 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
451 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
452 any of the 256 code points in the Tibetan block).
454 =head2 Code Point Arguments
456 A I<code point argument> is either a decimal or a hexadecimal scalar
457 designating a Unicode character, or "U+" followed by hexadecimals
458 designating a Unicode character. Note that Unicode is B<not> limited
459 to 16 bits (the number of Unicode characters is open-ended, in theory
460 unlimited): you may have more than 4 hexdigits.
464 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
465 can also test whether a code point is in the I<range> as returned by
466 L</charblock> and L</charscript> or as the values of the hash returned
467 by L</charblocks> and L</charscripts> by using charinrange():
469 use Unicode::UCD qw(charscript charinrange);
471 $range = charscript('Hiragana');
472 print "looks like hiragana\n" if charinrange($range, $codepoint);
478 use Unicode::UCD 'compexcl';
480 my $compexcl = compexcl("09dc");
482 The compexcl() returns the composition exclusion (that is, if the
483 character should not be produced during a precomposition) of the
484 character specified by a B<code point argument>.
486 If there is a composition exclusion for the character, true is
487 returned. Otherwise, false is returned.
495 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
496 while (<$COMPEXCLFH>) {
497 if (/^([0-9A-F]+) \# /) {
499 $COMPEXCL{$code} = undef;
509 my $code = _getcode($arg);
510 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
511 unless defined $code;
513 _compexcl() unless %COMPEXCL;
515 return exists $COMPEXCL{$code};
520 use Unicode::UCD 'casefold';
522 my $casefold = casefold("09dc");
524 The casefold() returns the locale-independent case folding of the
525 character specified by a B<code point argument>.
527 If there is a case folding for that character, a reference to a hash
528 with the following fields is returned:
532 code code point with at least four hexdigits
533 status "C", "F", "S", or "I"
534 mapping one or more codes separated by spaces
536 The meaning of the I<status> is as follows:
538 C common case folding, common mappings shared
539 by both simple and full mappings
540 F full case folding, mappings that cause strings
541 to grow in length. Multiple characters are separated
543 S simple case folding, mappings to single characters
544 where different from F
545 I special case for dotted uppercase I and
547 - If this mapping is included, the result is
548 case-insensitive, but dotless and dotted I's
549 are not distinguished
550 - If this mapping is excluded, the result is not
551 fully case-insensitive, but dotless and dotted
552 I's are distinguished
554 If there is no case folding for that character, C<undef> is returned.
556 For more information about case mappings see
557 http://www.unicode.org/unicode/reports/tr21/
565 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
566 while (<$CASEFOLDFH>) {
567 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
569 $CASEFOLD{$code} = { code => $1,
581 my $code = _getcode($arg);
582 croak __PACKAGE__, "::casefold: unknown code '$arg'"
583 unless defined $code;
585 _casefold() unless %CASEFOLD;
587 return $CASEFOLD{$code};
592 use Unicode::UCD 'casespec';
594 my $casespec = casespec("09dc");
596 The casespec() returns the potentially locale-dependent case mapping
597 of the character specified by a B<code point argument>. The mapping
598 may change the length of the string (which the basic Unicode case
599 mappings as returned by charinfo() never do).
601 If there is a case mapping for that character, a reference to a hash
602 with the following fields is returned:
606 code code point with at least four hexdigits
610 condition condition list (may be undef)
612 The C<condition> is optional. Where present, it consists of one or
613 more I<locales> or I<contexts>, separated by spaces (other than as
614 used to separate elements, spaces are to be ignored). A condition
615 list overrides the normal behavior if all of the listed conditions are
616 true. Case distinctions in the condition list are not significant.
617 Conditions preceded by "NON_" represent the negation of the condition
619 Note that when there are multiple case mapping definitions for a
620 single code point because of different locales, the value returned by
621 casespec() is a hash reference which has the locales as the keys and
622 hash references as described above as the values.
624 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
625 followed by a "_" and a 2-letter ISO language code (possibly followed
626 by a "_" and a variant code). For the lists of those codes,
627 see L<Locale::Country> and L<Locale::Language>.
629 A I<context> is one of the following choices:
631 FINAL The letter is not followed by a letter of
632 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
633 MODERN The mapping is only used for modern text
634 AFTER_i The last base character was "i" (U+0069)
636 For more information about case mappings see
637 http://www.unicode.org/unicode/reports/tr21/
645 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
646 while (<$CASESPECFH>) {
647 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
648 my ($hexcode, $lower, $title, $upper, $condition) =
649 ($1, $2, $3, $4, $5);
650 my $code = hex($hexcode);
651 if (exists $CASESPEC{$code}) {
652 if (exists $CASESPEC{$code}->{code}) {
657 @{$CASESPEC{$code}}{qw(lower
662 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
663 if (defined $oldlocale) {
664 delete $CASESPEC{$code};
665 $CASESPEC{$code}->{$oldlocale} =
670 condition => $oldcondition };
672 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
676 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
677 $CASESPEC{$code}->{$locale} =
682 condition => $condition };
689 condition => $condition };
700 my $code = _getcode($arg);
701 croak __PACKAGE__, "::casespec: unknown code '$arg'"
702 unless defined $code;
704 _casespec() unless %CASESPEC;
706 return $CASESPEC{$code};
709 =head2 Unicode::UCD::UnicodeVersion
711 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
712 Character Database, in other words, the version of the Unicode
713 standard the database implements. The version is a string
714 of numbers delimited by dots (C<'.'>).
721 unless (defined $UNICODEVERSION) {
722 openunicode(\$VERSIONFH, "version");
723 chomp($UNICODEVERSION = <$VERSIONFH>);
725 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
726 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
728 return $UNICODEVERSION;
731 =head2 Implementation Note
733 The first use of charinfo() opens a read-only filehandle to the Unicode
734 Character Database (the database is included in the Perl distribution).
735 The filehandle is then kept open for further queries. In other words,
736 if you are wondering where one of your filehandles went, that's where.
740 Does not yet support EBCDIC platforms.