10 our @ISA = qw(Exporter);
12 our @EXPORT_OK = qw(charinfo
14 charblocks charscripts
23 Unicode::UCD - Unicode character database
27 use Unicode::UCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
30 use Unicode::UCD 'charblock';
31 my $charblock = charblock($codepoint);
33 use Unicode::UCD 'charscript';
34 my $charscript = charscript($codepoint);
36 use Unicode::UCD 'charblocks';
37 my $charblocks = charblocks();
39 use Unicode::UCD 'charscripts';
40 my %charscripts = charscripts();
42 use Unicode::UCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
46 use Unicode::UCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
49 my $unicode_version = Unicode::UCD::UnicodeVersion();
53 The Unicode::UCD module offers a simple interface to the Unicode Character
67 my ($rfh, @path) = @_;
69 unless (defined $$rfh) {
72 $f = File::Spec->catfile($d, "unicore", @path);
73 last if open($$rfh, $f);
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
85 use Unicode::UCD 'charinfo';
87 my $charinfo = charinfo(0x41);
89 charinfo() returns a reference to a hash that has the following fields
90 as defined by the Unicode standard:
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric this is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
113 If no match is found, a reference to an empty hash is returned.
115 The C<block> property is the same as returned by charblock(). It is
116 not defined in the Unicode Character Database proper (Chapter 4 of the
117 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118 (Chapter 14 of TUS3). Similarly for the C<script> property.
120 Note that you cannot do (de)composition and casing based solely on the
121 above C<decomposition> and C<lower>, C<upper>, C<title> properties;
122 you will also need the compexcl(), casefold(), and casespec() functions.
129 if ($arg =~ /^\d+$/) {
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
140 my $code = _getcode($arg);
141 croak __PACKAGE__, "::han_charname: unknown code '$arg'"
142 unless defined $code;
143 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
144 unless 0x3400 <= $code && $code <= 0x4DB5
145 || 0x4E00 <= $code && $code <= 0x9FA5
146 || 0x20000 <= $code && $code <= 0x2A6D6;
147 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
150 my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
151 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
152 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
155 my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
156 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
157 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
158 "YU", "EU", "YI", "I",
161 my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
162 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
163 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
164 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
172 LCount => 19, # scalar @JamoL
173 VCount => 21, # scalar @JamoV
174 TCount => 28, # scalar @JamoT
175 NCount => 588, # VCount * TCount
176 SCount => 11172, # LCount * NCount
177 Final => 0xD7A3, # SBase -1 + SCount
180 sub hangul_charname {
182 my $code = _getcode($arg);
183 croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
184 unless defined $code;
185 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
186 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
187 my $SIndex = $code - $HangulConst{SBase};
188 my $LIndex = int( $SIndex / $HangulConst{NCount});
189 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
190 my $TIndex = $SIndex % $HangulConst{TCount};
201 my $code = _getcode($arg);
202 croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
203 unless defined $code;
204 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
205 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
206 my $SIndex = $code - $HangulConst{SBase};
207 my $LIndex = int( $SIndex / $HangulConst{NCount});
208 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
209 my $TIndex = $SIndex % $HangulConst{TCount};
212 sprintf("%04X", $HangulConst{LBase} + $LIndex),
213 sprintf("%04X", $HangulConst{VBase} + $VIndex),
215 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
219 my @CharinfoRanges = (
221 # [ first, last, coderef to name, coderef to decompose ],
222 # CJK Ideographs Extension A
223 [ 0x3400, 0x4DB5, \&han_charname, undef ],
225 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
227 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
228 # Non-Private Use High Surrogates
229 [ 0xD800, 0xDB7F, undef, undef ],
230 # Private Use High Surrogates
231 [ 0xDB80, 0xDBFF, undef, undef ],
233 [ 0xDC00, 0xDFFF, undef, undef ],
234 # The Private Use Area
235 [ 0xE000, 0xF8FF, undef, undef ],
236 # CJK Ideographs Extension B
237 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
238 # Plane 15 Private Use Area
239 [ 0xF0000, 0xFFFFD, undef, undef ],
240 # Plane 16 Private Use Area
241 [ 0x100000, 0x10FFFD, undef, undef ],
246 my $code = _getcode($arg);
247 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
248 unless defined $code;
249 my $hexk = sprintf("%06X", $code);
250 my($rcode,$rname,$rdec);
251 foreach my $range (@CharinfoRanges){
252 if ($range->[0] <= $code && $code <= $range->[1]) {
255 $rcode = sprintf("%04X", hex($rcode));
256 $rname = $range->[2] ? $range->[2]->($code) : '';
257 $rdec = $range->[3] ? $range->[3]->($code) : '';
258 $hexk = sprintf("%06X", $range->[0]); # replace by the first
262 openunicode(\$UNICODEFH, "Unicode.txt");
263 if (defined $UNICODEFH) {
264 use Search::Dict 1.02;
265 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
266 my $line = <$UNICODEFH>;
271 combining bidi decomposition
272 decimal digit numeric
273 mirrored unicode10 comment
275 )} = split(/;/, $line, -1);
277 $hexk = sprintf("%04X", hex($hexk));
278 if ($prop{code} eq $hexk) {
279 $prop{block} = charblock($code);
280 $prop{script} = charscript($code);
282 $prop{code} = $rcode;
283 $prop{name} = $rname;
284 $prop{decomposition} = $rdec;
293 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
294 my ($table, $lo, $hi, $code) = @_;
298 my $mid = int(($lo+$hi) / 2);
300 if ($table->[$mid]->[0] < $code) {
301 if ($table->[$mid]->[1] >= $code) {
302 return $table->[$mid]->[2];
304 _search($table, $mid + 1, $hi, $code);
306 } elsif ($table->[$mid]->[0] > $code) {
307 _search($table, $lo, $mid - 1, $code);
309 return $table->[$mid]->[2];
314 my ($range, $arg) = @_;
315 my $code = _getcode($arg);
316 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
317 unless defined $code;
318 _search($range, 0, $#$range, $code);
323 use Unicode::UCD 'charblock';
325 my $charblock = charblock(0x41);
326 my $charblock = charblock(1234);
327 my $charblock = charblock("0x263a");
328 my $charblock = charblock("U+263a");
330 my $range = charblock('Armenian');
332 With a B<code point argument> charblock() returns the I<block> the character
333 belongs to, e.g. C<Basic Latin>. Note that not all the character
334 positions within all blocks are defined.
336 See also L</Blocks versus Scripts>.
338 If supplied with an argument that can't be a code point, charblock()
339 tries to do the opposite and interpret the argument as a character
340 block. The return value is a I<range>: an anonymous list that
341 contains anonymous lists, which in turn contain I<start-of-range>,
342 I<end-of-range> code point pairs. You can test whether a code point
343 is in a range using the L</charinrange> function. If the argument is
344 not a known character block, C<undef> is returned.
353 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
354 while (<$BLOCKSFH>) {
355 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
356 my ($lo, $hi) = (hex($1), hex($2));
357 my $subrange = [ $lo, $hi, $3 ];
358 push @BLOCKS, $subrange;
359 push @{$BLOCKS{$3}}, $subrange;
370 _charblocks() unless @BLOCKS;
372 my $code = _getcode($arg);
375 _search(\@BLOCKS, 0, $#BLOCKS, $code);
377 if (exists $BLOCKS{$arg}) {
378 return $BLOCKS{$arg};
387 use Unicode::UCD 'charscript';
389 my $charscript = charscript(0x41);
390 my $charscript = charscript(1234);
391 my $charscript = charscript("U+263a");
393 my $range = charscript('Thai');
395 With a B<code point argument> charscript() returns the I<script> the
396 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
398 See also L</Blocks versus Scripts>.
400 If supplied with an argument that can't be a code point, charscript()
401 tries to do the opposite and interpret the argument as a character
402 script. The return value is a I<range>: an anonymous list that
403 contains anonymous lists, which in turn contain I<start-of-range>,
404 I<end-of-range> code point pairs. You can test whether a code point
405 is in a range using the L</charinrange> function. If the argument is
406 not a known character script, C<undef> is returned.
415 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
416 while (<$SCRIPTSFH>) {
417 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
418 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
420 $script =~ s/\b(\w)/uc($1)/ge;
421 my $subrange = [ $lo, $hi, $script ];
422 push @SCRIPTS, $subrange;
423 push @{$SCRIPTS{$script}}, $subrange;
427 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
435 _charscripts() unless @SCRIPTS;
437 my $code = _getcode($arg);
440 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
442 if (exists $SCRIPTS{$arg}) {
443 return $SCRIPTS{$arg};
452 use Unicode::UCD 'charblocks';
454 my $charblocks = charblocks();
456 charblocks() returns a reference to a hash with the known block names
457 as the keys, and the code point ranges (see L</charblock>) as the values.
459 See also L</Blocks versus Scripts>.
464 _charblocks() unless %BLOCKS;
470 use Unicode::UCD 'charscripts';
472 my %charscripts = charscripts();
474 charscripts() returns a hash with the known script names as the keys,
475 and the code point ranges (see L</charscript>) as the values.
477 See also L</Blocks versus Scripts>.
482 _charscripts() unless %SCRIPTS;
486 =head2 Blocks versus Scripts
488 The difference between a block and a script is that scripts are closer
489 to the linguistic notion of a set of characters required to present
490 languages, while block is more of an artifact of the Unicode character
491 numbering and separation into blocks of (mostly) 256 characters.
493 For example the Latin B<script> is spread over several B<blocks>, such
494 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
495 C<Latin Extended-B>. On the other hand, the Latin script does not
496 contain all the characters of the C<Basic Latin> block (also known as
497 ASCII): it includes only the letters, not for example the digits
500 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
502 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
504 =head2 Matching Scripts and Blocks
506 Both scripts and blocks can be matched using the regular expression
507 construct C<\p{In...}> and its negation C<\P{In...}>.
509 The name of the script or the block comes after the C<In>, for example
510 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
511 removed from the names for the C<\p{In...}>, for example
512 C<LatinExtendedA> instead of C<Latin Extended-A>.
514 There are a few cases where there is both a script and a block by the
515 same name; in these cases the block version has C<Block> appended to
516 its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
519 =head2 Code Point Arguments
521 A I<code point argument> is either a decimal or a hexadecimal scalar
522 designating a Unicode character, or "U+" followed by hexadecimals
523 designating a Unicode character. Note that Unicode is B<not> limited
524 to 16 bits (the number of Unicode characters is open-ended, in theory
525 unlimited): you may have more than 4 hexdigits.
529 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
530 can also test whether a code point is in the I<range> as returned by
531 L</charblock> and L</charscript> or as the values of the hash returned
532 by L</charblocks> and L</charscripts> by using charinrange():
534 use Unicode::UCD qw(charscript charinrange);
536 $range = charscript('Hiragana');
537 print "looks like hiragana\n" if charinrange($range, $codepoint);
543 use Unicode::UCD 'compexcl';
545 my $compexcl = compexcl("09dc");
547 The compexcl() returns the composition exclusion (that is, if the
548 character should not be produced during a precomposition) of the
549 character specified by a B<code point argument>.
551 If there is a composition exclusion for the character, true is
552 returned. Otherwise, false is returned.
560 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
561 while (<$COMPEXCLFH>) {
562 if (/^([0-9A-F]+) \# /) {
564 $COMPEXCL{$code} = undef;
574 my $code = _getcode($arg);
575 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
576 unless defined $code;
578 _compexcl() unless %COMPEXCL;
580 return exists $COMPEXCL{$code};
585 use Unicode::UCD 'casefold';
587 my $casefold = casefold("09dc");
589 The casefold() returns the locale-independent case folding of the
590 character specified by a B<code point argument>.
592 If there is a case folding for that character, a reference to a hash
593 with the following fields is returned:
597 code code point with at least four hexdigits
598 status "C", "F", "S", or "I"
599 mapping one or more codes separated by spaces
601 The meaning of the I<status> is as follows:
603 C common case folding, common mappings shared
604 by both simple and full mappings
605 F full case folding, mappings that cause strings
606 to grow in length. Multiple characters are separated
608 S simple case folding, mappings to single characters
609 where different from F
610 I special case for dotted uppercase I and
612 - If this mapping is included, the result is
613 case-insensitive, but dotless and dotted I's
614 are not distinguished
615 - If this mapping is excluded, the result is not
616 fully case-insensitive, but dotless and dotted
617 I's are distinguished
619 If there is no case folding for that character, C<undef> is returned.
621 For more information about case mappings see
622 http://www.unicode.org/unicode/reports/tr21/
630 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
631 while (<$CASEFOLDFH>) {
632 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
634 $CASEFOLD{$code} = { code => $1,
646 my $code = _getcode($arg);
647 croak __PACKAGE__, "::casefold: unknown code '$arg'"
648 unless defined $code;
650 _casefold() unless %CASEFOLD;
652 return $CASEFOLD{$code};
657 use Unicode::UCD 'casespec';
659 my $casespec = casespec("09dc");
661 The casespec() returns the potentially locale-dependent case mapping
662 of the character specified by a B<code point argument>. The mapping
663 may change the length of the string (which the basic Unicode case
664 mappings as returned by charinfo() never do).
666 If there is a special case mapping for that character, a reference to a hash
667 with the following fields is returned:
671 code code point with at least four hexdigits
675 condition condition list (may be undef)
677 The C<condition> is optional. Where present, it consists of one or
678 more I<locales> or I<contexts>, separated by spaces (other than as
679 used to separate elements, spaces are to be ignored). A condition
680 list overrides the normal behavior if all of the listed conditions are
681 true. Case distinctions in the condition list are not significant.
682 Conditions preceded by "NON_" represent the negation of the condition
684 A I<locale> is defined as a 2-letter ISO 639 language code, possibly
685 followed by a "_" and a 2-letter ISO 3166 country code (possibly followed
686 by a "_" and a variant code). For the lists of those codes,
687 see L<Locale::Language> and L<Locale::Country>.
689 A I<context> is one of the following choices:
691 FINAL The letter is not followed by a letter of
692 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
693 MODERN The mapping is only used for modern text
694 AFTER_i The last base character was "i" (U+0069)
696 For more information about case mappings see
697 http://www.unicode.org/unicode/reports/tr21/
705 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
706 while (<$CASESPECFH>) {
707 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
709 $CASESPEC{$code} = { code => $1,
723 my $code = _getcode($arg);
724 croak __PACKAGE__, "::casespec: unknown code '$arg'"
725 unless defined $code;
727 _casespec() unless %CASESPEC;
729 return $CASESPEC{$code};
732 =head2 Unicode::UCD::UnicodeVersion
734 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
735 Character Database, in other words, the version of the Unicode
736 standard the database implements. The version is a string
737 of numbers delimited by dots (C<'.'>).
744 unless (defined $UNICODEVERSION) {
745 openunicode(\$VERSIONFH, "version");
746 chomp($UNICODEVERSION = <$VERSIONFH>);
748 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
749 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
751 return $UNICODEVERSION;
754 =head2 Implementation Note
756 The first use of charinfo() opens a read-only filehandle to the Unicode
757 Character Database (the database is included in the Perl distribution).
758 The filehandle is then kept open for further queries. In other words,
759 if you are wondering where one of your filehandles went, that's where.