10 our @ISA = qw(Exporter);
12 our @EXPORT_OK = qw(charinfo
14 charblocks charscripts
23 UnicodeCD - Unicode character database
27 use UnicodeCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
30 use UnicodeCD 'charblock';
31 my $charblock = charblock($codepoint);
33 use UnicodeCD 'charscript';
34 my $charscript = charscript($codepoint);
36 use UnicodeCD 'charblocks';
37 my $charblocks = charblocks();
39 use UnicodeCD 'charscripts';
40 my %charscripts = charscripts();
42 use UnicodeCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
46 use UnicodeCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
49 my $unicode_version = UnicodeCD::UnicodeVersion();
53 The UnicodeCD module offers a simple interface to the Unicode Character
67 my ($rfh, @path) = @_;
69 unless (defined $$rfh) {
72 $f = File::Spec->catfile($d, "unicode", @path);
73 last if open($$rfh, $f);
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
85 use UnicodeCD 'charinfo';
87 my $charinfo = charinfo(0x41);
89 charinfo() returns a reference to a hash that has the following fields
90 as defined by the Unicode standard:
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric this is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
113 If no match is found, a reference to an empty hash is returned.
115 The C<block> property is the same as returned by charblock(). It is
116 not defined in the Unicode Character Database proper (Chapter 4 of the
117 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
118 of TUS3). Similarly for the C<script> property.
120 Note that you cannot do (de)composition and casing based solely on the
121 above C<decomposition>, C<lower>, C<upper>, and C<title> properties;
122 you will also need the compexcl(), casefold(), and casespec() functions.
129 if ($arg =~ /^\d+$/) {
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
140 my $code = _getcode($arg);
141 croak __PACKAGE__, "::han_charname: unknown code '$arg'"
142 unless defined $code;
143 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
144 unless 0x3400 <= $code && $code <= 0x4DB5
145 || 0x4E00 <= $code && $code <= 0x9FA5
146 || 0x20000 <= $code && $code <= 0x2A6D6;
147 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
150 my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
151 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
152 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
155 my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
156 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
157 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
158 "YU", "EU", "YI", "I",
161 my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
162 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
163 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
164 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
172 LCount => 19, # scalar @JamoL
173 VCount => 21, # scalar @JamoV
174 TCount => 28, # scalar @JamoT
175 NCount => 588, # VCount * TCount
176 SCount => 11172, # LCount * NCount
177 Final => 0xD7A3, # SBase -1 + SCount
180 sub hangul_charname {
182 my $code = _getcode($arg);
183 croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
184 unless defined $code;
185 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
186 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
187 my $SIndex = $code - $HangulConst{SBase};
188 my $LIndex = int( $SIndex / $HangulConst{NCount});
189 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
190 my $TIndex = $SIndex % $HangulConst{TCount};
201 my $code = _getcode($arg);
202 croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
203 unless defined $code;
204 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
205 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
206 my $SIndex = $code - $HangulConst{SBase};
207 my $LIndex = int( $SIndex / $HangulConst{NCount});
208 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
209 my $TIndex = $SIndex % $HangulConst{TCount};
212 sprintf("%04X", $HangulConst{LBase} + $LIndex),
213 sprintf("%04X", $HangulConst{VBase} + $VIndex),
215 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
219 my @CharinfoRanges = (
221 # [ first, last, coderef to name, coderef to decompose ],
222 # CJK Ideographs Extension A
223 [ 0x3400, 0x4DB5, \&han_charname, undef ],
225 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
227 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
228 # Non-Private Use High Surrogates
229 [ 0xD800, 0xDB7F, undef, undef ],
230 # Private Use High Surrogates
231 [ 0xDB80, 0xDBFF, undef, undef ],
233 [ 0xDC00, 0xDFFF, undef, undef ],
234 # The Private Use Area
235 [ 0xE000, 0xF8FF, undef, undef ],
236 # CJK Ideographs Extension B
237 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
238 # Plane 15 Private Use Area
239 [ 0xF0000, 0xFFFFD, undef, undef ],
240 # Plane 16 Private Use Area
241 [ 0x100000, 0x10FFFD, undef, undef ],
246 my $code = _getcode($arg);
247 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
248 unless defined $code;
249 my $hexk = sprintf("%06X", $code);
250 my($rcode,$rname,$rdec);
251 foreach my $range (@CharinfoRanges){
252 if ($range->[0] <= $code && $code <= $range->[1]) {
255 $rcode = sprintf("%04X", hex($rcode));
256 $rname = $range->[2] ? $range->[2]->($code) : '';
257 $rdec = $range->[3] ? $range->[3]->($code) : '';
258 $hexk = sprintf("%06X", $range->[0]); # replace by the first
262 openunicode(\$UNICODEFH, "Unicode.txt");
263 if (defined $UNICODEFH) {
264 use Search::Dict 1.02;
265 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
266 my $line = <$UNICODEFH>;
271 combining bidi decomposition
272 decimal digit numeric
273 mirrored unicode10 comment
275 )} = split(/;/, $line, -1);
277 $hexk = sprintf("%04X", hex($hexk));
278 if ($prop{code} eq $hexk) {
279 $prop{block} = charblock($code);
280 $prop{script} = charscript($code);
282 $prop{code} = $rcode;
283 $prop{name} = $rname;
284 $prop{decomposition} = $rdec;
293 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
294 my ($table, $lo, $hi, $code) = @_;
298 my $mid = int(($lo+$hi) / 2);
300 if ($table->[$mid]->[0] < $code) {
301 if ($table->[$mid]->[1] >= $code) {
302 return $table->[$mid]->[2];
304 _search($table, $mid + 1, $hi, $code);
306 } elsif ($table->[$mid]->[0] > $code) {
307 _search($table, $lo, $mid - 1, $code);
309 return $table->[$mid]->[2];
314 my ($range, $arg) = @_;
315 my $code = _getcode($arg);
316 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
317 unless defined $code;
318 _search($range, 0, $#$range, $code);
323 use UnicodeCD 'charblock';
325 my $charblock = charblock(0x41);
326 my $charblock = charblock(1234);
327 my $charblock = charblock("0x263a");
328 my $charblock = charblock("U+263a");
330 my $ranges = charblock('Armenian');
332 With a B<code point argument> charblock() returns the block the character
333 belongs to, e.g. C<Basic Latin>. Note that not all the character
334 positions within all blocks are defined.
336 If supplied with an argument that can't be a code point, charblock()
337 tries to do the opposite and interpret the argument as a character
338 block. The return value is a I<range>: an anonymous list that
339 contains anonymous lists, which in turn contain I<start-of-range>,
340 I<end-of-range> code point pairs. You can test whether a code point
341 is in a range using the L</charinrange> function. If the argument is
342 not a known character block, C<undef> is returned.
351 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
352 while (<$BLOCKSFH>) {
353 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
354 my ($lo, $hi) = (hex($1), hex($2));
355 my $subrange = [ $lo, $hi, $3 ];
356 push @BLOCKS, $subrange;
357 push @{$BLOCKS{$3}}, $subrange;
368 _charblocks() unless @BLOCKS;
370 my $code = _getcode($arg);
373 _search(\@BLOCKS, 0, $#BLOCKS, $code);
375 if (exists $BLOCKS{$arg}) {
376 return $BLOCKS{$arg};
385 use UnicodeCD 'charscript';
387 my $charscript = charscript(0x41);
388 my $charscript = charscript(1234);
389 my $charscript = charscript("U+263a");
391 my $ranges = charscript('Thai');
393 With a B<code point argument> charscript() returns the script the
394 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
396 If supplied with an argument that can't be a code point, charscript()
397 tries to do the opposite and interpret the argument as a character
398 script. The return value is a I<range>: an anonymous list that
399 contains anonymous lists, which in turn contain I<start-of-range>,
400 I<end-of-range> code point pairs. You can test whether a code point
401 is in a range using the L</charinrange> function. If the argument is
402 not a known character script, C<undef> is returned.
411 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
412 while (<$SCRIPTSFH>) {
413 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
414 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
416 $script =~ s/\b(\w)/uc($1)/ge;
417 my $subrange = [ $lo, $hi, $script ];
418 push @SCRIPTS, $subrange;
419 push @{$SCRIPTS{$script}}, $subrange;
423 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
431 _charscripts() unless @SCRIPTS;
433 my $code = _getcode($arg);
436 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
438 if (exists $SCRIPTS{$arg}) {
439 return $SCRIPTS{$arg};
448 use UnicodeCD 'charblocks';
450 my $charblocks = charblocks();
452 charblocks() returns a reference to a hash with the known block names
453 as the keys, and the code point ranges (see L</charblock>) as the values.
458 _charblocks() unless %BLOCKS;
464 use UnicodeCD 'charscripts';
466 my %charscripts = charscripts();
468 charscripts() returns a hash with the known script names as the keys,
469 and the code point ranges (see L</charscript>) as the values.
474 _charscripts() unless %SCRIPTS;
478 =head2 Blocks versus Scripts
480 The difference between a block and a script is that scripts are closer
481 to the linguistic notion of a set of characters required to present
482 languages, while block is more of an artifact of the Unicode character
483 numbering and separation into blocks of 256 characters.
485 For example the Latin B<script> is spread over several B<blocks>, such
486 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
487 C<Latin Extended-B>. On the other hand, the Latin script does not
488 contain all the characters of the C<Basic Latin> block (also known as
489 the ASCII): it includes only the letters, not for example the digits
492 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
494 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
496 =head2 Matching Scripts and Blocks
498 Both scripts and blocks can be matched using the regular expression
499 construct C<\p{In...}> and its negation C<\P{In...}>.
501 The name of the script or the block comes after the C<In>, for example
502 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
503 removed from the names for the C<\p{In...}>, for example
504 C<LatinExtendedA> instead of C<Latin Extended-A>.
506 There are a few cases where there exists both a script and a block by
507 the same name, in these cases the block version has C<Block> appended:
508 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
510 =head2 Code Point Arguments
512 A B<code point argument> is either a decimal or a hexadecimal scalar,
513 or "U+" followed by hexadecimals.
517 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
518 can also test whether a code point is in the I<range> as returned by
519 L</charblock> and L</charscript> or as the values of the hash returned
520 by L</charblocks> and L</charscripts> by using charinrange():
522 use UnicodeCD qw(charscript charinrange);
524 $range = charscript('Hiragana');
525 print "looks like hiragana\n" if charinrange($range, $codepoint);
531 use UnicodeCD 'compexcl';
533 my $compexcl = compexcl("09dc");
535 The compexcl() returns the composition exclusion (that is, if the
536 character should not be produced during a precomposition) of the
537 character specified by a B<code point argument>.
539 If there is a composition exclusion for the character, true is
540 returned. Otherwise, false is returned.
548 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
549 while (<$COMPEXCLFH>) {
550 if (/^([0-9A-F]+) \# /) {
552 $COMPEXCL{$code} = undef;
562 my $code = _getcode($arg);
563 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
564 unless defined $code;
566 _compexcl() unless %COMPEXCL;
568 return exists $COMPEXCL{$code};
573 use UnicodeCD 'casefold';
575 my $casefold = casefold("09dc");
577 The casefold() returns the locale-independent case folding of the
578 character specified by a B<code point argument>.
580 If there is a case folding for that character, a reference to a hash
581 with the following fields is returned:
585 code code point with at least four hexdigits
586 status "C", "F", "S", or "I"
587 mapping one or more codes separated by spaces
589 The meaning of the I<status> is as follows:
591 C common case folding, common mappings shared
592 by both simple and full mappings
593 F full case folding, mappings that cause strings
594 to grow in length. Multiple characters are separated
596 S simple case folding, mappings to single characters
597 where different from F
598 I special case for dotted uppercase I and
600 - If this mapping is included, the result is
601 case-insensitive, but dotless and dotted I's
602 are not distinguished
603 - If this mapping is excluded, the result is not
604 fully case-insensitive, but dotless and dotted
605 I's are distinguished
607 If there is no case folding for that character, C<undef> is returned.
609 For more information about case mappings see
610 http://www.unicode.org/unicode/reports/tr21/
618 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
619 while (<$CASEFOLDFH>) {
620 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
622 $CASEFOLD{$code} = { code => $1,
634 my $code = _getcode($arg);
635 croak __PACKAGE__, "::casefold: unknown code '$arg'"
636 unless defined $code;
638 _casefold() unless %CASEFOLD;
640 return $CASEFOLD{$code};
645 use UnicodeCD 'casespec';
647 my $casespec = casespec("09dc");
649 The casespec() returns the potentially locale-dependent case mapping
650 of the character specified by a B<code point argument>. The mapping
651 may change the length of the string (which the basic Unicode case
652 mappings as returned by charinfo() never do).
654 If there is a case mapping for that character, a reference to a hash
655 with the following fields is returned:
659 code code point with at least four hexdigits
663 condition condition list (may be undef)
665 The C<condition> is optional. Where present, it consists of one or
666 more I<locales> or I<contexts>, separated by spaces (other than as
667 used to separate elements, spaces are to be ignored). A condition
668 list overrides the normal behavior if all of the listed conditions are
669 true. Case distinctions in the condition list are not significant.
670 Conditions preceded by "NON_" represent the negation of the condition
672 A I<locale> is defined as a 2-letter ISO 639 language code, possibly
673 followed by a "_" and a 2-letter ISO 3166 country code (possibly followed
674 by a "_" and a variant code). For the lists of those codes,
675 see L<Locale::Language> and L<Locale::Country>.
677 A I<context> is one of the following choices:
679 FINAL The letter is not followed by a letter of
680 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
681 MODERN The mapping is only used for modern text
682 AFTER_i The last base character was "i" (U+0069)
684 For more information about case mappings see
685 http://www.unicode.org/unicode/reports/tr21/
693 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
694 while (<$CASESPECFH>) {
695 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
697 $CASESPEC{$code} = { code => $1,
711 my $code = _getcode($arg);
712 croak __PACKAGE__, "::casespec: unknown code '$arg'"
713 unless defined $code;
715 _casespec() unless %CASESPEC;
717 return $CASESPEC{$code};
720 =head2 UnicodeCD::UnicodeVersion
722 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
723 Database, in other words, the version of the Unicode standard the
731 unless (defined $UNICODEVERSION) {
732 openunicode(\$VERSIONFH, "version");
733 chomp($UNICODEVERSION = <$VERSIONFH>);
735 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
736 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
738 return $UNICODEVERSION;
741 =head2 Implementation Note
743 The first use of charinfo() opens a read-only filehandle to the Unicode
744 Character Database (the database is included in the Perl distribution).
745 The filehandle is then kept open for further queries.