8 use Storable qw(dclone);
12 our @ISA = qw(Exporter);
14 our @EXPORT_OK = qw(charinfo
16 charblocks charscripts
18 general_categories bidi_types
27 Unicode::UCD - Unicode character database
31 use Unicode::UCD 'charinfo';
32 my $charinfo = charinfo($codepoint);
34 use Unicode::UCD 'charblock';
35 my $charblock = charblock($codepoint);
37 use Unicode::UCD 'charscript';
38 my $charscript = charscript($codepoint);
40 use Unicode::UCD 'charblocks';
41 my $charblocks = charblocks();
43 use Unicode::UCD 'charscripts';
44 my $charscripts = charscripts();
46 use Unicode::UCD qw(charscript charinrange);
47 my $range = charscript($script);
48 print "looks like $script\n" if charinrange($range, $codepoint);
50 use Unicode::UCD qw(general_categories bidi_types);
51 my $categories = general_categories();
52 my $types = bidi_types();
54 use Unicode::UCD 'compexcl';
55 my $compexcl = compexcl($codepoint);
57 use Unicode::UCD 'namedseq';
58 my $namedseq = namedseq($named_sequence_name);
60 my $unicode_version = Unicode::UCD::UnicodeVersion();
64 The Unicode::UCD module offers a simple interface to the Unicode
79 my ($rfh, @path) = @_;
81 unless (defined $$rfh) {
84 $f = File::Spec->catfile($d, "unicore", @path);
85 last if open($$rfh, $f);
88 croak __PACKAGE__, ": failed to find ",
89 File::Spec->catfile(@path), " in @INC"
97 use Unicode::UCD 'charinfo';
99 my $charinfo = charinfo(0x41);
101 charinfo() returns a reference to a hash that has the following fields
102 as defined by the Unicode standard:
106 code code point with at least four hexdigits
107 name name of the character IN UPPER CASE
108 category general category of the character
109 combining classes used in the Canonical Ordering Algorithm
110 bidi bidirectional type
111 decomposition character decomposition mapping
112 decimal if decimal digit this is the integer numeric value
113 digit if digit this is the numeric value
114 numeric if numeric this is the integer or rational numeric value
115 mirrored if mirrored in bidirectional text
116 unicode10 Unicode 1.0 name if existed and different
117 comment ISO 10646 comment field
118 upper uppercase equivalent mapping
119 lower lowercase equivalent mapping
120 title titlecase equivalent mapping
122 block block the character belongs to (used in \p{In...})
123 script script the character belongs to
125 If no match is found, a reference to an empty hash is returned.
127 The C<block> property is the same as returned by charblock(). It is
128 not defined in the Unicode Character Database proper (Chapter 4 of the
129 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
130 (Chapter 14 of TUS3). Similarly for the C<script> property.
132 Note that you cannot do (de)composition and casing based solely on the
133 above C<decomposition>, C<lower>, C<upper>, and C<title> properties;
134 you will also need the compexcl(), casefold(), and casespec() functions.
138 # NB: This function is duplicated in charnames.pm
142 if ($arg =~ /^[1-9]\d*$/) {
144 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
151 # Lingua::KO::Hangul::Util is not part of the standard distribution
152 # but it will be used if available.
154 eval { require Lingua::KO::Hangul::Util };
155 my $hasHangulUtil = ! $@;
156 if ($hasHangulUtil) {
157 Lingua::KO::Hangul::Util->import();
160 sub hangul_decomp { # internal: called from charinfo
161 if ($hasHangulUtil) {
162 my @tmp = decomposeHangul(shift);
163 return sprintf("%04X %04X", @tmp) if @tmp == 2;
164 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
169 sub hangul_charname { # internal: called from charinfo
170 return sprintf("HANGUL SYLLABLE-%04X", shift);
173 sub han_charname { # internal: called from charinfo
174 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
177 my @CharinfoRanges = (
179 # [ first, last, coderef to name, coderef to decompose ],
180 # CJK Ideographs Extension A
181 [ 0x3400, 0x4DB5, \&han_charname, undef ],
183 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
185 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
186 # Non-Private Use High Surrogates
187 [ 0xD800, 0xDB7F, undef, undef ],
188 # Private Use High Surrogates
189 [ 0xDB80, 0xDBFF, undef, undef ],
191 [ 0xDC00, 0xDFFF, undef, undef ],
192 # The Private Use Area
193 [ 0xE000, 0xF8FF, undef, undef ],
194 # CJK Ideographs Extension B
195 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
196 # Plane 15 Private Use Area
197 [ 0xF0000, 0xFFFFD, undef, undef ],
198 # Plane 16 Private Use Area
199 [ 0x100000, 0x10FFFD, undef, undef ],
204 my $code = _getcode($arg);
205 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
206 unless defined $code;
207 my $hexk = sprintf("%06X", $code);
208 my($rcode,$rname,$rdec);
209 foreach my $range (@CharinfoRanges){
210 if ($range->[0] <= $code && $code <= $range->[1]) {
213 $rcode = sprintf("%04X", hex($rcode));
214 $rname = $range->[2] ? $range->[2]->($code) : '';
215 $rdec = $range->[3] ? $range->[3]->($code) : '';
216 $hexk = sprintf("%06X", $range->[0]); # replace by the first
220 openunicode(\$UNICODEFH, "UnicodeData.txt");
221 if (defined $UNICODEFH) {
222 use Search::Dict 1.02;
223 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
224 my $line = <$UNICODEFH>;
225 return unless defined $line;
230 combining bidi decomposition
231 decimal digit numeric
232 mirrored unicode10 comment
234 )} = split(/;/, $line, -1);
236 $hexk = sprintf("%04X", hex($hexk));
237 if ($prop{code} eq $hexk) {
238 $prop{block} = charblock($code);
239 $prop{script} = charscript($code);
241 $prop{code} = $rcode;
242 $prop{name} = $rname;
243 $prop{decomposition} = $rdec;
252 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
253 my ($table, $lo, $hi, $code) = @_;
257 my $mid = int(($lo+$hi) / 2);
259 if ($table->[$mid]->[0] < $code) {
260 if ($table->[$mid]->[1] >= $code) {
261 return $table->[$mid]->[2];
263 _search($table, $mid + 1, $hi, $code);
265 } elsif ($table->[$mid]->[0] > $code) {
266 _search($table, $lo, $mid - 1, $code);
268 return $table->[$mid]->[2];
273 my ($range, $arg) = @_;
274 my $code = _getcode($arg);
275 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
276 unless defined $code;
277 _search($range, 0, $#$range, $code);
282 use Unicode::UCD 'charblock';
284 my $charblock = charblock(0x41);
285 my $charblock = charblock(1234);
286 my $charblock = charblock("0x263a");
287 my $charblock = charblock("U+263a");
289 my $range = charblock('Armenian');
291 With a B<code point argument> charblock() returns the I<block> the character
292 belongs to, e.g. C<Basic Latin>. Note that not all the character
293 positions within all blocks are defined.
295 See also L</Blocks versus Scripts>.
297 If supplied with an argument that can't be a code point, charblock() tries
298 to do the opposite and interpret the argument as a character block. The
299 return value is a I<range>: an anonymous list of lists that contain
300 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
301 a code point is in a range using the L</charinrange> function. If the
302 argument is not a known character block, C<undef> is returned.
311 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
313 while (<$BLOCKSFH>) {
314 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
315 my ($lo, $hi) = (hex($1), hex($2));
316 my $subrange = [ $lo, $hi, $3 ];
317 push @BLOCKS, $subrange;
318 push @{$BLOCKS{$3}}, $subrange;
329 _charblocks() unless @BLOCKS;
331 my $code = _getcode($arg);
334 _search(\@BLOCKS, 0, $#BLOCKS, $code);
336 if (exists $BLOCKS{$arg}) {
337 return dclone $BLOCKS{$arg};
346 use Unicode::UCD 'charscript';
348 my $charscript = charscript(0x41);
349 my $charscript = charscript(1234);
350 my $charscript = charscript("U+263a");
352 my $range = charscript('Thai');
354 With a B<code point argument> charscript() returns the I<script> the
355 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
357 See also L</Blocks versus Scripts>.
359 If supplied with an argument that can't be a code point, charscript() tries
360 to do the opposite and interpret the argument as a character script. The
361 return value is a I<range>: an anonymous list of lists that contain
362 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
363 code point is in a range using the L</charinrange> function. If the
364 argument is not a known character script, C<undef> is returned.
373 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
375 while (<$SCRIPTSFH>) {
376 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
377 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
379 $script =~ s/\b(\w)/uc($1)/ge;
380 my $subrange = [ $lo, $hi, $script ];
381 push @SCRIPTS, $subrange;
382 push @{$SCRIPTS{$script}}, $subrange;
386 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
394 _charscripts() unless @SCRIPTS;
396 my $code = _getcode($arg);
399 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
401 if (exists $SCRIPTS{$arg}) {
402 return dclone $SCRIPTS{$arg};
411 use Unicode::UCD 'charblocks';
413 my $charblocks = charblocks();
415 charblocks() returns a reference to a hash with the known block names
416 as the keys, and the code point ranges (see L</charblock>) as the values.
418 See also L</Blocks versus Scripts>.
423 _charblocks() unless %BLOCKS;
424 return dclone \%BLOCKS;
429 use Unicode::UCD 'charscripts';
431 my $charscripts = charscripts();
433 charscripts() returns a reference to a hash with the known script
434 names as the keys, and the code point ranges (see L</charscript>) as
437 See also L</Blocks versus Scripts>.
442 _charscripts() unless %SCRIPTS;
443 return dclone \%SCRIPTS;
446 =head2 Blocks versus Scripts
448 The difference between a block and a script is that scripts are closer
449 to the linguistic notion of a set of characters required to represent
450 languages, while a block is more of an artifact of the Unicode character
451 numbering and separation into blocks of (mostly) 256 characters.
453 For example the Latin B<script> is spread over several B<blocks>, such
454 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
455 C<Latin Extended-B>. On the other hand, the Latin script does not
456 contain all the characters of the C<Basic Latin> block (also known as
457 ASCII): it includes only the letters, and not, for example, the digits
460 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
462 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
464 =head2 Matching Scripts and Blocks
466 Scripts are matched with the regular-expression construct
467 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
468 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
469 any of the 256 code points in the Tibetan block).
471 =head2 Code Point Arguments
473 A I<code point argument> is either a decimal or a hexadecimal scalar
474 designating a Unicode character, or C<U+> followed by hexadecimals
475 designating a Unicode character. In other words, if you want a code
476 point to be interpreted as a hexadecimal number, you must prefix it
477 with either C<0x> or C<U+>, because a string like e.g. C<123> will
478 be interpreted as a decimal code point. Also note that Unicode is
479 B<not> limited to 16 bits (the number of Unicode characters is
480 open-ended, in theory unlimited): you may have more than 4 hexdigits.
484 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
485 can also test whether a code point is in the I<range> as returned by
486 L</charblock> and L</charscript> or as the values of the hash returned
487 by L</charblocks> and L</charscripts> by using charinrange():
489 use Unicode::UCD qw(charscript charinrange);
491 $range = charscript('Hiragana');
492 print "looks like hiragana\n" if charinrange($range, $codepoint);
496 my %GENERAL_CATEGORIES =
499 'LC' => 'CasedLetter',
500 'Lu' => 'UppercaseLetter',
501 'Ll' => 'LowercaseLetter',
502 'Lt' => 'TitlecaseLetter',
503 'Lm' => 'ModifierLetter',
504 'Lo' => 'OtherLetter',
506 'Mn' => 'NonspacingMark',
507 'Mc' => 'SpacingMark',
508 'Me' => 'EnclosingMark',
510 'Nd' => 'DecimalNumber',
511 'Nl' => 'LetterNumber',
512 'No' => 'OtherNumber',
513 'P' => 'Punctuation',
514 'Pc' => 'ConnectorPunctuation',
515 'Pd' => 'DashPunctuation',
516 'Ps' => 'OpenPunctuation',
517 'Pe' => 'ClosePunctuation',
518 'Pi' => 'InitialPunctuation',
519 'Pf' => 'FinalPunctuation',
520 'Po' => 'OtherPunctuation',
522 'Sm' => 'MathSymbol',
523 'Sc' => 'CurrencySymbol',
524 'Sk' => 'ModifierSymbol',
525 'So' => 'OtherSymbol',
527 'Zs' => 'SpaceSeparator',
528 'Zl' => 'LineSeparator',
529 'Zp' => 'ParagraphSeparator',
534 'Co' => 'PrivateUse',
535 'Cn' => 'Unassigned',
538 sub general_categories {
539 return dclone \%GENERAL_CATEGORIES;
542 =head2 general_categories
544 use Unicode::UCD 'general_categories';
546 my $categories = general_categories();
548 general_categories() returns a reference to a hash which has short
549 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
550 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
551 C<Symbol>) as values. The hash is reversible in case you need to go
552 from the long names to the short names. The general category is the
553 one returned from charinfo() under the C<category> key.
559 'L' => 'Left-to-Right',
560 'LRE' => 'Left-to-Right Embedding',
561 'LRO' => 'Left-to-Right Override',
562 'R' => 'Right-to-Left',
563 'AL' => 'Right-to-Left Arabic',
564 'RLE' => 'Right-to-Left Embedding',
565 'RLO' => 'Right-to-Left Override',
566 'PDF' => 'Pop Directional Format',
567 'EN' => 'European Number',
568 'ES' => 'European Number Separator',
569 'ET' => 'European Number Terminator',
570 'AN' => 'Arabic Number',
571 'CS' => 'Common Number Separator',
572 'NSM' => 'Non-Spacing Mark',
573 'BN' => 'Boundary Neutral',
574 'B' => 'Paragraph Separator',
575 'S' => 'Segment Separator',
576 'WS' => 'Whitespace',
577 'ON' => 'Other Neutrals',
581 return dclone \%BIDI_TYPES;
586 use Unicode::UCD 'bidi_types';
588 my $categories = bidi_types();
590 bidi_types() returns a reference to a hash which has the short
591 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
592 names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
593 hash is reversible in case you need to go from the long names to the
594 short names. The bidi type is the one returned from charinfo()
595 under the C<bidi> key. For the exact meaning of the various bidi classes
596 the Unicode TR9 is recommended reading:
597 http://www.unicode.org/reports/tr9/tr9-17.html
598 (as of Unicode 5.0.0)
604 use Unicode::UCD 'compexcl';
606 my $compexcl = compexcl("09dc");
608 compexcl() returns the composition exclusion (that is, whether the
609 character should not be produced during a precomposition) of the
610 character specified by a B<code point argument>.
612 If there is a composition exclusion for the character, true is
613 returned. Otherwise, false is returned.
621 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
623 while (<$COMPEXCLFH>) {
624 if (/^([0-9A-F]+)\s+\#\s+/) {
626 $COMPEXCL{$code} = undef;
636 my $code = _getcode($arg);
637 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
638 unless defined $code;
640 _compexcl() unless %COMPEXCL;
642 return exists $COMPEXCL{$code};
647 use Unicode::UCD 'casefold';
649 my $casefold = casefold("00DF");
651 casefold() returns the locale-independent case folding of the
652 character specified by a B<code point argument>.
654 If there is a case folding for that character, a reference to a hash
655 with the following fields is returned:
659 code code point with at least four hexdigits
660 status "C", "F", "S", or "I"
661 mapping one or more codes separated by spaces
663 The meaning of the I<status> is as follows:
665 C common case folding, common mappings shared
666 by both simple and full mappings
667 F full case folding, mappings that cause strings
668 to grow in length. Multiple characters are separated
670 S simple case folding, mappings to single characters
671 where different from F
672 I special case for dotted uppercase I and
674 - If this mapping is included, the result is
675 case-insensitive, but dotless and dotted I's
676 are not distinguished
677 - If this mapping is excluded, the result is not
678 fully case-insensitive, but dotless and dotted
679 I's are distinguished
681 If there is no case folding for that character, C<undef> is returned.
683 For more information about case mappings see
684 http://www.unicode.org/unicode/reports/tr21/
692 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
694 while (<$CASEFOLDFH>) {
695 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
697 $CASEFOLD{$code} = { code => $1,
709 my $code = _getcode($arg);
710 croak __PACKAGE__, "::casefold: unknown code '$arg'"
711 unless defined $code;
713 _casefold() unless %CASEFOLD;
715 return $CASEFOLD{$code};
720 use Unicode::UCD 'casespec';
722 my $casespec = casespec("FB00");
724 casespec() returns the potentially locale-dependent case mapping
725 of the character specified by a B<code point argument>. The mapping
726 may change the length of the string (which the basic Unicode case
727 mappings as returned by charinfo() never do).
729 If there is a case mapping for that character, a reference to a hash
730 with the following fields is returned:
734 code code point with at least four hexdigits
738 condition condition list (may be undef)
740 The C<condition> is optional. Where present, it consists of one or
741 more I<locales> or I<contexts>, separated by spaces (other than as
742 used to separate elements, spaces are to be ignored). A condition
743 list overrides the normal behavior if all of the listed conditions are
744 true. Case distinctions in the condition list are not significant.
745 Conditions preceded by "NON_" represent the negation of the condition.
747 Note that when there are multiple case mapping definitions for a
748 single code point because of different locales, the value returned by
749 casespec() is a hash reference which has the locales as the keys and
750 hash references as described above as the values.
752 A I<locale> is defined as a 2-letter ISO 639 language code, possibly
753 followed by a "_" and a 2-letter ISO 3166 country code (possibly followed
754 by a "_" and a variant code). For the lists of those codes,
755 see L<Locale::Language> and L<Locale::Country>.
757 A I<context> is one of the following choices:
759 FINAL The letter is not followed by a letter of
760 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
761 MODERN The mapping is only used for modern text
762 AFTER_i The last base character was "i" (U+0069)
764 For more information about case mappings see
765 http://www.unicode.org/unicode/reports/tr21/
773 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
775 while (<$CASESPECFH>) {
776 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
777 my ($hexcode, $lower, $title, $upper, $condition) =
778 ($1, $2, $3, $4, $5);
779 my $code = hex($hexcode);
780 if (exists $CASESPEC{$code}) {
781 if (exists $CASESPEC{$code}->{code}) {
786 @{$CASESPEC{$code}}{qw(lower
790 if (defined $oldcondition) {
792 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
793 delete $CASESPEC{$code};
794 $CASESPEC{$code}->{$oldlocale} =
799 condition => $oldcondition };
803 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
804 $CASESPEC{$code}->{$locale} =
809 condition => $condition };
816 condition => $condition };
827 my $code = _getcode($arg);
828 croak __PACKAGE__, "::casespec: unknown code '$arg'"
829 unless defined $code;
831 _casespec() unless %CASESPEC;
833 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
838 use Unicode::UCD 'namedseq';
840 my $namedseq = namedseq("KATAKANA LETTER AINU P");
841 my @namedseq = namedseq("KATAKANA LETTER AINU P");
842 my %namedseq = namedseq();
844 If used with a single argument in a scalar context, returns the string
845 consisting of the code points of the named sequence, or C<undef> if no
846 named sequence by that name exists. If used with a single argument in
847 a list context, returns a list of the code points. If used with no
848 arguments in a list context, returns a hash with the names of the
849 named sequences as the keys and the named sequences as strings as
850 the values. Otherwise, returns C<undef> or an empty list depending
853 (New from Unicode 4.1.0)
861 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
863 while (<$NAMEDSEQFH>) {
864 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
865 my ($n, $s) = ($1, $2);
866 my @s = map { chr(hex($_)) } split(' ', $s);
867 $NAMEDSEQ{$n} = join("", @s);
876 _namedseq() unless %NAMEDSEQ;
877 my $wantarray = wantarray();
878 if (defined $wantarray) {
883 my $s = $NAMEDSEQ{ $_[0] };
884 return defined $s ? map { ord($_) } split('', $s) : ();
887 return $NAMEDSEQ{ $_[0] };
893 =head2 Unicode::UCD::UnicodeVersion
895 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
896 Character Database, in other words, the version of the Unicode
897 standard the database implements. The version is a string
898 of numbers delimited by dots (C<'.'>).
905 unless (defined $UNICODEVERSION) {
906 openunicode(\$VERSIONFH, "version");
907 chomp($UNICODEVERSION = <$VERSIONFH>);
909 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
910 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
912 return $UNICODEVERSION;
915 =head2 Implementation Note
917 The first use of charinfo() opens a read-only filehandle to the Unicode
918 Character Database (the database is included in the Perl distribution).
919 The filehandle is then kept open for further queries. In other words,
920 if you are wondering where one of your filehandles went, that's where.
924 Does not yet support EBCDIC platforms.