10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo
13 charblocks charscripts
22 UnicodeCD - Unicode character database
26 use UnicodeCD 'charinfo';
27 my $charinfo = charinfo($codepoint);
29 use UnicodeCD 'charblock';
30 my $charblock = charblock($codepoint);
32 use UnicodeCD 'charscript';
33 my $charscript = charscript($codepoint);
37 The UnicodeCD module offers a simple interface to the Unicode Character
51 my ($rfh, @path) = @_;
53 unless (defined $$rfh) {
56 $f = File::Spec->catfile($d, "unicode", @path);
57 last if open($$rfh, $f);
60 croak __PACKAGE__, ": failed to find ",
61 File::Spec->catfile(@path), " in @INC"
69 use UnicodeCD 'charinfo';
71 my $charinfo = charinfo(0x41);
73 charinfo() returns a reference to a hash that has the following fields
74 as defined by the Unicode standard:
78 code code point with at least four hexdigits
79 name name of the character IN UPPER CASE
80 category general category of the character
81 combining classes used in the Canonical Ordering Algorithm
82 bidi bidirectional category
83 decomposition character decomposition mapping
84 decimal if decimal digit this is the integer numeric value
85 digit if digit this is the numeric value
86 numeric if numeric is the integer or rational numeric value
87 mirrored if mirrored in bidirectional text
88 unicode10 Unicode 1.0 name if existed and different
89 comment ISO 10646 comment field
90 upper uppercase equivalent mapping
91 lower lowercase equivalent mapping
92 title titlecase equivalent mapping
94 block block the character belongs to (used in \p{In...})
95 script script the character belongs to
97 If no match is found, a reference to an empty hash is returned.
99 The C<block> property is the same as returned by charblock(). It is
100 not defined in the Unicode Character Database proper (Chapter 4 of the
101 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
102 of TUS3). Similarly for the C<script> property.
104 Note that you cannot do (de)composition and casing based solely on the
105 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
106 you will need also the compexcl(), casefold(), and casespec() functions.
113 if ($arg =~ /^\d+$/) {
115 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
124 my $code = _getcode($arg);
125 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
126 unless defined $code;
127 my $hexk = sprintf("%04X", $code);
129 openunicode(\$UNICODEFH, "Unicode.txt");
130 if (defined $UNICODEFH) {
132 if (look($UNICODEFH, "$hexk;") >= 0) {
133 my $line = <$UNICODEFH>;
138 combining bidi decomposition
139 decimal digit numeric
140 mirrored unicode10 comment
142 )} = split(/;/, $line, -1);
143 if ($prop{code} eq $hexk) {
144 $prop{block} = charblock($code);
145 $prop{script} = charscript($code);
153 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
154 my ($table, $lo, $hi, $code) = @_;
158 my $mid = int(($lo+$hi) / 2);
160 if ($table->[$mid]->[0] < $code) {
161 if ($table->[$mid]->[1] >= $code) {
162 return $table->[$mid]->[2];
164 _search($table, $mid + 1, $hi, $code);
166 } elsif ($table->[$mid]->[0] > $code) {
167 _search($table, $lo, $mid - 1, $code);
169 return $table->[$mid]->[2];
174 my ($range, $arg) = @_;
175 my $code = _getcode($arg);
176 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
177 unless defined $code;
178 _search($range, 0, $#$range, $code);
183 use UnicodeCD 'charblock';
185 my $charblock = charblock(0x41);
186 my $charblock = charblock(1234);
187 my $charblock = charblock("0x263a");
188 my $charblock = charblock("U+263a");
190 my $ranges = charblock('Armenian');
192 With a B<code point argument> charblock() returns the block the character
193 belongs to, e.g. C<Basic Latin>. Note that not all the character
194 positions within all blocks are defined.
196 If supplied with an argument that can't be a code point, charblock()
197 tries to do the opposite and interpret the argument as a character
198 block. The return value is a I<range>: an anonymous list that
199 contains anonymous lists, which in turn contain I<start-of-range>,
200 I<end-of-range> code point pairs. You can test whether a code point
201 is in a range using the L</charinrange> function. If the argument is
202 not a known character block, C<undef> is returned.
211 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
212 while (<$BLOCKSFH>) {
213 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
214 my ($lo, $hi) = (hex($1), hex($2));
215 my $subrange = [ $lo, $hi, $3 ];
216 push @BLOCKS, $subrange;
217 push @{$BLOCKS{$3}}, $subrange;
228 _charblocks() unless @BLOCKS;
230 my $code = _getcode($arg);
233 _search(\@BLOCKS, 0, $#BLOCKS, $code);
235 if (exists $BLOCKS{$arg}) {
236 return $BLOCKS{$arg};
245 use UnicodeCD 'charscript';
247 my $charscript = charscript(0x41);
248 my $charscript = charscript(1234);
249 my $charscript = charscript("U+263a");
251 my $ranges = charscript('Thai');
253 With a B<code point argument> charscript() returns the script the
254 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
256 If supplied with an argument that can't be a code point, charscript()
257 tries to do the opposite and interpret the argument as a character
258 script. The return value is a I<range>: an anonymous list that
259 contains anonymous lists, which in turn contain I<start-of-range>,
260 I<end-of-range> code point pairs. You can test whether a code point
261 is in a range using the L</charinrange> function. If the argument is
262 not a known character script, C<undef> is returned.
271 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
272 while (<$SCRIPTSFH>) {
273 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
274 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
276 $script =~ s/\b(\w)/uc($1)/ge;
277 my $subrange = [ $lo, $hi, $script ];
278 push @SCRIPTS, $subrange;
279 push @{$SCRIPTS{$script}}, $subrange;
283 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
291 _charscripts() unless @SCRIPTS;
293 my $code = _getcode($arg);
296 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
298 if (exists $SCRIPTS{$arg}) {
299 return $SCRIPTS{$arg};
308 use UnicodeCD 'charblocks';
310 my $charblocks = charblocks();
312 charblocks() returns a reference to a hash with the known block names
313 as the keys, and the code point ranges (see L</charblock>) as the values.
318 _charblocks() unless %BLOCKS;
324 use UnicodeCD 'charscripts';
326 my %charscripts = charscripts();
328 charscripts() returns a hash with the known script names as the keys,
329 and the code point ranges (see L</charscript>) as the values.
334 _charscripts() unless %SCRIPTS;
338 =head2 Blocks versus Scripts
340 The difference between a block and a script is that scripts are closer
341 to the linguistic notion of a set of characters required to present
342 languages, while a block is more of an artifact of the Unicode character
343 numbering and separation into blocks of 256 characters.
345 For example the Latin B<script> is spread over several B<blocks>, such
346 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
347 C<Latin Extended-B>. On the other hand, the Latin script does not
348 contain all the characters of the C<Basic Latin> block (also known as
349 ASCII): it includes only the letters, not for example the digits
352 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
354 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
356 =head2 Matching Scripts and Blocks
358 Both scripts and blocks can be matched using the regular expression
359 construct C<\p{In...}> and its negation C<\P{In...}>.
361 The name of the script or the block comes after the C<In>, for example
362 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
363 removed from the names for the C<\p{In...}>, for example
364 C<LatinExtendedA> instead of C<Latin Extended-A>.
366 There are a few cases where there exists both a script and a block by
367 the same name, in these cases the block version has C<Block> appended:
368 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
370 =head2 Code Point Arguments
372 A I<code point argument> is either a decimal or a hexadecimal scalar,
373 or "U+" followed by hexadecimals.
377 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
378 can also test whether a code point is in the I<range> as returned by
379 L</charblock> and L</charscript> or as the values of the hash returned
380 by L</charblocks> and L</charscripts> by using charinrange():
382 use UnicodeCD qw(charscript charinrange);
384 $range = charscript('Hiragana');
385 print "looks like hiragana\n" if charinrange($range, $code);
391 use UnicodeCD 'compexcl';
393 my $compexcl = compexcl("09dc");
395 The compexcl() returns the composition exclusion (that is, if the
396 character should not be produced during a precomposition) of the
397 character specified by a B<code point argument>.
399 If there is a composition exclusion for the character, true is
400 returned. Otherwise, false is returned.
408 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
409 while (<$COMPEXCLFH>) {
410 if (/^([0-9A-F]+) \# /) {
412 $COMPEXCL{$code} = undef;
422 my $code = _getcode($arg);
424 _compexcl() unless %COMPEXCL;
426 return exists $COMPEXCL{$code};
431 use UnicodeCD 'casefold';
433 my $casefold = casefold("09dc");
435 The casefold() returns the locale-independent case folding of the
436 character specified by a B<code point argument>.
438 If there is a case folding for that character, a reference to a hash
439 with the following fields is returned:
443 code code point with at least four hexdigits
444 status "C", "F", "S", or "I"
445 mapping one or more codes separated by spaces
447 The meaning of the I<status> is as follows:
449 C common case folding, common mappings shared
450 by both simple and full mappings
451 F full case folding, mappings that cause strings
452 to grow in length. Multiple characters are separated
454 S simple case folding, mappings to single characters
455 where different from F
456 I special case for dotted uppercase I and
458 - If this mapping is included, the result is
459 case-insensitive, but dotless and dotted I's
460 are not distinguished
461 - If this mapping is excluded, the result is not
462 fully case-insensitive, but dotless and dotted
463 I's are distinguished
465 If there is no case folding for that character, C<undef> is returned.
467 For more information about case mappings see
468 http://www.unicode.org/unicode/reports/tr21/
476 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
477 while (<$CASEFOLDFH>) {
478 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
480 $CASEFOLD{$code} = { code => $1,
492 my $code = _getcode($arg);
494 _casefold() unless %CASEFOLD;
496 return $CASEFOLD{$code};
501 use UnicodeCD 'casespec';
503 my $casespec = casespec("09dc");
505 The casespec() returns the potentially locale-dependent case mapping
506 of the character specified by a B<code point argument>. The mapping
507 may change the length of the string (which the basic Unicode case
508 mappings as returned by charinfo() never do).
510 If there is a special case mapping for that character, a reference to a hash
511 with the following fields is returned:
515 code code point with at least four hexdigits
519 condition condition list (may be undef)
521 The C<condition> is optional. Where present, it consists of one or
522 more I<locales> or I<contexts>, separated by spaces (other than as
523 used to separate elements, spaces are to be ignored). A condition
524 list overrides the normal behavior if all of the listed conditions are
525 true. Case distinctions in the condition list are not significant.
526 Conditions preceded by "NON_" represent the negation of the condition
528 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
529 followed by a "_" and a 2-letter ISO language code (possibly followed
530 by a "_" and a variant code). You can find the list of those codes
531 in L<Locale::Country> and L<Locale::Language>.
533 A I<context> is one of the following choices:
535 FINAL The letter is not followed by a letter of
536 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
537 MODERN The mapping is only used for modern text
538 AFTER_i The last base character was "i" 0069
540 For more information about case mappings see
541 http://www.unicode.org/unicode/reports/tr21/
549 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
550 while (<$CASESPECFH>) {
551 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
553 $CASESPEC{$code} = { code => $1,
567 my $code = _getcode($arg);
569 _casespec() unless %CASESPEC;
571 return $CASESPEC{$code};
574 =head2 UnicodeCD::UnicodeVersion
576 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
577 Database, in other words, the version of the Unicode standard the
585 unless (defined $UNICODEVERSION) {
586 openunicode(\$VERSIONFH, "version");
587 chomp($UNICODEVERSION = <$VERSIONFH>);
589 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
590 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
592 return $UNICODEVERSION;
595 =head2 Implementation Note
597 The first use of charinfo() opens a read-only filehandle to the Unicode
598 Character Database (the database is included in the Perl distribution).
599 The filehandle is then kept open for further queries.