6 our $VERSION = '3.1.0';
10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo charblock charscript);
17 Unicode::UCD - Unicode character database
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
30 use Unicode::UCD 'charscript';
31 my $charscript = charscript($codepoint);
35 The Unicode::UCD module offers a simple interface to the Unicode Character
45 my ($rfh, @path) = @_;
47 unless (defined $$rfh) {
50 $f = File::Spec->catfile($d, "unicode", @path);
51 last if open($$rfh, $f);
54 croak __PACKAGE__, ": failed to find ",
55 File::Spec->catfile(@path), " in @INC"
63 use Unicode::UCD 'charinfo';
65 my %charinfo = charinfo(0x41);
67 charinfo() returns a hash that has the following fields as defined
68 by the Unicode standard:
72 code code point with at least four hexdigits
73 name name of the character IN UPPER CASE
74 category general category of the character
75 combining classes used in the Canonical Ordering Algorithm
76 bidi bidirectional category
77 decomposition character decomposition mapping
78 decimal if decimal digit this is the integer numeric value
79 digit if digit this is the numeric value
80 numeric if numeric this is the integer or rational numeric value
81 mirrored if mirrored in bidirectional text
82 unicode10 Unicode 1.0 name if existed and different
83 comment ISO 10646 comment field
84 upper uppercase equivalent mapping
85 lower lowercase equivalent mapping
86 title titlecase equivalent mapping
88 block block the character belongs to (used in \p{In...})
89 script script the character belongs to
91 If no match is found, an empty hash is returned.
93 The C<block> property is the same as returned by charblock(). It is
94 not defined in the Unicode Character Database proper (Chapter 4 of the
95 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
96 of TUS3). Similarly for the C<script> property.
98 Note that you cannot do (de)composition and casing based solely on the
99 above C<decomposition> and C<lower>, C<upper>, C<title> properties;
100 you will also need the I<Composition Exclusions>, I<Case Folding>, and
101 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
102 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
108 my $hexk = sprintf("%04X", $code);
110 openunicode(\$UNICODE, "Unicode.txt");
111 if (defined $UNICODE) {
113 if (look($UNICODE, "$hexk;") >= 0) {
114 my $line = <$UNICODE>;
119 combining bidi decomposition
120 decimal digit numeric
121 mirrored unicode10 comment
123 )} = split(/;/, $line, -1);
124 if ($prop{code} eq $hexk) {
125 $prop{block} = charblock($code);
126 $prop{script} = charscript($code);
134 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
135 my ($table, $lo, $hi, $code) = @_;
139 my $mid = int(($lo+$hi) / 2);
141 if ($table->[$mid]->[0] < $code) {
142 if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
143 return $table->[$mid]->[2];
145 _search($table, $mid + 1, $hi, $code);
147 } elsif ($table->[$mid]->[0] > $code) {
148 _search($table, $lo, $mid - 1, $code);
150 return $table->[$mid]->[2];
156 use Unicode::UCD 'charblock';
158 my $charblock = charblock(0x41);
160 charblock() returns the block the character belongs to, e.g.
161 C<Basic Latin>. Note that not all the character positions within all
164 The name is the same name that is used in the C<\p{In...}> construct,
165 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
166 away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
167 instead of C<Latin Extended-A>).
177 if (openunicode(\$BLOCKS, "Blocks.txt")) {
179 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
180 push @BLOCKS, [ hex($1), hex($2), $3 ];
187 _search(\@BLOCKS, 0, $#BLOCKS, $code);
192 use Unicode::UCD 'charscript';
194 my $charscript = charscript(0x41);
196 charscript() returns the script the character belongs to, e.g.
197 C<Latin>, C<Greek>, C<Han>.
199 Unfortunately, currently (Perl 5.8.0) there is no regular expression
200 notation for matching scripts as there is for blocks (C<\p{In...}>).
210 if (openunicode(\$SCRIPTS, "Scripts.txt")) {
212 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
213 push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
217 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
221 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
224 =head2 charblock versus charscript
226 The difference between a character block and a script is that scripts
227 are closer to the linguistic notion of a set of characters required to
228 present languages, while block is more of an artifact of the Unicode
229 character numbering. For example the Latin B<script> is spread over
230 several B<blocks>, such as C<Basic Latin>, C<Latin-1 Supplement>,
231 C<Latin Extended-A>, and C<Latin Extended-B>. On the other hand, the
232 Latin script does not contain all the characters of the C<Basic Latin>
233 block (also known as ASCII): it includes only the letters, not for
234 example the digits or the punctuation.
236 For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt
238 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
240 Note also that the script names are all in uppercase, e.g. C<HEBREW>,
241 while the block names are Capitalized and with intermixed spaces,
242 e.g. C<Yi Syllables>.
278 =head1 IMPLEMENTATION NOTE
280 The first use of charinfo() opens a read-only filehandle to the Unicode
281 Character Database (the database is included in the Perl distribution).
282 The filehandle is then kept open for further queries.