6 our $VERSION = '3.1.0';
10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo charblock charscript);
17 Unicode::UCD - Unicode character database
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
30 use Unicode::UCD 'charscript';
31 my $charscript = charblock($codepoint);
35 The Unicode module offers a simple interface to the Unicode Character
45 my ($rfh, @path) = @_;
47 unless (defined $$rfh) {
50 $f = File::Spec->catfile($d, "unicode", @path);
51 last if open($$rfh, $f);
54 croak __PACKAGE__, ": failed to find ",
55 File::Spec->catfile(@path), " in @INC"
63 use Unicode::UCD 'charinfo';
65 my %charinfo = charinfo(0x41);
67 charinfo() returns a hash that has the following fields as defined
68 by the Unicode standard:
72 code code point with at least four hexdigits
73 name name of the character IN UPPER CASE
74 category general category of the character
75 combining classes used in the Canonical Ordering Algorithm
76 bidi bidirectional category
77 decomposition character decomposition mapping
78 decimal if decimal digit this is the integer numeric value
79 digit if digit this is the numeric value
80 numeric if numeric is the integer or rational numeric value
81 mirrored if mirrored in bidirectional text
82 unicode10 Unicode 1.0 name if existed and different
83 comment ISO 10646 comment field
84 upper uppercase equivalent mapping
85 lower lowercase equivalent mapping
86 title titlecase equivalent mapping
88 block block the character belongs to (used in \p{In...})
89 script script the character belongs to
91 If no match is found, an empty hash is returned.
93 The C<block> property is the same as as returned by charinfo(). It is
94 not defined in the Unicode Character Database proper (Chapter 4 of the
95 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
96 of TUS3). Similarly for the C<script> property.
98 Note that you cannot do (de)composition and casing based solely on the
99 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
100 you will need also the I<Composition Exclusions>, I<Case Folding>, and
101 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
102 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
108 my $hexk = sprintf("%04X", $code);
110 openunicode(\$UNICODE, "Unicode.txt");
111 if (defined $UNICODE) {
113 if (look($UNICODE, "$hexk;") >= 0) {
114 my $line = <$UNICODE>;
119 combining bidi decomposition
120 decimal digit numeric
121 mirrored unicode10 comment
123 )} = split(/;/, $line, -1);
124 if ($prop{code} eq $hexk) {
125 $prop{block} = charblock($code);
126 $prop{script} = charscript($code);
134 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
135 my ($table, $lo, $hi, $code) = @_;
139 my $mid = int(($lo+$hi) / 2);
141 if ($table->[$mid]->[0] < $code) {
142 if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
143 return $table->[$mid]->[2];
145 _search($table, $mid + 1, $hi, $code);
147 } elsif ($table->[$mid]->[0] > $code) {
148 _search($table, $lo, $mid - 1, $code);
150 return $table->[$mid]->[2];
156 use Unicode::UCD 'charblock';
158 my $charblock = charblock(0x41);
160 charblock() returns the block the character belongs to, e.g.
161 C<Basic Latin>. Note that not all the character positions within all
172 if (openunicode(\$BLOCKS, "Blocks.txt")) {
174 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
175 push @BLOCKS, [ hex($1), hex($2), $3 ];
182 _search(\@BLOCKS, 0, $#BLOCKS, $code);
187 use Unicode::UCD 'charscript';
189 my $charscript = charscript(0x41);
191 charscript() returns the script the character belongs to, e.g.
192 C<Latin>, C<Greek>, C<Han>.
202 if (openunicode(\$SCRIPTS, "Scripts.txt")) {
204 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
205 push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
209 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
213 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
216 =head2 charblock versus charscript
218 The difference between a character block and a script is that scripts
219 are closer to the linguistic notion of a set of characters required to
220 present languages, while block is more of an artifact of the Unicode
221 character numbering and separation into blocks of 256 characters.
223 For example the Latin B<script> is spread over several B<blocks>, such
224 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
225 C<Latin Extended-B>. On the other hand, the Latin script does not
226 contain all the characters of the C<Basic Latin> block (also known as
227 the ASCII): it includes only the letters, not for example the digits
230 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
232 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
234 =head2 Matching Scripts and Blocks
236 Both scripts and blocks can be matched using the regular expression
237 construct C<\p{In...}> and its negation C<\P{In...}>.
239 The name of the script or the block comes after the C<In>, for example
240 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
241 squished away from the names for the C<\p{In...}>, for example
242 C<LatinExtendedA> instead of C<Latin Extended-A>. There are a few
243 cases where there exists both a script and a block by the same name,
244 in these cases the block version has C<Block> appended: C<\p{InKatakana}>
245 is the script, C<\p{InKatakanaBlock}> is the block.
247 =head2 Implementation Note
249 The first use of charinfo() opens a read-only filehandle to the Unicode
250 Character Database (the database is included in the Perl distribution).
251 The filehandle is then kept open for further queries.