6 our $VERSION = '3.1.0';
10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo charblock charscript);
17 Unicode::UCD - Unicode character database
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
30 use Unicode::UCD 'charscript';
31 my $charscript = charscript($codepoint);
35 The Unicode::UCD module offers a simple interface to the Unicode Character
45 my ($rfh, @path) = @_;
47 unless (defined $$rfh) {
50 $f = File::Spec->catfile($d, "unicode", @path);
51 last if open($$rfh, $f);
54 croak __PACKAGE__, ": failed to find ",
55 File::Spec->catfile(@path), " in @INC"
63 use Unicode::UCD 'charinfo';
65 my %charinfo = charinfo(0x41);
67 charinfo() returns a hash that has the following fields as defined
68 by the Unicode standard:
72 code code point with at least four hexdigits
73 name name of the character IN UPPER CASE
74 category general category of the character
75 combining classes used in the Canonical Ordering Algorithm
76 bidi bidirectional category
77 decomposition character decomposition mapping
78 decimal if decimal digit this is the integer numeric value
79 digit if digit this is the numeric value
80 numeric if numeric this is the integer or rational numeric value
81 mirrored if mirrored in bidirectional text
82 unicode10 Unicode 1.0 name if existed and different
83 comment ISO 10646 comment field
84 upper uppercase equivalent mapping
85 lower lowercase equivalent mapping
86 title titlecase equivalent mapping
88 block block the character belongs to (used in \p{In...})
89 script script the character belongs to
91 If no match is found, an empty hash is returned.
93 The C<block> property is the same as returned by charblock(). It is
94 not defined in the Unicode Character Database proper (Chapter 4 of the
95 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
96 of TUS3). Similarly for the C<script> property.
98 Note that you cannot do (de)composition and casing based solely on the
99 above C<decomposition> and C<lower>, C<upper>, C<title> properties;
100 you will also need the I<Composition Exclusions>, I<Case Folding>, and
101 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
102 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
108 my $hexk = sprintf("%04X", $code);
110 openunicode(\$UNICODE, "Unicode.txt");
111 if (defined $UNICODE) {
113 if (look($UNICODE, "$hexk;") >= 0) {
114 my $line = <$UNICODE>;
119 combining bidi decomposition
120 decimal digit numeric
121 mirrored unicode10 comment
123 )} = split(/;/, $line, -1);
124 if ($prop{code} eq $hexk) {
125 $prop{block} = charblock($code);
133 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
134 my ($table, $lo, $hi, $code) = @_;
138 my $mid = int(($lo+$hi) / 2);
140 if ($table->[$mid]->[0] < $code) {
141 if ($table->[$mid]->[1] >= $code) {
142 return $table->[$mid]->[2];
144 _search($table, $mid + 1, $hi, $code);
146 } elsif ($table->[$mid]->[0] > $code) {
147 _search($table, $lo, $mid - 1, $code);
149 return $table->[$mid]->[2];
155 use Unicode::UCD 'charblock';
157 my $charblock = charblock(0x41);
159 charblock() returns the block the character belongs to, e.g.
160 C<Basic Latin>. Note that not all the character positions within all
163 The name is the same name that is used in the C<\p{In...}> construct,
164 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
165 away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
166 instead of C<Latin Extended-A>).
176 if (openunicode(\$BLOCKS, "Blocks.pl")) {
178 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
179 push @BLOCKS, [ hex($1), hex($2), $3 ];
186 _search(\@BLOCKS, 0, $#BLOCKS, $code);
191 use Unicode::UCD 'charscript';
193 my $charscript = charscript(0x41);
195 charscript() returns the script the character belongs to, e.g.
196 C<Latin>, C<Greek>, C<Han>. Note that not all the character positions
197 within all scripts are defined.
199 The difference between a character block and a script is that script
200 names are closer to the linguistic notion of a set of characters,
201 while block is more of an artifact of the Unicode character numbering.
202 For example the Latin B<script> is spread over several B<blocks>.
204 Note also that the script names are all in uppercase, e.g. C<HEBREW>,
205 while the block names are Capitalized and with intermixed spaces,
206 e.g. C<Yi Syllables>.
208 Unfortunately, currently (Perl 5.8.0) there is no regular expression
209 notation for matching scripts as there is for blocks (C<\p{In...}>).
219 if (openunicode(\$SCRIPTS, "Scripts.txt")) {
221 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
222 push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
226 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
230 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
233 =head1 IMPLEMENTATION NOTE
235 The first use of L</charinfo> opens a read-only filehandle to the Unicode
236 Character Database. The filehandle is kept open for further queries.