6 our $VERSION = '3.1.0';
10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo charblock charscript);
17 Unicode::UCD - Unicode character database
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
30 use Unicode::UCD 'charscript';
31 my $charscript = charscript($codepoint);
35 The Unicode::UCD module offers a simple interface to the Unicode Character
45 my ($rfh, @path) = @_;
47 unless (defined $$rfh) {
50 $f = File::Spec->catfile($d, "unicode", @path);
51 last if open($$rfh, $f);
54 croak __PACKAGE__, ": failed to find ",
55 File::Spec->catfile(@path), " in @INC"
63 use Unicode::UCD 'charinfo';
65 my %charinfo = charinfo(0x41);
67 charinfo() returns a hash that has the following fields as defined
68 by the Unicode standard:
72 code code point with at least four hexdigits
73 name name of the character IN UPPER CASE
74 category general category of the character
75 combining classes used in the Canonical Ordering Algorithm
76 bidi bidirectional category
77 decomposition character decomposition mapping
78 decimal if decimal digit this is the integer numeric value
79 digit if digit this is the numeric value
80 numeric if numeric this is the integer or rational numeric value
81 mirrored if mirrored in bidirectional text
82 unicode10 Unicode 1.0 name if existed and different
83 comment ISO 10646 comment field
84 upper uppercase equivalent mapping
85 lower lowercase equivalent mapping
86 title titlecase equivalent mapping
88 block block the character belongs to (used in \p{In...})
89 script script the character belongs to
91 If no match is found, an empty hash is returned.
93 The C<block> property is the same as returned by charblock(). It is
94 not defined in the Unicode Character Database proper (Chapter 4 of the
95 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
96 of TUS3). Similarly for the C<script> property.
98 Note that you cannot do (de)composition and casing based solely on the
99 above C<decomposition> and C<lower>, C<upper>, C<title> properties;
100 you will also need the I<Composition Exclusions>, I<Case Folding>, and
101 I<SpecialCasing> tables, available as files F<CompExcl.txt>,
102 F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
108 my $hexk = sprintf("%04X", $code);
110 openunicode(\$UNICODE, "Unicode.txt");
111 if (defined $UNICODE) {
113 if (look($UNICODE, "$hexk;") >= 0) {
114 my $line = <$UNICODE>;
119 combining bidi decomposition
120 decimal digit numeric
121 mirrored unicode10 comment
123 )} = split(/;/, $line, -1);
124 if ($prop{code} eq $hexk) {
125 $prop{block} = charblock($code);
126 $prop{script} = charscript($code);
134 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
135 my ($table, $lo, $hi, $code) = @_;
139 my $mid = int(($lo+$hi) / 2);
141 if ($table->[$mid]->[0] < $code) {
142 if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
143 return $table->[$mid]->[2];
145 _search($table, $mid + 1, $hi, $code);
147 } elsif ($table->[$mid]->[0] > $code) {
148 _search($table, $lo, $mid - 1, $code);
150 return $table->[$mid]->[2];
156 use Unicode::UCD 'charblock';
158 my $charblock = charblock(0x41);
160 charblock() returns the block the character belongs to, e.g.
161 C<Basic Latin>. Note that not all the character positions within all
164 The name is the same name that is used in the C<\p{In...}> construct,
165 for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
166 away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
167 instead of C<Latin Extended-A>).
177 if (openunicode(\$BLOCKS, "Blocks.pl")) {
179 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
180 push @BLOCKS, [ hex($1), hex($2), $3 ];
187 _search(\@BLOCKS, 0, $#BLOCKS, $code);
192 use Unicode::UCD 'charscript';
194 my $charscript = charscript(0x41);
196 charscript() returns the script the character belongs to, e.g.
197 C<Latin>, C<Greek>, C<Han>. Note that not all the character positions
198 within all scripts are defined.
200 The difference between a character block and a script is that script
201 names are closer to the linguistic notion of a set of characters,
202 while block is more of an artifact of the Unicode character numbering.
203 For example the Latin B<script> is spread over several B<blocks>.
205 Note also that the script names are all in uppercase, e.g. C<HEBREW>,
206 while the block names are Capitalized and with intermixed spaces,
207 e.g. C<Yi Syllables>.
209 Unfortunately, currently (Perl 5.8.0) there is no regular expression
210 notation for matching scripts as there is for blocks (C<\p{In...}>).
220 if (openunicode(\$SCRIPTS, "Scripts.txt")) {
222 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
223 push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
227 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
231 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
234 =head1 IMPLEMENTATION NOTE
236 The first use of charinfo() opens a read-only filehandle to the Unicode
237 Character Database. The filehandle is kept open for further queries.