Commit | Line | Data |
561c79ed |
1 | package Unicode::UCD; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6f50a187 |
6 | our $VERSION = '3.1.0'; |
561c79ed |
7 | |
8 | require Exporter; |
9 | |
10 | our @ISA = qw(Exporter); |
11 | our @EXPORT_OK = qw(charinfo charblock); |
12 | |
13 | use Carp; |
14 | |
15 | =head1 NAME |
16 | |
17 | Unicode - Unicode character database |
18 | |
19 | =head1 SYNOPSIS |
20 | |
21 | use Unicode::UCD 3.1.0; |
22 | # requires that level of the Unicode character database |
23 | |
24 | use Unicode::UCD 'charinfo'; |
25 | my %charinfo = charinfo($codepoint); |
26 | |
27 | use Unicode::UCD 'charblock'; |
28 | my $charblock = charblock($codepoint); |
29 | |
30 | =head1 DESCRIPTION |
31 | |
32 | The Unicode module offers a simple interface to the Unicode Character |
33 | Database. |
34 | |
35 | =cut |
36 | |
37 | my $UNICODE; |
38 | my $BLOCKS; |
39 | |
40 | sub openunicode { |
41 | my ($rfh, @path) = @_; |
42 | my $f; |
43 | unless (defined $$rfh) { |
44 | for my $d (@INC) { |
45 | use File::Spec; |
46 | $f = File::Spec->catfile($d, "unicode", @path); |
47 | if (open($$rfh, $f)) { |
48 | last; |
49 | } else { |
50 | croak __PACKAGE__, ": open '$f' failed: $!\n"; |
51 | } |
52 | } |
53 | croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n" |
54 | unless defined $rfh; |
55 | } |
56 | return $f; |
57 | } |
58 | |
59 | =head2 charinfo |
60 | |
61 | use Unicode::UCD 'charinfo'; |
62 | |
63 | my %charinfo = charinfo(0x41); |
64 | |
65 | charinfo() returns a hash that has the following fields as defined |
66 | by the Unicode standard: |
67 | |
68 | key |
69 | |
70 | code code point with at least four hexdigits |
71 | name name of the character IN UPPER CASE |
72 | category general category of the character |
73 | combining classes used in the Canonical Ordering Algorithm |
74 | bidi bidirectional category |
75 | decomposition character decomposition mapping |
76 | decimal if decimal digit this is the integer numeric value |
77 | digit if digit this is the numeric value |
78 | numeric if numeric is the integer or rational numeric value |
79 | mirrored if mirrored in bidirectional text |
80 | unicode10 Unicode 1.0 name if existed and different |
81 | comment ISO 10646 comment field |
82 | upper uppercase equivalent mapping |
83 | lower lowercase equivalent mapping |
84 | title titlecase equivalent mapping |
85 | block block the character belongs to (used in \p{In...}) |
86 | |
87 | If no match is found, an empty hash is returned. |
88 | |
89 | The C<block> property is the same as as returned by charinfo(). |
90 | (It is not defined in the Unicode Character Database proper but |
91 | instead in an auxiliary database.) |
92 | |
93 | =cut |
94 | |
95 | sub charinfo { |
96 | my $code = shift; |
97 | my $hexk = sprintf("%04X", $code); |
98 | |
99 | openunicode(\$UNICODE, "Unicode.txt"); |
100 | if (defined $UNICODE) { |
101 | use Search::Dict; |
102 | if (look($UNICODE, "$hexk;") >= 0) { |
103 | my $line = <$UNICODE>; |
104 | chomp $line; |
105 | my %prop; |
106 | @prop{qw( |
107 | code name category |
108 | combining bidi decomposition |
109 | decimal digit numeric |
110 | mirrored unicode10 comment |
111 | upper lower title |
112 | )} = split(/;/, $line, -1); |
113 | if ($prop{code} eq $hexk) { |
114 | $prop{block} = charblock($code); |
115 | return %prop; |
116 | } |
117 | } |
118 | } |
119 | return; |
120 | } |
121 | |
122 | =head2 charbloc |
123 | |
124 | use Unicode::UCD 'charblock'; |
125 | |
126 | my $charblock = charblock(0x41); |
127 | |
128 | charblock() returns the block the character belongs to, e.g. |
129 | C<Basic Latin>. Note that not all the character positions within all |
130 | block are defined. |
131 | |
132 | The name is the same name that is used in the C<\p{In...}> construct, |
133 | for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished |
134 | away from the names for the C<\p{In...}>. |
135 | |
136 | =cut |
137 | |
138 | my @BLOCKS; |
139 | |
140 | sub _charblock { |
141 | my ($code, $lo, $hi) = @_; |
142 | |
143 | return if $lo > $hi; |
144 | |
145 | my $mid = int(($lo+$hi) / 2); |
146 | |
147 | if ($BLOCKS[$mid]->[0] < $code) { |
148 | if ($BLOCKS[$mid]->[1] >= $code) { |
149 | return $BLOCKS[$mid]->[2]; |
150 | } else { |
151 | _charblock($code, $mid + 1, $hi); |
152 | } |
153 | } elsif ($BLOCKS[$mid]->[0] > $code) { |
154 | _charblock($code, $lo, $mid - 1); |
155 | } else { |
156 | return $BLOCKS[$mid]->[2]; |
157 | } |
158 | } |
159 | |
160 | sub charblock { |
161 | my $code = shift; |
162 | |
163 | unless (@BLOCKS) { |
164 | if (openunicode(\$BLOCKS, "Blocks.pl")) { |
165 | while (<$BLOCKS>) { |
166 | if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) { |
167 | push @BLOCKS, [ hex($1), hex($2), $3 ]; |
168 | } |
169 | } |
170 | close($BLOCKS); |
171 | } |
172 | } |
173 | |
174 | _charblock($code, 0, $#BLOCKS); |
175 | } |
176 | |
177 | =head1 AUTHOR |
178 | |
179 | Jarkko Hietaniemi |
180 | |
181 | =cut |
182 | |
183 | 1; |