Commit | Line | Data |
423cee85 |
1 | package charnames; |
b177ca84 |
2 | use strict; |
3 | use warnings; |
4 | use Carp; |
5 | our $VERSION = '1.01'; |
b75c8c73 |
6 | |
d5448623 |
7 | use bytes (); # for $bytes::hint_bits |
8 | $charnames::hint_bits = 0x20000; |
423cee85 |
9 | |
423cee85 |
10 | my $txt; |
11 | |
12 | # This is not optimized in any way yet |
b177ca84 |
13 | sub charnames |
14 | { |
15 | my $name = shift; |
16 | |
17 | ## Suck in the code/name list as a big string. |
18 | ## Lines look like: |
19 | ## "0052\t\tLATIN CAPITAL LETTER R\n" |
55d7b906 |
20 | $txt = do "unicore/Name.pl" unless $txt; |
b177ca84 |
21 | |
22 | ## @off will hold the index into the code/name string of the start and |
23 | ## end of the name as we find it. |
423cee85 |
24 | my @off; |
b177ca84 |
25 | |
26 | ## If :full, look for the the name exactly |
423cee85 |
27 | if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) { |
28 | @off = ($-[0], $+[0]); |
29 | } |
b177ca84 |
30 | |
31 | ## If we didn't get above, and :short allowed, look for the short name. |
32 | ## The short name is like "greek:Sigma" |
423cee85 |
33 | unless (@off) { |
b177ca84 |
34 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { |
423cee85 |
35 | my ($script, $cname) = ($1,$2); |
36 | my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); |
37 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) { |
38 | @off = ($-[0], $+[0]); |
39 | } |
40 | } |
41 | } |
b177ca84 |
42 | |
43 | ## If we still don't have it, check for the name among the loaded |
44 | ## scripts. |
45 | if (not @off) |
46 | { |
47 | my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); |
48 | for my $script ( @{$^H{charnames_scripts}} ) |
49 | { |
50 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) { |
51 | @off = ($-[0], $+[0]); |
52 | last; |
53 | } |
54 | } |
423cee85 |
55 | } |
b177ca84 |
56 | |
57 | ## If we don't have it by now, give up. |
f0175764 |
58 | unless (@off) { |
59 | carp "Unknown charname '$name'"; |
60 | return "\x{FFFD}"; |
61 | } |
b896c7a5 |
62 | |
b177ca84 |
63 | ## |
64 | ## Now know where in the string the name starts. |
65 | ## The code, in hex, is befor that. |
66 | ## |
67 | ## The code can be 4-6 characters long, so we've got to sort of |
68 | ## go look for it, just after the newline that comes before $off[0]. |
69 | ## |
70 | ## This would be much easier if unicore/Name.pl had info in |
71 | ## a name/code order, instead of code/name order. |
72 | ## |
73 | ## The +1 after the rindex() is to skip past the newline we're finding, |
74 | ## or, if the rindex() fails, to put us to an offset of zero. |
75 | ## |
76 | my $hexstart = rindex($txt, "\n", $off[0]) + 1; |
77 | |
78 | ## we know where it starts, so turn into number - the ordinal for the char. |
79 | my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); |
80 | |
d5448623 |
81 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect? |
8058d7ab |
82 | use bytes; |
d41ff1b8 |
83 | return chr $ord if $ord <= 255; |
f0175764 |
84 | my $hex = sprintf "%04x", $ord; |
d41ff1b8 |
85 | my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; |
f0175764 |
86 | croak "Character 0x$hex with name '$fname' is above 0xFF"; |
423cee85 |
87 | } |
f0175764 |
88 | |
bfa383d6 |
89 | return pack "U", $ord; |
423cee85 |
90 | } |
91 | |
b177ca84 |
92 | sub import |
93 | { |
94 | shift; ## ignore class name |
95 | |
96 | if (not @_) |
97 | { |
98 | carp("`use charnames' needs explicit imports list"); |
99 | } |
d5448623 |
100 | $^H |= $charnames::hint_bits; |
423cee85 |
101 | $^H{charnames} = \&charnames ; |
b177ca84 |
102 | |
103 | ## |
104 | ## fill %h keys with our @_ args. |
105 | ## |
423cee85 |
106 | my %h; |
107 | @h{@_} = (1) x @_; |
b177ca84 |
108 | |
423cee85 |
109 | $^H{charnames_full} = delete $h{':full'}; |
110 | $^H{charnames_short} = delete $h{':short'}; |
111 | $^H{charnames_scripts} = [map uc, keys %h]; |
b177ca84 |
112 | |
113 | ## |
114 | ## If utf8? warnings are enabled, and some scripts were given, |
115 | ## see if at least we can find one letter of each script. |
116 | ## |
117 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) |
118 | { |
119 | $txt = do "unicore/Name.pl" unless $txt; |
120 | |
121 | for my $script (@{$^H{charnames_scripts}}) |
122 | { |
123 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { |
124 | warnings::warn('utf8', "No such script: '$script'"); |
125 | } |
126 | } |
bd62941a |
127 | } |
423cee85 |
128 | } |
129 | |
f0175764 |
130 | require Unicode::UCD; # for Unicode::UCD::_getcode() |
131 | |
4e2cda5d |
132 | my %viacode; |
133 | |
b177ca84 |
134 | sub viacode |
135 | { |
136 | if (@_ != 1) { |
daf0d493 |
137 | carp "charnames::viacode() expects one numeric argument"; |
b177ca84 |
138 | return () |
139 | } |
f0175764 |
140 | |
b177ca84 |
141 | my $arg = shift; |
f0175764 |
142 | my $code = Unicode::UCD::_getcode($arg); |
b177ca84 |
143 | |
144 | my $hex; |
f0175764 |
145 | |
146 | if (defined $code) { |
b177ca84 |
147 | $hex = sprintf "%04X", $arg; |
148 | } else { |
149 | carp("unexpected arg \"$arg\" to charnames::viacode()"); |
daf0d493 |
150 | return; |
b177ca84 |
151 | } |
152 | |
f0175764 |
153 | if ($code > 0x10FFFF) { |
154 | carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)"; |
155 | return "\x{FFFD}"; |
156 | } |
157 | |
4e2cda5d |
158 | return $viacode{$hex} if exists $viacode{$hex}; |
159 | |
b177ca84 |
160 | $txt = do "unicore/Name.pl" unless $txt; |
161 | |
162 | if ($txt =~ m/^$hex\t\t(.+)/m) { |
4e2cda5d |
163 | return $viacode{$hex} = $1; |
b177ca84 |
164 | } else { |
daf0d493 |
165 | return; |
166 | } |
167 | } |
168 | |
4e2cda5d |
169 | my %vianame; |
170 | |
daf0d493 |
171 | sub vianame |
172 | { |
173 | if (@_ != 1) { |
174 | carp "charnames::vianame() expects one name argument"; |
175 | return () |
176 | } |
177 | |
178 | my $arg = shift; |
179 | |
4e2cda5d |
180 | return $vianame{$arg} if exists $vianame{$arg}; |
181 | |
daf0d493 |
182 | $txt = do "unicore/Name.pl" unless $txt; |
183 | |
184 | if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) { |
4e2cda5d |
185 | return $vianame{$arg} = hex $1; |
daf0d493 |
186 | } else { |
187 | return; |
b177ca84 |
188 | } |
189 | } |
190 | |
423cee85 |
191 | |
192 | 1; |
193 | __END__ |
194 | |
195 | =head1 NAME |
196 | |
b177ca84 |
197 | charnames - define character names for C<\N{named}> string literal escapes. |
423cee85 |
198 | |
199 | =head1 SYNOPSIS |
200 | |
201 | use charnames ':full'; |
4a2d328f |
202 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
423cee85 |
203 | |
204 | use charnames ':short'; |
4a2d328f |
205 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
423cee85 |
206 | |
207 | use charnames qw(cyrillic greek); |
4a2d328f |
208 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
423cee85 |
209 | |
b177ca84 |
210 | print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
daf0d493 |
211 | printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330" |
b177ca84 |
212 | |
423cee85 |
213 | =head1 DESCRIPTION |
214 | |
215 | Pragma C<use charnames> supports arguments C<:full>, C<:short> and |
216 | script names. If C<:full> is present, for expansion of |
4a2d328f |
217 | C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of |
423cee85 |
218 | standard Unicode names of chars. If C<:short> is present, and |
219 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up |
220 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used |
4a2d328f |
221 | with script name arguments, then for C<\N{CHARNAME}}> the name |
423cee85 |
222 | C<CHARNAME> is looked up as a letter in the given scripts (in the |
223 | specified order). |
224 | |
225 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> |
d5448623 |
226 | this pragma looks for the names |
423cee85 |
227 | |
228 | SCRIPTNAME CAPITAL LETTER CHARNAME |
229 | SCRIPTNAME SMALL LETTER CHARNAME |
230 | SCRIPTNAME LETTER CHARNAME |
231 | |
232 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, |
daf0d493 |
233 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
234 | is ignored. |
235 | |
236 | Note that C<\N{...}> is compile-time, it's a special form of string |
237 | constant used inside double-quoted strings: in other words, you cannot |
4e2cda5d |
238 | use variables inside the C<\N{...}>. If you want similar run-time |
daf0d493 |
239 | functionality, use charnames::vianame(). |
423cee85 |
240 | |
241 | =head1 CUSTOM TRANSLATORS |
242 | |
d5448623 |
243 | The mechanism of translation of C<\N{...}> escapes is general and not |
423cee85 |
244 | hardwired into F<charnames.pm>. A module can install custom |
d5448623 |
245 | translations (inside the scope which C<use>s the module) with the |
423cee85 |
246 | following magic incantation: |
247 | |
d5448623 |
248 | use charnames (); # for $charnames::hint_bits |
249 | sub import { |
250 | shift; |
251 | $^H |= $charnames::hint_bits; |
252 | $^H{charnames} = \&translator; |
253 | } |
423cee85 |
254 | |
255 | Here translator() is a subroutine which takes C<CHARNAME> as an |
256 | argument, and returns text to insert into the string instead of the |
4a2d328f |
257 | C<\N{CHARNAME}> escape. Since the text to insert should be different |
d5448623 |
258 | in C<bytes> mode and out of it, the function should check the current |
259 | state of C<bytes>-flag as in: |
260 | |
261 | use bytes (); # for $bytes::hint_bits |
262 | sub translator { |
263 | if ($^H & $bytes::hint_bits) { |
264 | return bytes_translator(@_); |
265 | } |
266 | else { |
267 | return utf8_translator(@_); |
268 | } |
423cee85 |
269 | } |
423cee85 |
270 | |
b177ca84 |
271 | =head1 charnames::viacode(code) |
272 | |
273 | Returns the full name of the character indicated by the numeric code. |
274 | The example |
275 | |
276 | print charnames::viacode(0x2722); |
277 | |
278 | prints "FOUR TEARDROP-SPOKED ASTERISK". |
279 | |
daf0d493 |
280 | Returns undef if no name is known for the code. |
281 | |
282 | This works only for the standard names, and does not yet aply |
283 | to custom translators. |
284 | |
285 | =head1 charnames::vianame(code) |
286 | |
287 | Returns the code point indicated by the name. |
288 | The example |
289 | |
290 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); |
291 | |
292 | prints "2722". |
293 | |
294 | Returns undef if no name is known for the name. |
b177ca84 |
295 | |
296 | This works only for the standard names, and does not yet aply |
297 | to custom translators. |
298 | |
f0175764 |
299 | =head1 ILLEGAL CHARACTERS |
300 | |
301 | If you ask for a character that does not exist, a warning is given |
302 | and the special Unicode I<replacement character> "\x{FFFD}" is returned. |
303 | |
423cee85 |
304 | =head1 BUGS |
305 | |
306 | Since evaluation of the translation function happens in a middle of |
307 | compilation (of a string literal), the translation function should not |
308 | do any C<eval>s or C<require>s. This restriction should be lifted in |
309 | a future version of Perl. |
310 | |
311 | =cut |