Commit | Line | Data |
423cee85 |
1 | package charnames; |
b177ca84 |
2 | use strict; |
3 | use warnings; |
4 | use Carp; |
5 | our $VERSION = '1.01'; |
b75c8c73 |
6 | |
d5448623 |
7 | use bytes (); # for $bytes::hint_bits |
8 | $charnames::hint_bits = 0x20000; |
423cee85 |
9 | |
52ea3e69 |
10 | my %alias1 = ( |
11 | # Icky 3.2 names with parentheses. |
12 | 'LINE FEED' => 'LINE FEED (LF)', |
13 | 'FORM FEED' => 'FORM FEED (FF)', |
14 | 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)', |
15 | 'NEXT LINE' => 'NEXT LINE (NEL)', |
16 | # Convenience. |
17 | 'LF' => 'LINE FEED (LF)', |
18 | 'FF' => 'FORM FEED (FF)', |
19 | 'CR' => 'CARRIAGE RETURN (LF)', |
51e9e896 |
20 | 'NEL' => 'NEXT LINE (NEL)', |
52ea3e69 |
21 | 'BOM' => 'BYTE ORDER MARK', |
22 | ); |
23 | |
24 | my %alias2 = ( |
25 | # Pre-3.2 compatibility (only for the first 256 characters). |
26 | 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION', |
27 | 'VERTICAL TABULATION' => 'LINE TABULATION', |
28 | 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR', |
29 | 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE', |
30 | 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO', |
31 | 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE', |
32 | 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD', |
33 | 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD', |
34 | ); |
35 | |
423cee85 |
36 | my $txt; |
37 | |
38 | # This is not optimized in any way yet |
b177ca84 |
39 | sub charnames |
40 | { |
41 | my $name = shift; |
42 | |
52ea3e69 |
43 | if (exists $alias1{$name}) { |
44 | $name = $alias1{$name}; |
45 | } |
46 | if (exists $alias2{$name}) { |
47 | require warnings; |
48 | warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead}); |
49 | $name = $alias2{$name}; |
50 | } |
b177ca84 |
51 | |
52ea3e69 |
52 | my $ord; |
423cee85 |
53 | my @off; |
52ea3e69 |
54 | my $fname; |
55 | |
56 | if ($name eq "BYTE ORDER MARK") { |
57 | $fname = $name; |
d7d589a8 |
58 | $ord = 0xFEFF; |
52ea3e69 |
59 | } else { |
60 | ## Suck in the code/name list as a big string. |
61 | ## Lines look like: |
62 | ## "0052\t\tLATIN CAPITAL LETTER R\n" |
63 | $txt = do "unicore/Name.pl" unless $txt; |
b177ca84 |
64 | |
52ea3e69 |
65 | ## @off will hold the index into the code/name string of the start and |
66 | ## end of the name as we find it. |
67 | |
68 | ## If :full, look for the the name exactly |
69 | if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { |
70 | @off = ($-[0], $+[0]); |
423cee85 |
71 | } |
b177ca84 |
72 | |
52ea3e69 |
73 | ## If we didn't get above, and :short allowed, look for the short name. |
74 | ## The short name is like "greek:Sigma" |
75 | unless (@off) { |
76 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { |
77 | my ($script, $cname) = ($1,$2); |
78 | my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); |
79 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { |
80 | @off = ($-[0], $+[0]); |
81 | } |
82 | } |
83 | } |
84 | |
85 | ## If we still don't have it, check for the name among the loaded |
86 | ## scripts. |
87 | if (not @off) |
b177ca84 |
88 | { |
52ea3e69 |
89 | my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); |
90 | for my $script ( @{$^H{charnames_scripts}} ) |
91 | { |
92 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { |
93 | @off = ($-[0], $+[0]); |
94 | last; |
95 | } |
96 | } |
b177ca84 |
97 | } |
52ea3e69 |
98 | |
99 | ## If we don't have it by now, give up. |
100 | unless (@off) { |
101 | carp "Unknown charname '$name'"; |
102 | return "\x{FFFD}"; |
103 | } |
104 | |
105 | ## |
106 | ## Now know where in the string the name starts. |
274085e3 |
107 | ## The code, in hex, is before that. |
52ea3e69 |
108 | ## |
109 | ## The code can be 4-6 characters long, so we've got to sort of |
110 | ## go look for it, just after the newline that comes before $off[0]. |
111 | ## |
112 | ## This would be much easier if unicore/Name.pl had info in |
113 | ## a name/code order, instead of code/name order. |
114 | ## |
115 | ## The +1 after the rindex() is to skip past the newline we're finding, |
116 | ## or, if the rindex() fails, to put us to an offset of zero. |
117 | ## |
118 | my $hexstart = rindex($txt, "\n", $off[0]) + 1; |
119 | |
120 | ## we know where it starts, so turn into number - |
121 | ## the ordinal for the char. |
122 | $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); |
423cee85 |
123 | } |
b177ca84 |
124 | |
d5448623 |
125 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect? |
8058d7ab |
126 | use bytes; |
d41ff1b8 |
127 | return chr $ord if $ord <= 255; |
f0175764 |
128 | my $hex = sprintf "%04x", $ord; |
52ea3e69 |
129 | if (not defined $fname) { |
130 | $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; |
131 | } |
f0175764 |
132 | croak "Character 0x$hex with name '$fname' is above 0xFF"; |
423cee85 |
133 | } |
f0175764 |
134 | |
52ea3e69 |
135 | no warnings 'utf8'; # allow even illegal characters |
bfa383d6 |
136 | return pack "U", $ord; |
423cee85 |
137 | } |
138 | |
b177ca84 |
139 | sub import |
140 | { |
141 | shift; ## ignore class name |
142 | |
143 | if (not @_) |
144 | { |
145 | carp("`use charnames' needs explicit imports list"); |
146 | } |
d5448623 |
147 | $^H |= $charnames::hint_bits; |
423cee85 |
148 | $^H{charnames} = \&charnames ; |
b177ca84 |
149 | |
150 | ## |
151 | ## fill %h keys with our @_ args. |
152 | ## |
423cee85 |
153 | my %h; |
154 | @h{@_} = (1) x @_; |
b177ca84 |
155 | |
423cee85 |
156 | $^H{charnames_full} = delete $h{':full'}; |
157 | $^H{charnames_short} = delete $h{':short'}; |
158 | $^H{charnames_scripts} = [map uc, keys %h]; |
b177ca84 |
159 | |
160 | ## |
161 | ## If utf8? warnings are enabled, and some scripts were given, |
162 | ## see if at least we can find one letter of each script. |
163 | ## |
164 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) |
165 | { |
166 | $txt = do "unicore/Name.pl" unless $txt; |
167 | |
168 | for my $script (@{$^H{charnames_scripts}}) |
169 | { |
170 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { |
171 | warnings::warn('utf8', "No such script: '$script'"); |
172 | } |
173 | } |
bd62941a |
174 | } |
423cee85 |
175 | } |
176 | |
f0175764 |
177 | require Unicode::UCD; # for Unicode::UCD::_getcode() |
178 | |
4e2cda5d |
179 | my %viacode; |
180 | |
b177ca84 |
181 | sub viacode |
182 | { |
183 | if (@_ != 1) { |
274085e3 |
184 | carp "charnames::viacode() expects one argument"; |
b177ca84 |
185 | return () |
186 | } |
f0175764 |
187 | |
b177ca84 |
188 | my $arg = shift; |
f0175764 |
189 | my $code = Unicode::UCD::_getcode($arg); |
b177ca84 |
190 | |
191 | my $hex; |
f0175764 |
192 | |
193 | if (defined $code) { |
b177ca84 |
194 | $hex = sprintf "%04X", $arg; |
195 | } else { |
196 | carp("unexpected arg \"$arg\" to charnames::viacode()"); |
daf0d493 |
197 | return; |
b177ca84 |
198 | } |
199 | |
f0175764 |
200 | if ($code > 0x10FFFF) { |
201 | carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)"; |
202 | return "\x{FFFD}"; |
203 | } |
204 | |
4e2cda5d |
205 | return $viacode{$hex} if exists $viacode{$hex}; |
206 | |
b177ca84 |
207 | $txt = do "unicore/Name.pl" unless $txt; |
208 | |
209 | if ($txt =~ m/^$hex\t\t(.+)/m) { |
4e2cda5d |
210 | return $viacode{$hex} = $1; |
b177ca84 |
211 | } else { |
a23c04e4 |
212 | carp "Unknown charcode '$hex'"; |
213 | return "\x{FFFD}"; |
daf0d493 |
214 | } |
215 | } |
216 | |
4e2cda5d |
217 | my %vianame; |
218 | |
daf0d493 |
219 | sub vianame |
220 | { |
221 | if (@_ != 1) { |
222 | carp "charnames::vianame() expects one name argument"; |
223 | return () |
224 | } |
225 | |
226 | my $arg = shift; |
227 | |
4e2cda5d |
228 | return $vianame{$arg} if exists $vianame{$arg}; |
229 | |
daf0d493 |
230 | $txt = do "unicore/Name.pl" unless $txt; |
231 | |
232 | if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) { |
4e2cda5d |
233 | return $vianame{$arg} = hex $1; |
daf0d493 |
234 | } else { |
235 | return; |
b177ca84 |
236 | } |
237 | } |
238 | |
423cee85 |
239 | |
240 | 1; |
241 | __END__ |
242 | |
243 | =head1 NAME |
244 | |
274085e3 |
245 | charnames - define character names for C<\N{named}> string literal escapes |
423cee85 |
246 | |
247 | =head1 SYNOPSIS |
248 | |
249 | use charnames ':full'; |
4a2d328f |
250 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
423cee85 |
251 | |
252 | use charnames ':short'; |
4a2d328f |
253 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
423cee85 |
254 | |
255 | use charnames qw(cyrillic greek); |
4a2d328f |
256 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
423cee85 |
257 | |
a23c04e4 |
258 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
259 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330" |
b177ca84 |
260 | |
423cee85 |
261 | =head1 DESCRIPTION |
262 | |
263 | Pragma C<use charnames> supports arguments C<:full>, C<:short> and |
264 | script names. If C<:full> is present, for expansion of |
4a2d328f |
265 | C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of |
423cee85 |
266 | standard Unicode names of chars. If C<:short> is present, and |
267 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up |
268 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used |
4a2d328f |
269 | with script name arguments, then for C<\N{CHARNAME}}> the name |
423cee85 |
270 | C<CHARNAME> is looked up as a letter in the given scripts (in the |
271 | specified order). |
272 | |
273 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> |
d5448623 |
274 | this pragma looks for the names |
423cee85 |
275 | |
276 | SCRIPTNAME CAPITAL LETTER CHARNAME |
277 | SCRIPTNAME SMALL LETTER CHARNAME |
278 | SCRIPTNAME LETTER CHARNAME |
279 | |
280 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, |
daf0d493 |
281 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
282 | is ignored. |
283 | |
284 | Note that C<\N{...}> is compile-time, it's a special form of string |
285 | constant used inside double-quoted strings: in other words, you cannot |
4e2cda5d |
286 | use variables inside the C<\N{...}>. If you want similar run-time |
daf0d493 |
287 | functionality, use charnames::vianame(). |
423cee85 |
288 | |
301a3cda |
289 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) |
290 | as of Unicode 3.1, there are no official Unicode names but you can |
291 | use instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). |
292 | In Unicode 3.2 some naming changes will happen since ISO 6429 has been |
293 | updated. Also note that the U+UU80, U+0081, U+0084, and U+0099 |
294 | do not have names even in ISO 6429. |
295 | |
423cee85 |
296 | =head1 CUSTOM TRANSLATORS |
297 | |
d5448623 |
298 | The mechanism of translation of C<\N{...}> escapes is general and not |
423cee85 |
299 | hardwired into F<charnames.pm>. A module can install custom |
d5448623 |
300 | translations (inside the scope which C<use>s the module) with the |
423cee85 |
301 | following magic incantation: |
302 | |
d5448623 |
303 | use charnames (); # for $charnames::hint_bits |
304 | sub import { |
305 | shift; |
306 | $^H |= $charnames::hint_bits; |
307 | $^H{charnames} = \&translator; |
308 | } |
423cee85 |
309 | |
310 | Here translator() is a subroutine which takes C<CHARNAME> as an |
311 | argument, and returns text to insert into the string instead of the |
4a2d328f |
312 | C<\N{CHARNAME}> escape. Since the text to insert should be different |
d5448623 |
313 | in C<bytes> mode and out of it, the function should check the current |
314 | state of C<bytes>-flag as in: |
315 | |
316 | use bytes (); # for $bytes::hint_bits |
317 | sub translator { |
318 | if ($^H & $bytes::hint_bits) { |
319 | return bytes_translator(@_); |
320 | } |
321 | else { |
322 | return utf8_translator(@_); |
323 | } |
423cee85 |
324 | } |
423cee85 |
325 | |
b177ca84 |
326 | =head1 charnames::viacode(code) |
327 | |
328 | Returns the full name of the character indicated by the numeric code. |
329 | The example |
330 | |
331 | print charnames::viacode(0x2722); |
332 | |
333 | prints "FOUR TEARDROP-SPOKED ASTERISK". |
334 | |
daf0d493 |
335 | Returns undef if no name is known for the code. |
336 | |
274085e3 |
337 | This works only for the standard names, and does not yet apply |
daf0d493 |
338 | to custom translators. |
339 | |
274085e3 |
340 | Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK |
341 | SPACE", not "BYTE ORDER MARK". |
342 | |
daf0d493 |
343 | =head1 charnames::vianame(code) |
344 | |
345 | Returns the code point indicated by the name. |
346 | The example |
347 | |
348 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); |
349 | |
350 | prints "2722". |
351 | |
352 | Returns undef if no name is known for the name. |
b177ca84 |
353 | |
354 | This works only for the standard names, and does not yet aply |
355 | to custom translators. |
356 | |
52ea3e69 |
357 | =head1 ALIASES |
358 | |
359 | A few aliases have been defined for convenience: instead of having |
360 | to use the official names |
361 | |
362 | LINE FEED (LF) |
363 | FORM FEED (FF) |
364 | CARRIAGE RETURN (CR) |
365 | NEXT LINE (NEL) |
366 | |
367 | (yes, with parentheses) one can use |
368 | |
369 | LINE FEED |
370 | FORM FEED |
371 | CARRIAGE RETURN |
372 | NEXT LINE |
373 | LF |
374 | FF |
375 | CR |
376 | NEL |
377 | |
378 | One can also use |
379 | |
380 | BYTE ORDER MARK |
381 | BOM |
382 | |
383 | though that is of course not a legal character as such. |
384 | |
385 | For backward compatibility one can use the old names for |
386 | certain C0 and C1 controls |
387 | |
388 | old new |
389 | |
390 | HORIZONTAL TABULATION CHARACTER TABULATION |
391 | VERTICAL TABULATION LINE TABULATION |
392 | FILE SEPARATOR INFORMATION SEPARATOR FOUR |
393 | GROUP SEPARATOR INFORMATION SEPARATOR THREE |
394 | RECORD SEPARATOR INFORMATION SEPARATOR TWO |
395 | UNIT SEPARATOR INFORMATION SEPARATOR ONE |
396 | PARTIAL LINE DOWN PARTIAL LINE FORWARD |
397 | PARTIAL LINE UP PARTIAL LINE BACKWARD |
398 | |
399 | but the old names in addition to giving the character |
400 | will also give a warning about being deprecated. |
401 | |
f0175764 |
402 | =head1 ILLEGAL CHARACTERS |
403 | |
a23c04e4 |
404 | If you ask for a character that does not exist, a warning is given |
405 | and the Unicode I<replacement character> "\x{FFFD}" is returned. |
f0175764 |
406 | |
423cee85 |
407 | =head1 BUGS |
408 | |
409 | Since evaluation of the translation function happens in a middle of |
410 | compilation (of a string literal), the translation function should not |
411 | do any C<eval>s or C<require>s. This restriction should be lifted in |
412 | a future version of Perl. |
413 | |
414 | =cut |