Commit | Line | Data |
423cee85 |
1 | package charnames; |
b177ca84 |
2 | use strict; |
3 | use warnings; |
51cf30b6 |
4 | use File::Spec; |
529f008e |
5 | our $VERSION = '1.08'; |
b75c8c73 |
6 | |
d5448623 |
7 | use bytes (); # for $bytes::hint_bits |
423cee85 |
8 | |
52ea3e69 |
9 | my %alias1 = ( |
10 | # Icky 3.2 names with parentheses. |
11 | 'LINE FEED' => 'LINE FEED (LF)', |
12 | 'FORM FEED' => 'FORM FEED (FF)', |
13 | 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)', |
14 | 'NEXT LINE' => 'NEXT LINE (NEL)', |
15 | # Convenience. |
16 | 'LF' => 'LINE FEED (LF)', |
17 | 'FF' => 'FORM FEED (FF)', |
eb380778 |
18 | 'CR' => 'CARRIAGE RETURN (CR)', |
51e9e896 |
19 | 'NEL' => 'NEXT LINE (NEL)', |
24b5d5cc |
20 | # More convenience. For futher convencience, |
21 | # it is suggested some way using using the NamesList |
22 | # aliases is implemented. |
23 | 'ZWNJ' => 'ZERO WIDTH NON-JOINER', |
24 | 'ZWJ' => 'ZERO WIDTH JOINER', |
52ea3e69 |
25 | 'BOM' => 'BYTE ORDER MARK', |
26 | ); |
27 | |
28 | my %alias2 = ( |
29 | # Pre-3.2 compatibility (only for the first 256 characters). |
30 | 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION', |
31 | 'VERTICAL TABULATION' => 'LINE TABULATION', |
32 | 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR', |
33 | 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE', |
34 | 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO', |
35 | 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE', |
36 | 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD', |
37 | 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD', |
38 | ); |
39 | |
35c0985d |
40 | my %alias3 = ( |
41 | # User defined aliasses. Even more convenient :) |
42 | ); |
423cee85 |
43 | my $txt; |
44 | |
8878f897 |
45 | sub croak |
46 | { |
47 | require Carp; goto &Carp::croak; |
48 | } # croak |
49 | |
50 | sub carp |
51 | { |
52 | require Carp; goto &Carp::carp; |
53 | } # carp |
54 | |
35c0985d |
55 | sub alias (@) |
56 | { |
57 | @_ or return %alias3; |
58 | my $alias = ref $_[0] ? $_[0] : { @_ }; |
59 | @alias3{keys %$alias} = values %$alias; |
60 | } # alias |
61 | |
62 | sub alias_file ($) |
63 | { |
51cf30b6 |
64 | my ($arg, $file) = @_; |
65 | if (-f $arg && File::Spec->file_name_is_absolute ($arg)) { |
66 | $file = $arg; |
67 | } |
68 | elsif ($arg =~ m/^\w+$/) { |
69 | $file = "unicore/${arg}_alias.pl"; |
70 | } |
71 | else { |
72 | croak "Charnames alias files can only have identifier characters"; |
73 | } |
35c0985d |
74 | if (my @alias = do $file) { |
51cf30b6 |
75 | @alias == 1 && !defined $alias[0] and |
76 | croak "$file cannot be used as alias file for charnames"; |
77 | @alias % 2 and |
78 | croak "$file did not return a (valid) list of alias pairs"; |
35c0985d |
79 | alias (@alias); |
80 | return (1); |
81 | } |
82 | 0; |
83 | } # alias_file |
84 | |
423cee85 |
85 | # This is not optimized in any way yet |
b177ca84 |
86 | sub charnames |
87 | { |
88 | my $name = shift; |
89 | |
52ea3e69 |
90 | if (exists $alias1{$name}) { |
35c0985d |
91 | $name = $alias1{$name}; |
52ea3e69 |
92 | } |
35c0985d |
93 | elsif (exists $alias2{$name}) { |
94 | require warnings; |
95 | warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead}); |
96 | $name = $alias2{$name}; |
97 | } |
98 | elsif (exists $alias3{$name}) { |
99 | $name = $alias3{$name}; |
52ea3e69 |
100 | } |
b177ca84 |
101 | |
52ea3e69 |
102 | my $ord; |
423cee85 |
103 | my @off; |
52ea3e69 |
104 | my $fname; |
105 | |
106 | if ($name eq "BYTE ORDER MARK") { |
35c0985d |
107 | $fname = $name; |
108 | $ord = 0xFEFF; |
52ea3e69 |
109 | } else { |
35c0985d |
110 | ## Suck in the code/name list as a big string. |
111 | ## Lines look like: |
112 | ## "0052\t\tLATIN CAPITAL LETTER R\n" |
113 | $txt = do "unicore/Name.pl" unless $txt; |
114 | |
115 | ## @off will hold the index into the code/name string of the start and |
116 | ## end of the name as we find it. |
117 | |
a6d05634 |
118 | ## If :full, look for the name exactly |
35c0985d |
119 | if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { |
120 | @off = ($-[0], $+[0]); |
121 | } |
122 | |
123 | ## If we didn't get above, and :short allowed, look for the short name. |
124 | ## The short name is like "greek:Sigma" |
125 | unless (@off) { |
126 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { |
127 | my ($script, $cname) = ($1, $2); |
128 | my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; |
129 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { |
52ea3e69 |
130 | @off = ($-[0], $+[0]); |
35c0985d |
131 | } |
423cee85 |
132 | } |
35c0985d |
133 | } |
b177ca84 |
134 | |
35c0985d |
135 | ## If we still don't have it, check for the name among the loaded |
136 | ## scripts. |
137 | if (not @off) { |
138 | my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; |
139 | for my $script (@{$^H{charnames_scripts}}) { |
140 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { |
141 | @off = ($-[0], $+[0]); |
142 | last; |
143 | } |
52ea3e69 |
144 | } |
35c0985d |
145 | } |
146 | |
147 | ## If we don't have it by now, give up. |
148 | unless (@off) { |
149 | carp "Unknown charname '$name'"; |
150 | return "\x{FFFD}"; |
151 | } |
152 | |
153 | ## |
154 | ## Now know where in the string the name starts. |
155 | ## The code, in hex, is before that. |
156 | ## |
157 | ## The code can be 4-6 characters long, so we've got to sort of |
158 | ## go look for it, just after the newline that comes before $off[0]. |
159 | ## |
160 | ## This would be much easier if unicore/Name.pl had info in |
161 | ## a name/code order, instead of code/name order. |
162 | ## |
163 | ## The +1 after the rindex() is to skip past the newline we're finding, |
164 | ## or, if the rindex() fails, to put us to an offset of zero. |
165 | ## |
166 | my $hexstart = rindex($txt, "\n", $off[0]) + 1; |
167 | |
168 | ## we know where it starts, so turn into number - |
169 | ## the ordinal for the char. |
075d4edd |
170 | $ord = CORE::hex substr($txt, $hexstart, $off[0] - $hexstart); |
423cee85 |
171 | } |
b177ca84 |
172 | |
d5448623 |
173 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect? |
8058d7ab |
174 | use bytes; |
d41ff1b8 |
175 | return chr $ord if $ord <= 255; |
f0175764 |
176 | my $hex = sprintf "%04x", $ord; |
52ea3e69 |
177 | if (not defined $fname) { |
35c0985d |
178 | $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; |
52ea3e69 |
179 | } |
f0175764 |
180 | croak "Character 0x$hex with name '$fname' is above 0xFF"; |
423cee85 |
181 | } |
f0175764 |
182 | |
52ea3e69 |
183 | no warnings 'utf8'; # allow even illegal characters |
bfa383d6 |
184 | return pack "U", $ord; |
35c0985d |
185 | } # charnames |
423cee85 |
186 | |
b177ca84 |
187 | sub import |
188 | { |
189 | shift; ## ignore class name |
190 | |
35c0985d |
191 | if (not @_) { |
192 | carp("`use charnames' needs explicit imports list"); |
b177ca84 |
193 | } |
423cee85 |
194 | $^H{charnames} = \&charnames ; |
b177ca84 |
195 | |
196 | ## |
197 | ## fill %h keys with our @_ args. |
198 | ## |
35c0985d |
199 | my ($promote, %h, @args) = (0); |
e5c3f898 |
200 | while (my $arg = shift) { |
201 | if ($arg eq ":alias") { |
51cf30b6 |
202 | @_ or |
203 | croak ":alias needs an argument in charnames"; |
35c0985d |
204 | my $alias = shift; |
205 | if (ref $alias) { |
206 | ref $alias eq "HASH" or |
51cf30b6 |
207 | croak "Only HASH reference supported as argument to :alias"; |
35c0985d |
208 | alias ($alias); |
209 | next; |
210 | } |
51cf30b6 |
211 | if ($alias =~ m{:(\w+)$}) { |
212 | $1 eq "full" || $1 eq "short" and |
213 | croak ":alias cannot use existing pragma :$1 (reversed order?)"; |
214 | alias_file ($1) and $promote = 1; |
215 | next; |
35c0985d |
216 | } |
51cf30b6 |
217 | alias_file ($alias); |
218 | next; |
219 | } |
e5c3f898 |
220 | if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) { |
221 | warn "unsupported special '$arg' in charnames"; |
51cf30b6 |
222 | next; |
35c0985d |
223 | } |
e5c3f898 |
224 | push @args, $arg; |
35c0985d |
225 | } |
226 | @args == 0 && $promote and @args = (":full"); |
227 | @h{@args} = (1) x @args; |
b177ca84 |
228 | |
423cee85 |
229 | $^H{charnames_full} = delete $h{':full'}; |
230 | $^H{charnames_short} = delete $h{':short'}; |
231 | $^H{charnames_scripts} = [map uc, keys %h]; |
b177ca84 |
232 | |
233 | ## |
234 | ## If utf8? warnings are enabled, and some scripts were given, |
235 | ## see if at least we can find one letter of each script. |
236 | ## |
35c0985d |
237 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) { |
238 | $txt = do "unicore/Name.pl" unless $txt; |
239 | |
240 | for my $script (@{$^H{charnames_scripts}}) { |
241 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { |
242 | warnings::warn('utf8', "No such script: '$script'"); |
b177ca84 |
243 | } |
35c0985d |
244 | } |
bd62941a |
245 | } |
35c0985d |
246 | } # import |
423cee85 |
247 | |
4e2cda5d |
248 | my %viacode; |
249 | |
b177ca84 |
250 | sub viacode |
251 | { |
35c0985d |
252 | if (@_ != 1) { |
253 | carp "charnames::viacode() expects one argument"; |
bd5c3bd9 |
254 | return; |
35c0985d |
255 | } |
f0175764 |
256 | |
35c0985d |
257 | my $arg = shift; |
b177ca84 |
258 | |
e10d7780 |
259 | # this is derived from Unicode::UCD, where it is nearly the same as the |
260 | # function _getcode(), but it makes sure that even a hex argument has the |
261 | # proper number of leading zeros, which is critical in matching against $txt |
262 | # below |
35c0985d |
263 | my $hex; |
bd5c3bd9 |
264 | if ($arg =~ /^[1-9]\d*$/) { |
35c0985d |
265 | $hex = sprintf "%04X", $arg; |
bd5c3bd9 |
266 | } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { |
e10d7780 |
267 | # Below is the line that differs from the _getcode() source |
268 | $hex = sprintf "%04X", hex $arg; |
35c0985d |
269 | } else { |
270 | carp("unexpected arg \"$arg\" to charnames::viacode()"); |
271 | return; |
272 | } |
b177ca84 |
273 | |
bd5c3bd9 |
274 | # checking the length first is slightly faster |
275 | if (length($hex) > 5 && hex($hex) > 0x10FFFF) { |
9b5be9b5 |
276 | carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)"; |
35c0985d |
277 | return; |
278 | } |
f0175764 |
279 | |
35c0985d |
280 | return $viacode{$hex} if exists $viacode{$hex}; |
4e2cda5d |
281 | |
35c0985d |
282 | $txt = do "unicore/Name.pl" unless $txt; |
b177ca84 |
283 | |
bd5c3bd9 |
284 | return unless $txt =~ m/^$hex\t\t(.+)/m; |
285 | |
286 | $viacode{$hex} = $1; |
35c0985d |
287 | } # viacode |
daf0d493 |
288 | |
4e2cda5d |
289 | my %vianame; |
290 | |
daf0d493 |
291 | sub vianame |
292 | { |
35c0985d |
293 | if (@_ != 1) { |
294 | carp "charnames::vianame() expects one name argument"; |
295 | return () |
296 | } |
daf0d493 |
297 | |
35c0985d |
298 | my $arg = shift; |
daf0d493 |
299 | |
075d4edd |
300 | return chr CORE::hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; |
dbc0d4f2 |
301 | |
35c0985d |
302 | return $vianame{$arg} if exists $vianame{$arg}; |
4e2cda5d |
303 | |
35c0985d |
304 | $txt = do "unicore/Name.pl" unless $txt; |
daf0d493 |
305 | |
35c0985d |
306 | my $pos = index $txt, "\t\t$arg\n"; |
859172fe |
307 | if (0 <= $pos) { |
35c0985d |
308 | my $posLF = rindex $txt, "\n", $pos; |
309 | (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d; |
075d4edd |
310 | return $vianame{$arg} = CORE::hex $code; |
35c0985d |
311 | |
859172fe |
312 | # If $pos is at the 1st line, $posLF must be -1 (not found); |
313 | # then $posLF + 1 equals to 0 (at the beginning of $txt). |
35c0985d |
314 | # Otherwise $posLF is the position of "\n"; |
315 | # then $posLF + 1 must be the position of the next to "\n" |
316 | # (the beginning of the line). |
317 | # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t", |
318 | # "10300\t", "100000", etc. So we can get the code via removing TAB. |
319 | } else { |
320 | return; |
321 | } |
322 | } # vianame |
b177ca84 |
323 | |
423cee85 |
324 | |
325 | 1; |
326 | __END__ |
327 | |
328 | =head1 NAME |
329 | |
274085e3 |
330 | charnames - define character names for C<\N{named}> string literal escapes |
423cee85 |
331 | |
332 | =head1 SYNOPSIS |
333 | |
334 | use charnames ':full'; |
4a2d328f |
335 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
423cee85 |
336 | |
337 | use charnames ':short'; |
4a2d328f |
338 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
423cee85 |
339 | |
340 | use charnames qw(cyrillic greek); |
4a2d328f |
341 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
423cee85 |
342 | |
35c0985d |
343 | use charnames ":full", ":alias" => { |
344 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
76ae0c45 |
345 | }; |
35c0985d |
346 | print "\N{e_ACUTE} is a small letter e with an acute.\n"; |
347 | |
76ae0c45 |
348 | use charnames (); |
a23c04e4 |
349 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
350 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330" |
b177ca84 |
351 | |
423cee85 |
352 | =head1 DESCRIPTION |
353 | |
35c0985d |
354 | Pragma C<use charnames> supports arguments C<:full>, C<:short>, script |
355 | names and customized aliases. If C<:full> is present, for expansion of |
76ae0c45 |
356 | C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of |
357 | standard Unicode character names. If C<:short> is present, and |
423cee85 |
358 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up |
359 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used |
a191c821 |
360 | with script name arguments, then for C<\N{CHARNAME}> the name |
423cee85 |
361 | C<CHARNAME> is looked up as a letter in the given scripts (in the |
35c0985d |
362 | specified order). Customized aliases are explained in L</CUSTOM ALIASES>. |
423cee85 |
363 | |
364 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> |
d5448623 |
365 | this pragma looks for the names |
423cee85 |
366 | |
367 | SCRIPTNAME CAPITAL LETTER CHARNAME |
368 | SCRIPTNAME SMALL LETTER CHARNAME |
369 | SCRIPTNAME LETTER CHARNAME |
370 | |
371 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, |
daf0d493 |
372 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
373 | is ignored. |
374 | |
375 | Note that C<\N{...}> is compile-time, it's a special form of string |
376 | constant used inside double-quoted strings: in other words, you cannot |
4e2cda5d |
377 | use variables inside the C<\N{...}>. If you want similar run-time |
daf0d493 |
378 | functionality, use charnames::vianame(). |
423cee85 |
379 | |
301a3cda |
380 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) |
dbc0d4f2 |
381 | as of Unicode 3.1, there are no official Unicode names but you can use |
382 | instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In |
383 | Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429 |
384 | has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081, |
385 | U+0084, and U+0099 do not have names even in ISO 6429. |
386 | |
387 | Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" |
388 | is the Unicode smiley face, or "\N{WHITE SMILING FACE}". |
301a3cda |
389 | |
5ffe0e96 |
390 | =head1 ALIASES |
423cee85 |
391 | |
5ffe0e96 |
392 | A few aliases have been defined for convenience: instead of having |
393 | to use the official names |
423cee85 |
394 | |
5ffe0e96 |
395 | LINE FEED (LF) |
396 | FORM FEED (FF) |
397 | CARRIAGE RETURN (CR) |
398 | NEXT LINE (NEL) |
423cee85 |
399 | |
5ffe0e96 |
400 | (yes, with parentheses) one can use |
d5448623 |
401 | |
5ffe0e96 |
402 | LINE FEED |
403 | FORM FEED |
404 | CARRIAGE RETURN |
405 | NEXT LINE |
406 | LF |
407 | FF |
408 | CR |
409 | NEL |
410 | |
411 | One can also use |
412 | |
413 | BYTE ORDER MARK |
414 | BOM |
415 | |
416 | and |
417 | |
418 | ZWNJ |
419 | ZWJ |
420 | |
421 | for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. |
422 | |
423 | For backward compatibility one can use the old names for |
424 | certain C0 and C1 controls |
425 | |
426 | old new |
427 | |
428 | HORIZONTAL TABULATION CHARACTER TABULATION |
429 | VERTICAL TABULATION LINE TABULATION |
430 | FILE SEPARATOR INFORMATION SEPARATOR FOUR |
431 | GROUP SEPARATOR INFORMATION SEPARATOR THREE |
432 | RECORD SEPARATOR INFORMATION SEPARATOR TWO |
433 | UNIT SEPARATOR INFORMATION SEPARATOR ONE |
434 | PARTIAL LINE DOWN PARTIAL LINE FORWARD |
435 | PARTIAL LINE UP PARTIAL LINE BACKWARD |
436 | |
437 | but the old names in addition to giving the character |
438 | will also give a warning about being deprecated. |
423cee85 |
439 | |
35c0985d |
440 | =head1 CUSTOM ALIASES |
441 | |
442 | This version of charnames supports three mechanisms of adding local |
55bc7d3c |
443 | or customized aliases to standard Unicode naming conventions (:full). |
444 | |
445 | Note that an alias should not be something that is a legal curly |
446 | brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example |
447 | C<\N{123}> means to match 123 non-newline characters, and is not treated as an |
448 | alias. Aliases are discouraged from beginning with anything other than an |
449 | alphabetic character and from containing anything other than alphanumerics, |
bee80e93 |
450 | spaces, dashes, colons, parentheses, and underscores. Currently they must be |
451 | ASCII. |
35c0985d |
452 | |
453 | =head2 Anonymous hashes |
454 | |
455 | use charnames ":full", ":alias" => { |
456 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
457 | }; |
458 | my $str = "\N{e_ACUTE}"; |
459 | |
460 | =head2 Alias file |
461 | |
462 | use charnames ":full", ":alias" => "pro"; |
463 | |
464 | will try to read "unicore/pro_alias.pl" from the @INC path. This |
465 | file should return a list in plain perl: |
466 | |
467 | ( |
468 | A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE", |
469 | A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", |
470 | A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS", |
471 | A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE", |
472 | A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE", |
473 | A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE", |
474 | A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON", |
475 | ); |
476 | |
477 | =head2 Alias shortcut |
478 | |
479 | use charnames ":alias" => ":pro"; |
480 | |
481 | works exactly the same as the alias pairs, only this time, |
482 | ":full" is inserted automatically as first argument (if no |
483 | other argument is given). |
484 | |
b177ca84 |
485 | =head1 charnames::viacode(code) |
486 | |
487 | Returns the full name of the character indicated by the numeric code. |
488 | The example |
489 | |
490 | print charnames::viacode(0x2722); |
491 | |
492 | prints "FOUR TEARDROP-SPOKED ASTERISK". |
493 | |
daf0d493 |
494 | Returns undef if no name is known for the code. |
495 | |
35c0985d |
496 | This works only for the standard names, and does not yet apply |
daf0d493 |
497 | to custom translators. |
498 | |
274085e3 |
499 | Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK |
500 | SPACE", not "BYTE ORDER MARK". |
501 | |
eb6a2339 |
502 | =head1 charnames::vianame(name) |
daf0d493 |
503 | |
504 | Returns the code point indicated by the name. |
505 | The example |
506 | |
507 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); |
508 | |
509 | prints "2722". |
510 | |
eb6a2339 |
511 | Returns undef if the name is unknown. |
b177ca84 |
512 | |
35c0985d |
513 | This works only for the standard names, and does not yet apply |
b177ca84 |
514 | to custom translators. |
515 | |
5ffe0e96 |
516 | =head1 CUSTOM TRANSLATORS |
52ea3e69 |
517 | |
5ffe0e96 |
518 | The mechanism of translation of C<\N{...}> escapes is general and not |
519 | hardwired into F<charnames.pm>. A module can install custom |
520 | translations (inside the scope which C<use>s the module) with the |
521 | following magic incantation: |
52ea3e69 |
522 | |
5ffe0e96 |
523 | sub import { |
524 | shift; |
525 | $^H{charnames} = \&translator; |
526 | } |
52ea3e69 |
527 | |
5ffe0e96 |
528 | Here translator() is a subroutine which takes C<CHARNAME> as an |
529 | argument, and returns text to insert into the string instead of the |
530 | C<\N{CHARNAME}> escape. Since the text to insert should be different |
531 | in C<bytes> mode and out of it, the function should check the current |
532 | state of C<bytes>-flag as in: |
52ea3e69 |
533 | |
5ffe0e96 |
534 | use bytes (); # for $bytes::hint_bits |
535 | sub translator { |
536 | if ($^H & $bytes::hint_bits) { |
537 | return bytes_translator(@_); |
538 | } |
539 | else { |
540 | return utf8_translator(@_); |
541 | } |
542 | } |
52ea3e69 |
543 | |
55bc7d3c |
544 | See L</CUSTOM ALIASES> above for restrictions on C<CHARNAME>. |
545 | |
f0175764 |
546 | =head1 ILLEGAL CHARACTERS |
547 | |
55bc7d3c |
548 | If you ask by name for a character that does not exist, a warning is given and |
549 | the Unicode I<replacement character> "\x{FFFD}" is returned. |
00d835f2 |
550 | |
55bc7d3c |
551 | If you ask by code for a character that is unassigned, no warning is |
00d835f2 |
552 | given and C<undef> is returned. (Though if you ask for a code point |
55bc7d3c |
553 | past U+10FFFF you do get a warning.) See L</BUGS> below. |
f0175764 |
554 | |
423cee85 |
555 | =head1 BUGS |
556 | |
55bc7d3c |
557 | viacode should return an empty string for unassigned in-range Unicode code |
558 | points, as that is their correct current name. |
559 | |
560 | viacode(0) doesn't return C<NULL>, but C<undef> |
561 | |
562 | vianame returns a chr if the input name is of the form C<U+...>, and an ord |
563 | otherwise. It is planned to change this to always return an ord. |
564 | |
565 | None of the functions work on almost all the Hangul syllable and CJK Unicode |
566 | characters that have their code points as part of their names. |
567 | |
bee80e93 |
568 | Names must be ASCII characters only. |
569 | |
fe749c9a |
570 | Unicode standard named sequences are not recognized, such as |
571 | C<LATIN CAPITAL LETTER A WITH MACRON AND GRAVE> |
572 | (which should mean C<LATIN CAPITAL LETTER A WITH MACRON> with an additional |
573 | C<COMBINING GRAVE ACCENT>). |
574 | |
55bc7d3c |
575 | Since evaluation of the translation function happens in the middle of |
423cee85 |
576 | compilation (of a string literal), the translation function should not |
577 | do any C<eval>s or C<require>s. This restriction should be lifted in |
578 | a future version of Perl. |
579 | |
580 | =cut |