integrate from perlio: changes #17709, #17795, #17796, #18032
[p5sagit/p5-mst-13.2.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84 2use strict;
3use warnings;
4use Carp;
35c0985d 5our $VERSION = '1.02';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
8$charnames::hint_bits = 0x20000;
423cee85 9
52ea3e69 10my %alias1 = (
11 # Icky 3.2 names with parentheses.
12 'LINE FEED' => 'LINE FEED (LF)',
13 'FORM FEED' => 'FORM FEED (FF)',
14 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
15 'NEXT LINE' => 'NEXT LINE (NEL)',
16 # Convenience.
17 'LF' => 'LINE FEED (LF)',
18 'FF' => 'FORM FEED (FF)',
eb380778 19 'CR' => 'CARRIAGE RETURN (CR)',
51e9e896 20 'NEL' => 'NEXT LINE (NEL)',
24b5d5cc 21 # More convenience. For futher convencience,
22 # it is suggested some way using using the NamesList
23 # aliases is implemented.
24 'ZWNJ' => 'ZERO WIDTH NON-JOINER',
25 'ZWJ' => 'ZERO WIDTH JOINER',
52ea3e69 26 'BOM' => 'BYTE ORDER MARK',
27 );
28
29my %alias2 = (
30 # Pre-3.2 compatibility (only for the first 256 characters).
31 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
32 'VERTICAL TABULATION' => 'LINE TABULATION',
33 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
34 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
35 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
36 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
37 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
38 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
39 );
40
35c0985d 41my %alias3 = (
42 # User defined aliasses. Even more convenient :)
43 );
423cee85 44my $txt;
45
35c0985d 46sub alias (@)
47{
48 @_ or return %alias3;
49 my $alias = ref $_[0] ? $_[0] : { @_ };
50 @alias3{keys %$alias} = values %$alias;
51} # alias
52
53sub alias_file ($)
54{
55 my $arg = shift;
56 my $file = -f $arg ? $arg : "unicore/${arg}_alias.pl";
57 if (my @alias = do $file) {
58 alias (@alias);
59 return (1);
60 }
61 0;
62} # alias_file
63
423cee85 64# This is not optimized in any way yet
b177ca84 65sub charnames
66{
67 my $name = shift;
68
52ea3e69 69 if (exists $alias1{$name}) {
35c0985d 70 $name = $alias1{$name};
52ea3e69 71 }
35c0985d 72 elsif (exists $alias2{$name}) {
73 require warnings;
74 warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
75 $name = $alias2{$name};
76 }
77 elsif (exists $alias3{$name}) {
78 $name = $alias3{$name};
52ea3e69 79 }
b177ca84 80
52ea3e69 81 my $ord;
423cee85 82 my @off;
52ea3e69 83 my $fname;
84
85 if ($name eq "BYTE ORDER MARK") {
35c0985d 86 $fname = $name;
87 $ord = 0xFEFF;
52ea3e69 88 } else {
35c0985d 89 ## Suck in the code/name list as a big string.
90 ## Lines look like:
91 ## "0052\t\tLATIN CAPITAL LETTER R\n"
92 $txt = do "unicore/Name.pl" unless $txt;
93
94 ## @off will hold the index into the code/name string of the start and
95 ## end of the name as we find it.
96
97 ## If :full, look for the the name exactly
98 if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
99 @off = ($-[0], $+[0]);
100 }
101
102 ## If we didn't get above, and :short allowed, look for the short name.
103 ## The short name is like "greek:Sigma"
104 unless (@off) {
105 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
106 my ($script, $cname) = ($1, $2);
107 my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
108 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
52ea3e69 109 @off = ($-[0], $+[0]);
35c0985d 110 }
423cee85 111 }
35c0985d 112 }
b177ca84 113
35c0985d 114 ## If we still don't have it, check for the name among the loaded
115 ## scripts.
116 if (not @off) {
117 my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
118 for my $script (@{$^H{charnames_scripts}}) {
119 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
120 @off = ($-[0], $+[0]);
121 last;
122 }
52ea3e69 123 }
35c0985d 124 }
125
126 ## If we don't have it by now, give up.
127 unless (@off) {
128 carp "Unknown charname '$name'";
129 return "\x{FFFD}";
130 }
131
132 ##
133 ## Now know where in the string the name starts.
134 ## The code, in hex, is before that.
135 ##
136 ## The code can be 4-6 characters long, so we've got to sort of
137 ## go look for it, just after the newline that comes before $off[0].
138 ##
139 ## This would be much easier if unicore/Name.pl had info in
140 ## a name/code order, instead of code/name order.
141 ##
142 ## The +1 after the rindex() is to skip past the newline we're finding,
143 ## or, if the rindex() fails, to put us to an offset of zero.
144 ##
145 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
146
147 ## we know where it starts, so turn into number -
148 ## the ordinal for the char.
149 $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
423cee85 150 }
b177ca84 151
d5448623 152 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 153 use bytes;
d41ff1b8 154 return chr $ord if $ord <= 255;
f0175764 155 my $hex = sprintf "%04x", $ord;
52ea3e69 156 if (not defined $fname) {
35c0985d 157 $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
52ea3e69 158 }
f0175764 159 croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 160 }
f0175764 161
52ea3e69 162 no warnings 'utf8'; # allow even illegal characters
bfa383d6 163 return pack "U", $ord;
35c0985d 164} # charnames
423cee85 165
b177ca84 166sub import
167{
168 shift; ## ignore class name
169
35c0985d 170 if (not @_) {
171 carp("`use charnames' needs explicit imports list");
b177ca84 172 }
d5448623 173 $^H |= $charnames::hint_bits;
423cee85 174 $^H{charnames} = \&charnames ;
b177ca84 175
176 ##
177 ## fill %h keys with our @_ args.
178 ##
35c0985d 179 my ($promote, %h, @args) = (0);
180 while (@_ and $_ = shift) {
181 if ($_ eq ":alias" && @_) {
182 my $alias = shift;
183 if (ref $alias) {
184 ref $alias eq "HASH" or
185 die "Only HASH reference supported as argument to :alias";
186 alias ($alias);
187 next;
188 }
189 if ($alias =~ m{:(\w+)$} and $1 ne "full" && $1 ne "short") {
190 alias_file ($1) and $promote = 1, next;
191 }
192 alias_file ($alias) and next;
193 }
194 push @args, $_;
195 }
196 @args == 0 && $promote and @args = (":full");
197 @h{@args} = (1) x @args;
b177ca84 198
423cee85 199 $^H{charnames_full} = delete $h{':full'};
200 $^H{charnames_short} = delete $h{':short'};
201 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84 202
203 ##
204 ## If utf8? warnings are enabled, and some scripts were given,
205 ## see if at least we can find one letter of each script.
206 ##
35c0985d 207 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
208 $txt = do "unicore/Name.pl" unless $txt;
209
210 for my $script (@{$^H{charnames_scripts}}) {
211 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
212 warnings::warn('utf8', "No such script: '$script'");
b177ca84 213 }
35c0985d 214 }
bd62941a 215 }
35c0985d 216} # import
423cee85 217
f0175764 218require Unicode::UCD; # for Unicode::UCD::_getcode()
219
4e2cda5d 220my %viacode;
221
b177ca84 222sub viacode
223{
35c0985d 224 if (@_ != 1) {
225 carp "charnames::viacode() expects one argument";
226 return ()
227 }
f0175764 228
35c0985d 229 my $arg = shift;
230 my $code = Unicode::UCD::_getcode($arg);
b177ca84 231
35c0985d 232 my $hex;
f0175764 233
35c0985d 234 if (defined $code) {
235 $hex = sprintf "%04X", $arg;
236 } else {
237 carp("unexpected arg \"$arg\" to charnames::viacode()");
238 return;
239 }
b177ca84 240
35c0985d 241 if ($code > 0x10FFFF) {
242 carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
243 return;
244 }
f0175764 245
35c0985d 246 return $viacode{$hex} if exists $viacode{$hex};
4e2cda5d 247
35c0985d 248 $txt = do "unicore/Name.pl" unless $txt;
b177ca84 249
35c0985d 250 if ($txt =~ m/^$hex\t\t(.+)/m) {
251 return $viacode{$hex} = $1;
252 } else {
253 return;
254 }
255} # viacode
daf0d493 256
4e2cda5d 257my %vianame;
258
daf0d493 259sub vianame
260{
35c0985d 261 if (@_ != 1) {
262 carp "charnames::vianame() expects one name argument";
263 return ()
264 }
daf0d493 265
35c0985d 266 my $arg = shift;
daf0d493 267
35c0985d 268 return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
dbc0d4f2 269
35c0985d 270 return $vianame{$arg} if exists $vianame{$arg};
4e2cda5d 271
35c0985d 272 $txt = do "unicore/Name.pl" unless $txt;
daf0d493 273
35c0985d 274 my $pos = index $txt, "\t\t$arg\n";
275 if ($[ <= $pos) {
276 my $posLF = rindex $txt, "\n", $pos;
277 (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
278 return $vianame{$arg} = hex $code;
279
280 # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
281 # then $posLF + 1 equals to $[ (at the beginning of $txt).
282 # Otherwise $posLF is the position of "\n";
283 # then $posLF + 1 must be the position of the next to "\n"
284 # (the beginning of the line).
285 # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
286 # "10300\t", "100000", etc. So we can get the code via removing TAB.
287 } else {
288 return;
289 }
290} # vianame
b177ca84 291
423cee85 292
2931;
294__END__
295
296=head1 NAME
297
274085e3 298charnames - define character names for C<\N{named}> string literal escapes
423cee85 299
300=head1 SYNOPSIS
301
302 use charnames ':full';
4a2d328f 303 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85 304
305 use charnames ':short';
4a2d328f 306 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85 307
308 use charnames qw(cyrillic greek);
4a2d328f 309 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 310
35c0985d 311 use charnames ":full", ":alias" => {
312 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
313 };
314 print "\N{e_ACUTE} is a small letter e with an acute.\n";
315
a23c04e4 316 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
317 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 318
423cee85 319=head1 DESCRIPTION
320
35c0985d 321Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
322names and customized aliases. If C<:full> is present, for expansion of
a191c821 323C<\N{CHARNAME}> string C<CHARNAME> is first looked in the list of
423cee85 324standard Unicode names of chars. If C<:short> is present, and
325C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
326as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821 327with script name arguments, then for C<\N{CHARNAME}> the name
423cee85 328C<CHARNAME> is looked up as a letter in the given scripts (in the
35c0985d 329specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
423cee85 330
331For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 332this pragma looks for the names
423cee85 333
334 SCRIPTNAME CAPITAL LETTER CHARNAME
335 SCRIPTNAME SMALL LETTER CHARNAME
336 SCRIPTNAME LETTER CHARNAME
337
338in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493 339then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
340is ignored.
341
342Note that C<\N{...}> is compile-time, it's a special form of string
343constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 344use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 345functionality, use charnames::vianame().
423cee85 346
301a3cda 347For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2 348as of Unicode 3.1, there are no official Unicode names but you can use
349instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
350Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
351has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081,
352U+0084, and U+0099 do not have names even in ISO 6429.
353
354Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
355is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda 356
423cee85 357=head1 CUSTOM TRANSLATORS
358
d5448623 359The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 360hardwired into F<charnames.pm>. A module can install custom
d5448623 361translations (inside the scope which C<use>s the module) with the
423cee85 362following magic incantation:
363
d5448623 364 use charnames (); # for $charnames::hint_bits
365 sub import {
366 shift;
367 $^H |= $charnames::hint_bits;
368 $^H{charnames} = \&translator;
369 }
423cee85 370
371Here translator() is a subroutine which takes C<CHARNAME> as an
372argument, and returns text to insert into the string instead of the
4a2d328f 373C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623 374in C<bytes> mode and out of it, the function should check the current
375state of C<bytes>-flag as in:
376
377 use bytes (); # for $bytes::hint_bits
378 sub translator {
379 if ($^H & $bytes::hint_bits) {
380 return bytes_translator(@_);
381 }
382 else {
383 return utf8_translator(@_);
384 }
423cee85 385 }
423cee85 386
35c0985d 387=head1 CUSTOM ALIASES
388
389This version of charnames supports three mechanisms of adding local
390or customized aliases to standard Unicode naming conventions (:full)
391
392=head2 Anonymous hashes
393
394 use charnames ":full", ":alias" => {
395 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
396 };
397 my $str = "\N{e_ACUTE}";
398
399=head2 Alias file
400
401 use charnames ":full", ":alias" => "pro";
402
403 will try to read "unicore/pro_alias.pl" from the @INC path. This
404 file should return a list in plain perl:
405
406 (
407 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
408 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
409 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
410 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
411 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
412 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
413 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
414 );
415
416=head2 Alias shortcut
417
418 use charnames ":alias" => ":pro";
419
420 works exactly the same as the alias pairs, only this time,
421 ":full" is inserted automatically as first argument (if no
422 other argument is given).
423
b177ca84 424=head1 charnames::viacode(code)
425
426Returns the full name of the character indicated by the numeric code.
427The example
428
429 print charnames::viacode(0x2722);
430
431prints "FOUR TEARDROP-SPOKED ASTERISK".
432
daf0d493 433Returns undef if no name is known for the code.
434
35c0985d 435This works only for the standard names, and does not yet apply
daf0d493 436to custom translators.
437
274085e3 438Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
439SPACE", not "BYTE ORDER MARK".
440
eb6a2339 441=head1 charnames::vianame(name)
daf0d493 442
443Returns the code point indicated by the name.
444The example
445
446 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
447
448prints "2722".
449
eb6a2339 450Returns undef if the name is unknown.
b177ca84 451
35c0985d 452This works only for the standard names, and does not yet apply
b177ca84 453to custom translators.
454
52ea3e69 455=head1 ALIASES
456
457A few aliases have been defined for convenience: instead of having
458to use the official names
459
460 LINE FEED (LF)
461 FORM FEED (FF)
462 CARRIAGE RETURN (CR)
463 NEXT LINE (NEL)
464
465(yes, with parentheses) one can use
466
467 LINE FEED
468 FORM FEED
469 CARRIAGE RETURN
470 NEXT LINE
471 LF
472 FF
473 CR
474 NEL
475
476One can also use
477
478 BYTE ORDER MARK
479 BOM
480
24b5d5cc 481and
482
483 ZWNJ
484 ZWJ
485
486for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
52ea3e69 487
488For backward compatibility one can use the old names for
489certain C0 and C1 controls
490
491 old new
492
493 HORIZONTAL TABULATION CHARACTER TABULATION
494 VERTICAL TABULATION LINE TABULATION
495 FILE SEPARATOR INFORMATION SEPARATOR FOUR
496 GROUP SEPARATOR INFORMATION SEPARATOR THREE
497 RECORD SEPARATOR INFORMATION SEPARATOR TWO
498 UNIT SEPARATOR INFORMATION SEPARATOR ONE
499 PARTIAL LINE DOWN PARTIAL LINE FORWARD
500 PARTIAL LINE UP PARTIAL LINE BACKWARD
501
502but the old names in addition to giving the character
503will also give a warning about being deprecated.
504
f0175764 505=head1 ILLEGAL CHARACTERS
506
00d835f2 507If you ask by name for a character that does not exist, a warning is
508given and the Unicode I<replacement character> "\x{FFFD}" is returned.
509
510If you ask by code for a character that does not exist, no warning is
511given and C<undef> is returned. (Though if you ask for a code point
512past U+10FFFF you do get a warning.)
f0175764 513
423cee85 514=head1 BUGS
515
516Since evaluation of the translation function happens in a middle of
517compilation (of a string literal), the translation function should not
518do any C<eval>s or C<require>s. This restriction should be lifted in
519a future version of Perl.
520
521=cut