do just-in-time loading of Carp.pm in some modules
[p5sagit/p5-mst-13.2.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84 2use strict;
3use warnings;
51cf30b6 4use File::Spec;
8878f897 5our $VERSION = '1.04_1';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
9cfe5470 8$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
423cee85 9
52ea3e69 10my %alias1 = (
11 # Icky 3.2 names with parentheses.
12 'LINE FEED' => 'LINE FEED (LF)',
13 'FORM FEED' => 'FORM FEED (FF)',
14 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
15 'NEXT LINE' => 'NEXT LINE (NEL)',
16 # Convenience.
17 'LF' => 'LINE FEED (LF)',
18 'FF' => 'FORM FEED (FF)',
eb380778 19 'CR' => 'CARRIAGE RETURN (CR)',
51e9e896 20 'NEL' => 'NEXT LINE (NEL)',
24b5d5cc 21 # More convenience. For futher convencience,
22 # it is suggested some way using using the NamesList
23 # aliases is implemented.
24 'ZWNJ' => 'ZERO WIDTH NON-JOINER',
25 'ZWJ' => 'ZERO WIDTH JOINER',
52ea3e69 26 'BOM' => 'BYTE ORDER MARK',
27 );
28
29my %alias2 = (
30 # Pre-3.2 compatibility (only for the first 256 characters).
31 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
32 'VERTICAL TABULATION' => 'LINE TABULATION',
33 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
34 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
35 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
36 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
37 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
38 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
39 );
40
35c0985d 41my %alias3 = (
42 # User defined aliasses. Even more convenient :)
43 );
423cee85 44my $txt;
45
8878f897 46sub croak
47{
48 require Carp; goto &Carp::croak;
49} # croak
50
51sub carp
52{
53 require Carp; goto &Carp::carp;
54} # carp
55
35c0985d 56sub alias (@)
57{
58 @_ or return %alias3;
59 my $alias = ref $_[0] ? $_[0] : { @_ };
60 @alias3{keys %$alias} = values %$alias;
61} # alias
62
63sub alias_file ($)
64{
51cf30b6 65 my ($arg, $file) = @_;
66 if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
67 $file = $arg;
68 }
69 elsif ($arg =~ m/^\w+$/) {
70 $file = "unicore/${arg}_alias.pl";
71 }
72 else {
73 croak "Charnames alias files can only have identifier characters";
74 }
35c0985d 75 if (my @alias = do $file) {
51cf30b6 76 @alias == 1 && !defined $alias[0] and
77 croak "$file cannot be used as alias file for charnames";
78 @alias % 2 and
79 croak "$file did not return a (valid) list of alias pairs";
35c0985d 80 alias (@alias);
81 return (1);
82 }
83 0;
84} # alias_file
85
423cee85 86# This is not optimized in any way yet
b177ca84 87sub charnames
88{
89 my $name = shift;
90
52ea3e69 91 if (exists $alias1{$name}) {
35c0985d 92 $name = $alias1{$name};
52ea3e69 93 }
35c0985d 94 elsif (exists $alias2{$name}) {
95 require warnings;
96 warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
97 $name = $alias2{$name};
98 }
99 elsif (exists $alias3{$name}) {
100 $name = $alias3{$name};
52ea3e69 101 }
b177ca84 102
52ea3e69 103 my $ord;
423cee85 104 my @off;
52ea3e69 105 my $fname;
106
107 if ($name eq "BYTE ORDER MARK") {
35c0985d 108 $fname = $name;
109 $ord = 0xFEFF;
52ea3e69 110 } else {
35c0985d 111 ## Suck in the code/name list as a big string.
112 ## Lines look like:
113 ## "0052\t\tLATIN CAPITAL LETTER R\n"
114 $txt = do "unicore/Name.pl" unless $txt;
115
116 ## @off will hold the index into the code/name string of the start and
117 ## end of the name as we find it.
118
a6d05634 119 ## If :full, look for the name exactly
35c0985d 120 if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
121 @off = ($-[0], $+[0]);
122 }
123
124 ## If we didn't get above, and :short allowed, look for the short name.
125 ## The short name is like "greek:Sigma"
126 unless (@off) {
127 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
128 my ($script, $cname) = ($1, $2);
129 my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
130 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
52ea3e69 131 @off = ($-[0], $+[0]);
35c0985d 132 }
423cee85 133 }
35c0985d 134 }
b177ca84 135
35c0985d 136 ## If we still don't have it, check for the name among the loaded
137 ## scripts.
138 if (not @off) {
139 my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
140 for my $script (@{$^H{charnames_scripts}}) {
141 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
142 @off = ($-[0], $+[0]);
143 last;
144 }
52ea3e69 145 }
35c0985d 146 }
147
148 ## If we don't have it by now, give up.
149 unless (@off) {
150 carp "Unknown charname '$name'";
151 return "\x{FFFD}";
152 }
153
154 ##
155 ## Now know where in the string the name starts.
156 ## The code, in hex, is before that.
157 ##
158 ## The code can be 4-6 characters long, so we've got to sort of
159 ## go look for it, just after the newline that comes before $off[0].
160 ##
161 ## This would be much easier if unicore/Name.pl had info in
162 ## a name/code order, instead of code/name order.
163 ##
164 ## The +1 after the rindex() is to skip past the newline we're finding,
165 ## or, if the rindex() fails, to put us to an offset of zero.
166 ##
167 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
168
169 ## we know where it starts, so turn into number -
170 ## the ordinal for the char.
171 $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
423cee85 172 }
b177ca84 173
d5448623 174 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 175 use bytes;
d41ff1b8 176 return chr $ord if $ord <= 255;
f0175764 177 my $hex = sprintf "%04x", $ord;
52ea3e69 178 if (not defined $fname) {
35c0985d 179 $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
52ea3e69 180 }
f0175764 181 croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 182 }
f0175764 183
52ea3e69 184 no warnings 'utf8'; # allow even illegal characters
bfa383d6 185 return pack "U", $ord;
35c0985d 186} # charnames
423cee85 187
b177ca84 188sub import
189{
190 shift; ## ignore class name
191
35c0985d 192 if (not @_) {
193 carp("`use charnames' needs explicit imports list");
b177ca84 194 }
d5448623 195 $^H |= $charnames::hint_bits;
423cee85 196 $^H{charnames} = \&charnames ;
b177ca84 197
198 ##
199 ## fill %h keys with our @_ args.
200 ##
35c0985d 201 my ($promote, %h, @args) = (0);
e5c3f898 202 while (my $arg = shift) {
203 if ($arg eq ":alias") {
51cf30b6 204 @_ or
205 croak ":alias needs an argument in charnames";
35c0985d 206 my $alias = shift;
207 if (ref $alias) {
208 ref $alias eq "HASH" or
51cf30b6 209 croak "Only HASH reference supported as argument to :alias";
35c0985d 210 alias ($alias);
211 next;
212 }
51cf30b6 213 if ($alias =~ m{:(\w+)$}) {
214 $1 eq "full" || $1 eq "short" and
215 croak ":alias cannot use existing pragma :$1 (reversed order?)";
216 alias_file ($1) and $promote = 1;
217 next;
35c0985d 218 }
51cf30b6 219 alias_file ($alias);
220 next;
221 }
e5c3f898 222 if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
223 warn "unsupported special '$arg' in charnames";
51cf30b6 224 next;
35c0985d 225 }
e5c3f898 226 push @args, $arg;
35c0985d 227 }
228 @args == 0 && $promote and @args = (":full");
229 @h{@args} = (1) x @args;
b177ca84 230
423cee85 231 $^H{charnames_full} = delete $h{':full'};
232 $^H{charnames_short} = delete $h{':short'};
233 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84 234
235 ##
236 ## If utf8? warnings are enabled, and some scripts were given,
237 ## see if at least we can find one letter of each script.
238 ##
35c0985d 239 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
240 $txt = do "unicore/Name.pl" unless $txt;
241
242 for my $script (@{$^H{charnames_scripts}}) {
243 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
244 warnings::warn('utf8', "No such script: '$script'");
b177ca84 245 }
35c0985d 246 }
bd62941a 247 }
35c0985d 248} # import
423cee85 249
b8effcb5 250# this comes actually from Unicode::UCD, but it avoids the
251# overhead of loading it
252sub _getcode {
253 my $arg = shift;
254
255 if ($arg =~ /^[1-9]\d*$/) {
256 return $arg;
257 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
258 return hex($1);
259 }
260
261 return;
262}
f0175764 263
4e2cda5d 264my %viacode;
265
b177ca84 266sub viacode
267{
35c0985d 268 if (@_ != 1) {
269 carp "charnames::viacode() expects one argument";
270 return ()
271 }
f0175764 272
35c0985d 273 my $arg = shift;
b8effcb5 274 my $code = _getcode($arg);
b177ca84 275
35c0985d 276 my $hex;
f0175764 277
35c0985d 278 if (defined $code) {
279 $hex = sprintf "%04X", $arg;
280 } else {
281 carp("unexpected arg \"$arg\" to charnames::viacode()");
282 return;
283 }
b177ca84 284
35c0985d 285 if ($code > 0x10FFFF) {
286 carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
287 return;
288 }
f0175764 289
35c0985d 290 return $viacode{$hex} if exists $viacode{$hex};
4e2cda5d 291
35c0985d 292 $txt = do "unicore/Name.pl" unless $txt;
b177ca84 293
35c0985d 294 if ($txt =~ m/^$hex\t\t(.+)/m) {
295 return $viacode{$hex} = $1;
296 } else {
297 return;
298 }
299} # viacode
daf0d493 300
4e2cda5d 301my %vianame;
302
daf0d493 303sub vianame
304{
35c0985d 305 if (@_ != 1) {
306 carp "charnames::vianame() expects one name argument";
307 return ()
308 }
daf0d493 309
35c0985d 310 my $arg = shift;
daf0d493 311
35c0985d 312 return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
dbc0d4f2 313
35c0985d 314 return $vianame{$arg} if exists $vianame{$arg};
4e2cda5d 315
35c0985d 316 $txt = do "unicore/Name.pl" unless $txt;
daf0d493 317
35c0985d 318 my $pos = index $txt, "\t\t$arg\n";
319 if ($[ <= $pos) {
320 my $posLF = rindex $txt, "\n", $pos;
321 (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
322 return $vianame{$arg} = hex $code;
323
324 # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
325 # then $posLF + 1 equals to $[ (at the beginning of $txt).
326 # Otherwise $posLF is the position of "\n";
327 # then $posLF + 1 must be the position of the next to "\n"
328 # (the beginning of the line).
329 # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
330 # "10300\t", "100000", etc. So we can get the code via removing TAB.
331 } else {
332 return;
333 }
334} # vianame
b177ca84 335
423cee85 336
3371;
338__END__
339
340=head1 NAME
341
274085e3 342charnames - define character names for C<\N{named}> string literal escapes
423cee85 343
344=head1 SYNOPSIS
345
346 use charnames ':full';
4a2d328f 347 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85 348
349 use charnames ':short';
4a2d328f 350 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85 351
352 use charnames qw(cyrillic greek);
4a2d328f 353 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 354
35c0985d 355 use charnames ":full", ":alias" => {
356 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
76ae0c45 357 };
35c0985d 358 print "\N{e_ACUTE} is a small letter e with an acute.\n";
359
76ae0c45 360 use charnames ();
a23c04e4 361 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
362 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 363
423cee85 364=head1 DESCRIPTION
365
35c0985d 366Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
367names and customized aliases. If C<:full> is present, for expansion of
76ae0c45 368C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
369standard Unicode character names. If C<:short> is present, and
423cee85 370C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
371as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821 372with script name arguments, then for C<\N{CHARNAME}> the name
423cee85 373C<CHARNAME> is looked up as a letter in the given scripts (in the
35c0985d 374specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
423cee85 375
376For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 377this pragma looks for the names
423cee85 378
379 SCRIPTNAME CAPITAL LETTER CHARNAME
380 SCRIPTNAME SMALL LETTER CHARNAME
381 SCRIPTNAME LETTER CHARNAME
382
383in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493 384then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
385is ignored.
386
387Note that C<\N{...}> is compile-time, it's a special form of string
388constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 389use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 390functionality, use charnames::vianame().
423cee85 391
301a3cda 392For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2 393as of Unicode 3.1, there are no official Unicode names but you can use
394instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
395Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
396has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081,
397U+0084, and U+0099 do not have names even in ISO 6429.
398
399Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
400is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda 401
423cee85 402=head1 CUSTOM TRANSLATORS
403
d5448623 404The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 405hardwired into F<charnames.pm>. A module can install custom
d5448623 406translations (inside the scope which C<use>s the module) with the
423cee85 407following magic incantation:
408
d5448623 409 use charnames (); # for $charnames::hint_bits
410 sub import {
411 shift;
412 $^H |= $charnames::hint_bits;
413 $^H{charnames} = \&translator;
414 }
423cee85 415
416Here translator() is a subroutine which takes C<CHARNAME> as an
417argument, and returns text to insert into the string instead of the
4a2d328f 418C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623 419in C<bytes> mode and out of it, the function should check the current
420state of C<bytes>-flag as in:
421
422 use bytes (); # for $bytes::hint_bits
423 sub translator {
424 if ($^H & $bytes::hint_bits) {
425 return bytes_translator(@_);
426 }
427 else {
428 return utf8_translator(@_);
429 }
423cee85 430 }
423cee85 431
35c0985d 432=head1 CUSTOM ALIASES
433
434This version of charnames supports three mechanisms of adding local
435or customized aliases to standard Unicode naming conventions (:full)
436
437=head2 Anonymous hashes
438
439 use charnames ":full", ":alias" => {
440 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
441 };
442 my $str = "\N{e_ACUTE}";
443
444=head2 Alias file
445
446 use charnames ":full", ":alias" => "pro";
447
448 will try to read "unicore/pro_alias.pl" from the @INC path. This
449 file should return a list in plain perl:
450
451 (
452 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
453 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
454 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
455 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
456 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
457 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
458 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
459 );
460
461=head2 Alias shortcut
462
463 use charnames ":alias" => ":pro";
464
465 works exactly the same as the alias pairs, only this time,
466 ":full" is inserted automatically as first argument (if no
467 other argument is given).
468
b177ca84 469=head1 charnames::viacode(code)
470
471Returns the full name of the character indicated by the numeric code.
472The example
473
474 print charnames::viacode(0x2722);
475
476prints "FOUR TEARDROP-SPOKED ASTERISK".
477
daf0d493 478Returns undef if no name is known for the code.
479
35c0985d 480This works only for the standard names, and does not yet apply
daf0d493 481to custom translators.
482
274085e3 483Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
484SPACE", not "BYTE ORDER MARK".
485
eb6a2339 486=head1 charnames::vianame(name)
daf0d493 487
488Returns the code point indicated by the name.
489The example
490
491 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
492
493prints "2722".
494
eb6a2339 495Returns undef if the name is unknown.
b177ca84 496
35c0985d 497This works only for the standard names, and does not yet apply
b177ca84 498to custom translators.
499
52ea3e69 500=head1 ALIASES
501
502A few aliases have been defined for convenience: instead of having
503to use the official names
504
505 LINE FEED (LF)
506 FORM FEED (FF)
507 CARRIAGE RETURN (CR)
508 NEXT LINE (NEL)
509
510(yes, with parentheses) one can use
511
512 LINE FEED
513 FORM FEED
514 CARRIAGE RETURN
515 NEXT LINE
516 LF
517 FF
518 CR
519 NEL
520
521One can also use
522
523 BYTE ORDER MARK
524 BOM
525
24b5d5cc 526and
527
528 ZWNJ
529 ZWJ
530
531for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
52ea3e69 532
533For backward compatibility one can use the old names for
534certain C0 and C1 controls
535
536 old new
537
538 HORIZONTAL TABULATION CHARACTER TABULATION
539 VERTICAL TABULATION LINE TABULATION
540 FILE SEPARATOR INFORMATION SEPARATOR FOUR
541 GROUP SEPARATOR INFORMATION SEPARATOR THREE
542 RECORD SEPARATOR INFORMATION SEPARATOR TWO
543 UNIT SEPARATOR INFORMATION SEPARATOR ONE
544 PARTIAL LINE DOWN PARTIAL LINE FORWARD
545 PARTIAL LINE UP PARTIAL LINE BACKWARD
546
547but the old names in addition to giving the character
548will also give a warning about being deprecated.
549
f0175764 550=head1 ILLEGAL CHARACTERS
551
00d835f2 552If you ask by name for a character that does not exist, a warning is
553given and the Unicode I<replacement character> "\x{FFFD}" is returned.
554
555If you ask by code for a character that does not exist, no warning is
556given and C<undef> is returned. (Though if you ask for a code point
557past U+10FFFF you do get a warning.)
f0175764 558
423cee85 559=head1 BUGS
560
561Since evaluation of the translation function happens in a middle of
562compilation (of a string literal), the translation function should not
563do any C<eval>s or C<require>s. This restriction should be lifted in
564a future version of Perl.
565
566=cut