More PerlIO documentation.
[p5sagit/p5-mst-13.2.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84 2use strict;
3use warnings;
4use Carp;
51cf30b6 5use File::Spec;
35c0985d 6our $VERSION = '1.02';
b75c8c73 7
d5448623 8use bytes (); # for $bytes::hint_bits
9cfe5470 9$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
423cee85 10
52ea3e69 11my %alias1 = (
12 # Icky 3.2 names with parentheses.
13 'LINE FEED' => 'LINE FEED (LF)',
14 'FORM FEED' => 'FORM FEED (FF)',
15 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
16 'NEXT LINE' => 'NEXT LINE (NEL)',
17 # Convenience.
18 'LF' => 'LINE FEED (LF)',
19 'FF' => 'FORM FEED (FF)',
eb380778 20 'CR' => 'CARRIAGE RETURN (CR)',
51e9e896 21 'NEL' => 'NEXT LINE (NEL)',
24b5d5cc 22 # More convenience. For futher convencience,
23 # it is suggested some way using using the NamesList
24 # aliases is implemented.
25 'ZWNJ' => 'ZERO WIDTH NON-JOINER',
26 'ZWJ' => 'ZERO WIDTH JOINER',
52ea3e69 27 'BOM' => 'BYTE ORDER MARK',
28 );
29
30my %alias2 = (
31 # Pre-3.2 compatibility (only for the first 256 characters).
32 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
33 'VERTICAL TABULATION' => 'LINE TABULATION',
34 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
35 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
36 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
37 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
38 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
39 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
40 );
41
35c0985d 42my %alias3 = (
43 # User defined aliasses. Even more convenient :)
44 );
423cee85 45my $txt;
46
35c0985d 47sub alias (@)
48{
49 @_ or return %alias3;
50 my $alias = ref $_[0] ? $_[0] : { @_ };
51 @alias3{keys %$alias} = values %$alias;
52} # alias
53
54sub alias_file ($)
55{
51cf30b6 56 my ($arg, $file) = @_;
57 if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
58 $file = $arg;
59 }
60 elsif ($arg =~ m/^\w+$/) {
61 $file = "unicore/${arg}_alias.pl";
62 }
63 else {
64 croak "Charnames alias files can only have identifier characters";
65 }
35c0985d 66 if (my @alias = do $file) {
51cf30b6 67 @alias == 1 && !defined $alias[0] and
68 croak "$file cannot be used as alias file for charnames";
69 @alias % 2 and
70 croak "$file did not return a (valid) list of alias pairs";
35c0985d 71 alias (@alias);
72 return (1);
73 }
74 0;
75} # alias_file
76
423cee85 77# This is not optimized in any way yet
b177ca84 78sub charnames
79{
80 my $name = shift;
81
52ea3e69 82 if (exists $alias1{$name}) {
35c0985d 83 $name = $alias1{$name};
52ea3e69 84 }
35c0985d 85 elsif (exists $alias2{$name}) {
86 require warnings;
87 warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
88 $name = $alias2{$name};
89 }
90 elsif (exists $alias3{$name}) {
91 $name = $alias3{$name};
52ea3e69 92 }
b177ca84 93
52ea3e69 94 my $ord;
423cee85 95 my @off;
52ea3e69 96 my $fname;
97
98 if ($name eq "BYTE ORDER MARK") {
35c0985d 99 $fname = $name;
100 $ord = 0xFEFF;
52ea3e69 101 } else {
35c0985d 102 ## Suck in the code/name list as a big string.
103 ## Lines look like:
104 ## "0052\t\tLATIN CAPITAL LETTER R\n"
105 $txt = do "unicore/Name.pl" unless $txt;
106
107 ## @off will hold the index into the code/name string of the start and
108 ## end of the name as we find it.
109
a6d05634 110 ## If :full, look for the name exactly
35c0985d 111 if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
112 @off = ($-[0], $+[0]);
113 }
114
115 ## If we didn't get above, and :short allowed, look for the short name.
116 ## The short name is like "greek:Sigma"
117 unless (@off) {
118 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
119 my ($script, $cname) = ($1, $2);
120 my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
121 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
52ea3e69 122 @off = ($-[0], $+[0]);
35c0985d 123 }
423cee85 124 }
35c0985d 125 }
b177ca84 126
35c0985d 127 ## If we still don't have it, check for the name among the loaded
128 ## scripts.
129 if (not @off) {
130 my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
131 for my $script (@{$^H{charnames_scripts}}) {
132 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
133 @off = ($-[0], $+[0]);
134 last;
135 }
52ea3e69 136 }
35c0985d 137 }
138
139 ## If we don't have it by now, give up.
140 unless (@off) {
141 carp "Unknown charname '$name'";
142 return "\x{FFFD}";
143 }
144
145 ##
146 ## Now know where in the string the name starts.
147 ## The code, in hex, is before that.
148 ##
149 ## The code can be 4-6 characters long, so we've got to sort of
150 ## go look for it, just after the newline that comes before $off[0].
151 ##
152 ## This would be much easier if unicore/Name.pl had info in
153 ## a name/code order, instead of code/name order.
154 ##
155 ## The +1 after the rindex() is to skip past the newline we're finding,
156 ## or, if the rindex() fails, to put us to an offset of zero.
157 ##
158 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
159
160 ## we know where it starts, so turn into number -
161 ## the ordinal for the char.
162 $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
423cee85 163 }
b177ca84 164
d5448623 165 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 166 use bytes;
d41ff1b8 167 return chr $ord if $ord <= 255;
f0175764 168 my $hex = sprintf "%04x", $ord;
52ea3e69 169 if (not defined $fname) {
35c0985d 170 $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
52ea3e69 171 }
f0175764 172 croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 173 }
f0175764 174
52ea3e69 175 no warnings 'utf8'; # allow even illegal characters
bfa383d6 176 return pack "U", $ord;
35c0985d 177} # charnames
423cee85 178
b177ca84 179sub import
180{
181 shift; ## ignore class name
182
35c0985d 183 if (not @_) {
184 carp("`use charnames' needs explicit imports list");
b177ca84 185 }
d5448623 186 $^H |= $charnames::hint_bits;
423cee85 187 $^H{charnames} = \&charnames ;
b177ca84 188
189 ##
190 ## fill %h keys with our @_ args.
191 ##
35c0985d 192 my ($promote, %h, @args) = (0);
193 while (@_ and $_ = shift) {
51cf30b6 194 if ($_ eq ":alias") {
195 @_ or
196 croak ":alias needs an argument in charnames";
35c0985d 197 my $alias = shift;
198 if (ref $alias) {
199 ref $alias eq "HASH" or
51cf30b6 200 croak "Only HASH reference supported as argument to :alias";
35c0985d 201 alias ($alias);
202 next;
203 }
51cf30b6 204 if ($alias =~ m{:(\w+)$}) {
205 $1 eq "full" || $1 eq "short" and
206 croak ":alias cannot use existing pragma :$1 (reversed order?)";
207 alias_file ($1) and $promote = 1;
208 next;
35c0985d 209 }
51cf30b6 210 alias_file ($alias);
211 next;
212 }
213 if (m/^:/ and ! ($_ eq ":full" || $_ eq ":short")) {
214 warn "unsupported special '$_' in charnames";
215 next;
35c0985d 216 }
217 push @args, $_;
218 }
219 @args == 0 && $promote and @args = (":full");
220 @h{@args} = (1) x @args;
b177ca84 221
423cee85 222 $^H{charnames_full} = delete $h{':full'};
223 $^H{charnames_short} = delete $h{':short'};
224 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84 225
226 ##
227 ## If utf8? warnings are enabled, and some scripts were given,
228 ## see if at least we can find one letter of each script.
229 ##
35c0985d 230 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
231 $txt = do "unicore/Name.pl" unless $txt;
232
233 for my $script (@{$^H{charnames_scripts}}) {
234 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
235 warnings::warn('utf8', "No such script: '$script'");
b177ca84 236 }
35c0985d 237 }
bd62941a 238 }
35c0985d 239} # import
423cee85 240
f0175764 241require Unicode::UCD; # for Unicode::UCD::_getcode()
242
4e2cda5d 243my %viacode;
244
b177ca84 245sub viacode
246{
35c0985d 247 if (@_ != 1) {
248 carp "charnames::viacode() expects one argument";
249 return ()
250 }
f0175764 251
35c0985d 252 my $arg = shift;
253 my $code = Unicode::UCD::_getcode($arg);
b177ca84 254
35c0985d 255 my $hex;
f0175764 256
35c0985d 257 if (defined $code) {
258 $hex = sprintf "%04X", $arg;
259 } else {
260 carp("unexpected arg \"$arg\" to charnames::viacode()");
261 return;
262 }
b177ca84 263
35c0985d 264 if ($code > 0x10FFFF) {
265 carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
266 return;
267 }
f0175764 268
35c0985d 269 return $viacode{$hex} if exists $viacode{$hex};
4e2cda5d 270
35c0985d 271 $txt = do "unicore/Name.pl" unless $txt;
b177ca84 272
35c0985d 273 if ($txt =~ m/^$hex\t\t(.+)/m) {
274 return $viacode{$hex} = $1;
275 } else {
276 return;
277 }
278} # viacode
daf0d493 279
4e2cda5d 280my %vianame;
281
daf0d493 282sub vianame
283{
35c0985d 284 if (@_ != 1) {
285 carp "charnames::vianame() expects one name argument";
286 return ()
287 }
daf0d493 288
35c0985d 289 my $arg = shift;
daf0d493 290
35c0985d 291 return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
dbc0d4f2 292
35c0985d 293 return $vianame{$arg} if exists $vianame{$arg};
4e2cda5d 294
35c0985d 295 $txt = do "unicore/Name.pl" unless $txt;
daf0d493 296
35c0985d 297 my $pos = index $txt, "\t\t$arg\n";
298 if ($[ <= $pos) {
299 my $posLF = rindex $txt, "\n", $pos;
300 (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
301 return $vianame{$arg} = hex $code;
302
303 # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
304 # then $posLF + 1 equals to $[ (at the beginning of $txt).
305 # Otherwise $posLF is the position of "\n";
306 # then $posLF + 1 must be the position of the next to "\n"
307 # (the beginning of the line).
308 # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
309 # "10300\t", "100000", etc. So we can get the code via removing TAB.
310 } else {
311 return;
312 }
313} # vianame
b177ca84 314
423cee85 315
3161;
317__END__
318
319=head1 NAME
320
274085e3 321charnames - define character names for C<\N{named}> string literal escapes
423cee85 322
323=head1 SYNOPSIS
324
325 use charnames ':full';
4a2d328f 326 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85 327
328 use charnames ':short';
4a2d328f 329 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85 330
331 use charnames qw(cyrillic greek);
4a2d328f 332 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 333
35c0985d 334 use charnames ":full", ":alias" => {
335 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
76ae0c45 336 };
35c0985d 337 print "\N{e_ACUTE} is a small letter e with an acute.\n";
338
76ae0c45 339 use charnames ();
a23c04e4 340 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
341 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 342
423cee85 343=head1 DESCRIPTION
344
35c0985d 345Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
346names and customized aliases. If C<:full> is present, for expansion of
76ae0c45 347C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
348standard Unicode character names. If C<:short> is present, and
423cee85 349C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
350as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821 351with script name arguments, then for C<\N{CHARNAME}> the name
423cee85 352C<CHARNAME> is looked up as a letter in the given scripts (in the
35c0985d 353specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
423cee85 354
355For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 356this pragma looks for the names
423cee85 357
358 SCRIPTNAME CAPITAL LETTER CHARNAME
359 SCRIPTNAME SMALL LETTER CHARNAME
360 SCRIPTNAME LETTER CHARNAME
361
362in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493 363then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
364is ignored.
365
366Note that C<\N{...}> is compile-time, it's a special form of string
367constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 368use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 369functionality, use charnames::vianame().
423cee85 370
301a3cda 371For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2 372as of Unicode 3.1, there are no official Unicode names but you can use
373instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
374Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
375has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081,
376U+0084, and U+0099 do not have names even in ISO 6429.
377
378Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
379is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda 380
423cee85 381=head1 CUSTOM TRANSLATORS
382
d5448623 383The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 384hardwired into F<charnames.pm>. A module can install custom
d5448623 385translations (inside the scope which C<use>s the module) with the
423cee85 386following magic incantation:
387
d5448623 388 use charnames (); # for $charnames::hint_bits
389 sub import {
390 shift;
391 $^H |= $charnames::hint_bits;
392 $^H{charnames} = \&translator;
393 }
423cee85 394
395Here translator() is a subroutine which takes C<CHARNAME> as an
396argument, and returns text to insert into the string instead of the
4a2d328f 397C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623 398in C<bytes> mode and out of it, the function should check the current
399state of C<bytes>-flag as in:
400
401 use bytes (); # for $bytes::hint_bits
402 sub translator {
403 if ($^H & $bytes::hint_bits) {
404 return bytes_translator(@_);
405 }
406 else {
407 return utf8_translator(@_);
408 }
423cee85 409 }
423cee85 410
35c0985d 411=head1 CUSTOM ALIASES
412
413This version of charnames supports three mechanisms of adding local
414or customized aliases to standard Unicode naming conventions (:full)
415
416=head2 Anonymous hashes
417
418 use charnames ":full", ":alias" => {
419 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
420 };
421 my $str = "\N{e_ACUTE}";
422
423=head2 Alias file
424
425 use charnames ":full", ":alias" => "pro";
426
427 will try to read "unicore/pro_alias.pl" from the @INC path. This
428 file should return a list in plain perl:
429
430 (
431 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
432 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
433 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
434 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
435 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
436 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
437 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
438 );
439
440=head2 Alias shortcut
441
442 use charnames ":alias" => ":pro";
443
444 works exactly the same as the alias pairs, only this time,
445 ":full" is inserted automatically as first argument (if no
446 other argument is given).
447
b177ca84 448=head1 charnames::viacode(code)
449
450Returns the full name of the character indicated by the numeric code.
451The example
452
453 print charnames::viacode(0x2722);
454
455prints "FOUR TEARDROP-SPOKED ASTERISK".
456
daf0d493 457Returns undef if no name is known for the code.
458
35c0985d 459This works only for the standard names, and does not yet apply
daf0d493 460to custom translators.
461
274085e3 462Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
463SPACE", not "BYTE ORDER MARK".
464
eb6a2339 465=head1 charnames::vianame(name)
daf0d493 466
467Returns the code point indicated by the name.
468The example
469
470 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
471
472prints "2722".
473
eb6a2339 474Returns undef if the name is unknown.
b177ca84 475
35c0985d 476This works only for the standard names, and does not yet apply
b177ca84 477to custom translators.
478
52ea3e69 479=head1 ALIASES
480
481A few aliases have been defined for convenience: instead of having
482to use the official names
483
484 LINE FEED (LF)
485 FORM FEED (FF)
486 CARRIAGE RETURN (CR)
487 NEXT LINE (NEL)
488
489(yes, with parentheses) one can use
490
491 LINE FEED
492 FORM FEED
493 CARRIAGE RETURN
494 NEXT LINE
495 LF
496 FF
497 CR
498 NEL
499
500One can also use
501
502 BYTE ORDER MARK
503 BOM
504
24b5d5cc 505and
506
507 ZWNJ
508 ZWJ
509
510for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
52ea3e69 511
512For backward compatibility one can use the old names for
513certain C0 and C1 controls
514
515 old new
516
517 HORIZONTAL TABULATION CHARACTER TABULATION
518 VERTICAL TABULATION LINE TABULATION
519 FILE SEPARATOR INFORMATION SEPARATOR FOUR
520 GROUP SEPARATOR INFORMATION SEPARATOR THREE
521 RECORD SEPARATOR INFORMATION SEPARATOR TWO
522 UNIT SEPARATOR INFORMATION SEPARATOR ONE
523 PARTIAL LINE DOWN PARTIAL LINE FORWARD
524 PARTIAL LINE UP PARTIAL LINE BACKWARD
525
526but the old names in addition to giving the character
527will also give a warning about being deprecated.
528
f0175764 529=head1 ILLEGAL CHARACTERS
530
00d835f2 531If you ask by name for a character that does not exist, a warning is
532given and the Unicode I<replacement character> "\x{FFFD}" is returned.
533
534If you ask by code for a character that does not exist, no warning is
535given and C<undef> is returned. (Though if you ask for a code point
536past U+10FFFF you do get a warning.)
f0175764 537
423cee85 538=head1 BUGS
539
540Since evaluation of the translation function happens in a middle of
541compilation (of a string literal), the translation function should not
542do any C<eval>s or C<require>s. This restriction should be lifted in
543a future version of Perl.
544
545=cut