Re: [PATCH] Re: ndbm failing on linux ?
[p5sagit/p5-mst-13.2.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84 2use strict;
3use warnings;
4use Carp;
5our $VERSION = '1.01';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
8$charnames::hint_bits = 0x20000;
423cee85 9
423cee85 10my $txt;
11
12# This is not optimized in any way yet
b177ca84 13sub charnames
14{
15 my $name = shift;
16
17 ## Suck in the code/name list as a big string.
18 ## Lines look like:
19 ## "0052\t\tLATIN CAPITAL LETTER R\n"
55d7b906 20 $txt = do "unicore/Name.pl" unless $txt;
b177ca84 21
22 ## @off will hold the index into the code/name string of the start and
23 ## end of the name as we find it.
423cee85 24 my @off;
b177ca84 25
26 ## If :full, look for the the name exactly
423cee85 27 if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
28 @off = ($-[0], $+[0]);
29 }
b177ca84 30
31 ## If we didn't get above, and :short allowed, look for the short name.
32 ## The short name is like "greek:Sigma"
423cee85 33 unless (@off) {
b177ca84 34 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
423cee85 35 my ($script, $cname) = ($1,$2);
36 my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
37 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
38 @off = ($-[0], $+[0]);
39 }
40 }
41 }
b177ca84 42
43 ## If we still don't have it, check for the name among the loaded
44 ## scripts.
45 if (not @off)
46 {
47 my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
48 for my $script ( @{$^H{charnames_scripts}} )
49 {
50 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
51 @off = ($-[0], $+[0]);
52 last;
53 }
54 }
423cee85 55 }
b177ca84 56
57 ## If we don't have it by now, give up.
f0175764 58 unless (@off) {
59 carp "Unknown charname '$name'";
60 return "\x{FFFD}";
61 }
b896c7a5 62
b177ca84 63 ##
64 ## Now know where in the string the name starts.
65 ## The code, in hex, is befor that.
66 ##
67 ## The code can be 4-6 characters long, so we've got to sort of
68 ## go look for it, just after the newline that comes before $off[0].
69 ##
70 ## This would be much easier if unicore/Name.pl had info in
71 ## a name/code order, instead of code/name order.
72 ##
73 ## The +1 after the rindex() is to skip past the newline we're finding,
74 ## or, if the rindex() fails, to put us to an offset of zero.
75 ##
76 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
77
78 ## we know where it starts, so turn into number - the ordinal for the char.
79 my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
80
d5448623 81 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 82 use bytes;
d41ff1b8 83 return chr $ord if $ord <= 255;
f0175764 84 my $hex = sprintf "%04x", $ord;
d41ff1b8 85 my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
f0175764 86 croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 87 }
f0175764 88
bfa383d6 89 return pack "U", $ord;
423cee85 90}
91
b177ca84 92sub import
93{
94 shift; ## ignore class name
95
96 if (not @_)
97 {
98 carp("`use charnames' needs explicit imports list");
99 }
d5448623 100 $^H |= $charnames::hint_bits;
423cee85 101 $^H{charnames} = \&charnames ;
b177ca84 102
103 ##
104 ## fill %h keys with our @_ args.
105 ##
423cee85 106 my %h;
107 @h{@_} = (1) x @_;
b177ca84 108
423cee85 109 $^H{charnames_full} = delete $h{':full'};
110 $^H{charnames_short} = delete $h{':short'};
111 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84 112
113 ##
114 ## If utf8? warnings are enabled, and some scripts were given,
115 ## see if at least we can find one letter of each script.
116 ##
117 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
118 {
119 $txt = do "unicore/Name.pl" unless $txt;
120
121 for my $script (@{$^H{charnames_scripts}})
122 {
123 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
124 warnings::warn('utf8', "No such script: '$script'");
125 }
126 }
bd62941a 127 }
423cee85 128}
129
f0175764 130require Unicode::UCD; # for Unicode::UCD::_getcode()
131
4e2cda5d 132my %viacode;
133
b177ca84 134sub viacode
135{
136 if (@_ != 1) {
daf0d493 137 carp "charnames::viacode() expects one numeric argument";
b177ca84 138 return ()
139 }
f0175764 140
b177ca84 141 my $arg = shift;
f0175764 142 my $code = Unicode::UCD::_getcode($arg);
b177ca84 143
144 my $hex;
f0175764 145
146 if (defined $code) {
b177ca84 147 $hex = sprintf "%04X", $arg;
148 } else {
149 carp("unexpected arg \"$arg\" to charnames::viacode()");
daf0d493 150 return;
b177ca84 151 }
152
f0175764 153 if ($code > 0x10FFFF) {
154 carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)";
155 return "\x{FFFD}";
156 }
157
4e2cda5d 158 return $viacode{$hex} if exists $viacode{$hex};
159
b177ca84 160 $txt = do "unicore/Name.pl" unless $txt;
161
162 if ($txt =~ m/^$hex\t\t(.+)/m) {
4e2cda5d 163 return $viacode{$hex} = $1;
b177ca84 164 } else {
daf0d493 165 return;
166 }
167}
168
4e2cda5d 169my %vianame;
170
daf0d493 171sub vianame
172{
173 if (@_ != 1) {
174 carp "charnames::vianame() expects one name argument";
175 return ()
176 }
177
178 my $arg = shift;
179
4e2cda5d 180 return $vianame{$arg} if exists $vianame{$arg};
181
daf0d493 182 $txt = do "unicore/Name.pl" unless $txt;
183
184 if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
4e2cda5d 185 return $vianame{$arg} = hex $1;
daf0d493 186 } else {
187 return;
b177ca84 188 }
189}
190
423cee85 191
1921;
193__END__
194
195=head1 NAME
196
b177ca84 197charnames - define character names for C<\N{named}> string literal escapes.
423cee85 198
199=head1 SYNOPSIS
200
201 use charnames ':full';
4a2d328f 202 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85 203
204 use charnames ':short';
4a2d328f 205 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85 206
207 use charnames qw(cyrillic greek);
4a2d328f 208 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 209
b177ca84 210 print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
daf0d493 211 printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 212
423cee85 213=head1 DESCRIPTION
214
215Pragma C<use charnames> supports arguments C<:full>, C<:short> and
216script names. If C<:full> is present, for expansion of
4a2d328f 217C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85 218standard Unicode names of chars. If C<:short> is present, and
219C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
220as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f 221with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85 222C<CHARNAME> is looked up as a letter in the given scripts (in the
223specified order).
224
225For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 226this pragma looks for the names
423cee85 227
228 SCRIPTNAME CAPITAL LETTER CHARNAME
229 SCRIPTNAME SMALL LETTER CHARNAME
230 SCRIPTNAME LETTER CHARNAME
231
232in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493 233then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
234is ignored.
235
236Note that C<\N{...}> is compile-time, it's a special form of string
237constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 238use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 239functionality, use charnames::vianame().
423cee85 240
241=head1 CUSTOM TRANSLATORS
242
d5448623 243The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 244hardwired into F<charnames.pm>. A module can install custom
d5448623 245translations (inside the scope which C<use>s the module) with the
423cee85 246following magic incantation:
247
d5448623 248 use charnames (); # for $charnames::hint_bits
249 sub import {
250 shift;
251 $^H |= $charnames::hint_bits;
252 $^H{charnames} = \&translator;
253 }
423cee85 254
255Here translator() is a subroutine which takes C<CHARNAME> as an
256argument, and returns text to insert into the string instead of the
4a2d328f 257C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623 258in C<bytes> mode and out of it, the function should check the current
259state of C<bytes>-flag as in:
260
261 use bytes (); # for $bytes::hint_bits
262 sub translator {
263 if ($^H & $bytes::hint_bits) {
264 return bytes_translator(@_);
265 }
266 else {
267 return utf8_translator(@_);
268 }
423cee85 269 }
423cee85 270
b177ca84 271=head1 charnames::viacode(code)
272
273Returns the full name of the character indicated by the numeric code.
274The example
275
276 print charnames::viacode(0x2722);
277
278prints "FOUR TEARDROP-SPOKED ASTERISK".
279
daf0d493 280Returns undef if no name is known for the code.
281
282This works only for the standard names, and does not yet aply
283to custom translators.
284
285=head1 charnames::vianame(code)
286
287Returns the code point indicated by the name.
288The example
289
290 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
291
292prints "2722".
293
294Returns undef if no name is known for the name.
b177ca84 295
296This works only for the standard names, and does not yet aply
297to custom translators.
298
f0175764 299=head1 ILLEGAL CHARACTERS
300
301If you ask for a character that does not exist, a warning is given
302and the special Unicode I<replacement character> "\x{FFFD}" is returned.
303
423cee85 304=head1 BUGS
305
306Since evaluation of the translation function happens in a middle of
307compilation (of a string literal), the translation function should not
308do any C<eval>s or C<require>s. This restriction should be lifted in
309a future version of Perl.
310
311=cut