I don't think trying to bracket the hires time with lores
[p5sagit/p5-mst-13.2.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84 2use strict;
3use warnings;
4use Carp;
5our $VERSION = '1.01';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
8$charnames::hint_bits = 0x20000;
423cee85 9
423cee85 10my $txt;
11
12# This is not optimized in any way yet
b177ca84 13sub charnames
14{
15 my $name = shift;
16
17 ## Suck in the code/name list as a big string.
18 ## Lines look like:
19 ## "0052\t\tLATIN CAPITAL LETTER R\n"
55d7b906 20 $txt = do "unicore/Name.pl" unless $txt;
b177ca84 21
22 ## @off will hold the index into the code/name string of the start and
23 ## end of the name as we find it.
423cee85 24 my @off;
b177ca84 25
26 ## If :full, look for the the name exactly
423cee85 27 if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
28 @off = ($-[0], $+[0]);
29 }
b177ca84 30
31 ## If we didn't get above, and :short allowed, look for the short name.
32 ## The short name is like "greek:Sigma"
423cee85 33 unless (@off) {
b177ca84 34 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
423cee85 35 my ($script, $cname) = ($1,$2);
36 my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
37 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
38 @off = ($-[0], $+[0]);
39 }
40 }
41 }
b177ca84 42
43 ## If we still don't have it, check for the name among the loaded
44 ## scripts.
45 if (not @off)
46 {
47 my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
48 for my $script ( @{$^H{charnames_scripts}} )
49 {
50 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
51 @off = ($-[0], $+[0]);
52 last;
53 }
54 }
423cee85 55 }
b177ca84 56
57 ## If we don't have it by now, give up.
423cee85 58 die "Unknown charname '$name'" unless @off;
b896c7a5 59
b177ca84 60 ##
61 ## Now know where in the string the name starts.
62 ## The code, in hex, is befor that.
63 ##
64 ## The code can be 4-6 characters long, so we've got to sort of
65 ## go look for it, just after the newline that comes before $off[0].
66 ##
67 ## This would be much easier if unicore/Name.pl had info in
68 ## a name/code order, instead of code/name order.
69 ##
70 ## The +1 after the rindex() is to skip past the newline we're finding,
71 ## or, if the rindex() fails, to put us to an offset of zero.
72 ##
73 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
74
75 ## we know where it starts, so turn into number - the ordinal for the char.
76 my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
77
d5448623 78 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 79 use bytes;
d41ff1b8 80 return chr $ord if $ord <= 255;
81 my $hex = sprintf '%X=0%o', $ord, $ord;
82 my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
83 die "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 84 }
bfa383d6 85 return pack "U", $ord;
423cee85 86}
87
b177ca84 88sub import
89{
90 shift; ## ignore class name
91
92 if (not @_)
93 {
94 carp("`use charnames' needs explicit imports list");
95 }
d5448623 96 $^H |= $charnames::hint_bits;
423cee85 97 $^H{charnames} = \&charnames ;
b177ca84 98
99 ##
100 ## fill %h keys with our @_ args.
101 ##
423cee85 102 my %h;
103 @h{@_} = (1) x @_;
b177ca84 104
423cee85 105 $^H{charnames_full} = delete $h{':full'};
106 $^H{charnames_short} = delete $h{':short'};
107 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84 108
109 ##
110 ## If utf8? warnings are enabled, and some scripts were given,
111 ## see if at least we can find one letter of each script.
112 ##
113 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
114 {
115 $txt = do "unicore/Name.pl" unless $txt;
116
117 for my $script (@{$^H{charnames_scripts}})
118 {
119 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
120 warnings::warn('utf8', "No such script: '$script'");
121 }
122 }
bd62941a 123 }
423cee85 124}
125
b177ca84 126sub viacode
127{
128 if (@_ != 1) {
daf0d493 129 carp "charnames::viacode() expects one numeric argument";
b177ca84 130 return ()
131 }
132 my $arg = shift;
133
134 my $hex;
135 if ($arg =~ m/^[0-9]+$/) {
136 $hex = sprintf "%04X", $arg;
137 } else {
138 carp("unexpected arg \"$arg\" to charnames::viacode()");
daf0d493 139 return;
b177ca84 140 }
141
142 $txt = do "unicore/Name.pl" unless $txt;
143
144 if ($txt =~ m/^$hex\t\t(.+)/m) {
145 return $1;
146 } else {
daf0d493 147 return;
148 }
149}
150
151sub vianame
152{
153 if (@_ != 1) {
154 carp "charnames::vianame() expects one name argument";
155 return ()
156 }
157
158 my $arg = shift;
159
160 $txt = do "unicore/Name.pl" unless $txt;
161
162 if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
163 return hex $1;
164 } else {
165 return;
b177ca84 166 }
167}
168
423cee85 169
1701;
171__END__
172
173=head1 NAME
174
b177ca84 175charnames - define character names for C<\N{named}> string literal escapes.
423cee85 176
177=head1 SYNOPSIS
178
179 use charnames ':full';
4a2d328f 180 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85 181
182 use charnames ':short';
4a2d328f 183 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85 184
185 use charnames qw(cyrillic greek);
4a2d328f 186 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 187
b177ca84 188 print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
daf0d493 189 printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 190
423cee85 191=head1 DESCRIPTION
192
193Pragma C<use charnames> supports arguments C<:full>, C<:short> and
194script names. If C<:full> is present, for expansion of
4a2d328f 195C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85 196standard Unicode names of chars. If C<:short> is present, and
197C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
198as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f 199with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85 200C<CHARNAME> is looked up as a letter in the given scripts (in the
201specified order).
202
203For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 204this pragma looks for the names
423cee85 205
206 SCRIPTNAME CAPITAL LETTER CHARNAME
207 SCRIPTNAME SMALL LETTER CHARNAME
208 SCRIPTNAME LETTER CHARNAME
209
210in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493 211then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
212is ignored.
213
214Note that C<\N{...}> is compile-time, it's a special form of string
215constant used inside double-quoted strings: in other words, you cannot
216used variables inside the C<\N{...}>. If you want similar run-time
217functionality, use charnames::vianame().
423cee85 218
219=head1 CUSTOM TRANSLATORS
220
d5448623 221The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 222hardwired into F<charnames.pm>. A module can install custom
d5448623 223translations (inside the scope which C<use>s the module) with the
423cee85 224following magic incantation:
225
d5448623 226 use charnames (); # for $charnames::hint_bits
227 sub import {
228 shift;
229 $^H |= $charnames::hint_bits;
230 $^H{charnames} = \&translator;
231 }
423cee85 232
233Here translator() is a subroutine which takes C<CHARNAME> as an
234argument, and returns text to insert into the string instead of the
4a2d328f 235C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623 236in C<bytes> mode and out of it, the function should check the current
237state of C<bytes>-flag as in:
238
239 use bytes (); # for $bytes::hint_bits
240 sub translator {
241 if ($^H & $bytes::hint_bits) {
242 return bytes_translator(@_);
243 }
244 else {
245 return utf8_translator(@_);
246 }
423cee85 247 }
423cee85 248
b177ca84 249=head1 charnames::viacode(code)
250
251Returns the full name of the character indicated by the numeric code.
252The example
253
254 print charnames::viacode(0x2722);
255
256prints "FOUR TEARDROP-SPOKED ASTERISK".
257
daf0d493 258Returns undef if no name is known for the code.
259
260This works only for the standard names, and does not yet aply
261to custom translators.
262
263=head1 charnames::vianame(code)
264
265Returns the code point indicated by the name.
266The example
267
268 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
269
270prints "2722".
271
272Returns undef if no name is known for the name.
b177ca84 273
274This works only for the standard names, and does not yet aply
275to custom translators.
276
423cee85 277=head1 BUGS
278
279Since evaluation of the translation function happens in a middle of
280compilation (of a string literal), the translation function should not
281do any C<eval>s or C<require>s. This restriction should be lifted in
282a future version of Perl.
283
284=cut