lib/charnames.pm

   1 package charnames;
   2 use strict;
   3 use warnings;
   4 use Carp;
   5 our $VERSION = '1.01';
   6
   7 use bytes ();           # for $bytes::hint_bits
   8 $charnames::hint_bits = 0x20000;
   9
  10 my $txt;
  11
  12 # This is not optimized in any way yet
  13 sub charnames
  14 {
  15   my $name = shift;
  16
  17   ## Suck in the code/name list as a big string.
  18   ## Lines look like:
  19   ##     "0052\t\tLATIN CAPITAL LETTER R\n"
  20   $txt = do "unicore/Name.pl" unless $txt;
  21
  22   ## @off will hold the index into the code/name string of the start and
  23   ## end of the name as we find it.
  24   my @off;
  25
  26   ## If :full, look for the the name exactly
  27   if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
  28     @off = ($-[0], $+[0]);
  29   }
  30
  31   ## If we didn't get above, and :short allowed, look for the short name.
  32   ## The short name is like "greek:Sigma"
  33   unless (@off) {
  34     if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
  35       my ($script, $cname) = ($1,$2);
  36       my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  37       if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
  38         @off = ($-[0], $+[0]);
  39       }
  40     }
  41   }
  42
  43   ## If we still don't have it, check for the name among the loaded
  44   ## scripts.
  45   if (not @off)
  46   {
  47       my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  48       for my $script ( @{$^H{charnames_scripts}} )
  49       {
  50           if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
  51               @off = ($-[0], $+[0]);
  52               last;
  53           }
  54       }
  55   }
  56
  57   ## If we don't have it by now, give up.
  58   die "Unknown charname '$name'" unless @off;
  59
  60   ##
  61   ## Now know where in the string the name starts.
  62   ## The code, in hex, is befor that.
  63   ##
  64   ## The code can be 4-6 characters long, so we've got to sort of
  65   ## go look for it, just after the newline that comes before $off[0].
  66   ##
  67   ## This would be much easier if unicore/Name.pl had info in
  68   ## a name/code order, instead of code/name order.
  69   ##
  70   ## The +1 after the rindex() is to skip past the newline we're finding,
  71   ## or, if the rindex() fails, to put us to an offset of zero.
  72   ##
  73   my $hexstart = rindex($txt, "\n", $off[0]) + 1;
  74
  75   ## we know where it starts, so turn into number - the ordinal for the char.
  76   my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
  77
  78   if ($^H & $bytes::hint_bits) {        # "use bytes" in effect?
  79     use bytes;
  80     return chr $ord if $ord <= 255;
  81     my $hex = sprintf '%X=0%o', $ord, $ord;
  82     my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
  83     die "Character 0x$hex with name '$fname' is above 0xFF";
  84   }
  85   return pack "U", $ord;
  86 }
  87
  88 sub import
  89 {
  90   shift; ## ignore class name
  91
  92   if (not @_)
  93   {
  94       carp("`use charnames' needs explicit imports list");
  95   }
  96   $^H |= $charnames::hint_bits;
  97   $^H{charnames} = \&charnames ;
  98
  99   ##
 100   ## fill %h keys with our @_ args.
 101   ##
 102   my %h;
 103   @h{@_} = (1) x @_;
 104
 105   $^H{charnames_full} = delete $h{':full'};
 106   $^H{charnames_short} = delete $h{':short'};
 107   $^H{charnames_scripts} = [map uc, keys %h];
 108
 109   ##
 110   ## If utf8? warnings are enabled, and some scripts were given,
 111   ## see if at least we can find one letter of each script.
 112   ##
 113   if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
 114   {
 115       $txt = do "unicore/Name.pl" unless $txt;
 116
 117       for my $script (@{$^H{charnames_scripts}})
 118       {
 119           if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
 120               warnings::warn('utf8',  "No such script: '$script'");
 121           }
 122       }
 123   }
 124 }
 125
 126 sub viacode
 127 {
 128     if (@_ != 1) {
 129         carp "charnames::viacode() expects one numeric value";
 130         return ()
 131     }
 132     my $arg = shift;
 133
 134     my $hex;
 135     if ($arg =~ m/^[0-9]+$/) {
 136         $hex = sprintf "%04X", $arg;
 137     } else {
 138         carp("unexpected arg \"$arg\" to charnames::viacode()");
 139         return ();
 140     }
 141
 142     $txt = do "unicore/Name.pl" unless $txt;
 143
 144     if ($txt =~ m/^$hex\t\t(.+)/m) {
 145         return $1;
 146     } else {
 147         return ();
 148     }
 149 }
 150
 151
 152 1;
 153 __END__
 154
 155 =head1 NAME
 156
 157 charnames - define character names for C<\N{named}> string literal escapes.
 158
 159 =head1 SYNOPSIS
 160
 161   use charnames ':full';
 162   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
 163
 164   use charnames ':short';
 165   print "\N{greek:Sigma} is an upper-case sigma.\n";
 166
 167   use charnames qw(cyrillic greek);
 168   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
 169
 170   print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
 171
 172 =head1 DESCRIPTION
 173
 174 Pragma C<use charnames> supports arguments C<:full>, C<:short> and
 175 script names.  If C<:full> is present, for expansion of
 176 C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
 177 standard Unicode names of chars.  If C<:short> is present, and
 178 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
 179 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
 180 with script name arguments, then for C<\N{CHARNAME}}> the name
 181 C<CHARNAME> is looked up as a letter in the given scripts (in the
 182 specified order).
 183
 184 For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
 185 this pragma looks for the names
 186
 187   SCRIPTNAME CAPITAL LETTER CHARNAME
 188   SCRIPTNAME SMALL LETTER CHARNAME
 189   SCRIPTNAME LETTER CHARNAME
 190
 191 in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
 192 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
 193 ignored.
 194
 195 =head1 CUSTOM TRANSLATORS
 196
 197 The mechanism of translation of C<\N{...}> escapes is general and not
 198 hardwired into F<charnames.pm>.  A module can install custom
 199 translations (inside the scope which C<use>s the module) with the
 200 following magic incantation:
 201
 202     use charnames ();           # for $charnames::hint_bits
 203     sub import {
 204         shift;
 205         $^H |= $charnames::hint_bits;
 206         $^H{charnames} = \&translator;
 207     }
 208
 209 Here translator() is a subroutine which takes C<CHARNAME> as an
 210 argument, and returns text to insert into the string instead of the
 211 C<\N{CHARNAME}> escape.  Since the text to insert should be different
 212 in C<bytes> mode and out of it, the function should check the current
 213 state of C<bytes>-flag as in:
 214
 215     use bytes ();                       # for $bytes::hint_bits
 216     sub translator {
 217         if ($^H & $bytes::hint_bits) {
 218             return bytes_translator(@_);
 219         }
 220         else {
 221             return utf8_translator(@_);
 222         }
 223     }
 224
 225 =head1 charnames::viacode(code)
 226
 227 Returns the full name of the character indicated by the numeric code.
 228 The example
 229
 230     print charnames::viacode(0x2722);
 231
 232 prints "FOUR TEARDROP-SPOKED ASTERISK".
 233
 234 Returns nothing if no name is known for the code.
 235
 236 This works only for the standard names, and does not yet aply
 237 to custom translators.
 238
 239 =head1 BUGS
 240
 241 Since evaluation of the translation function happens in a middle of
 242 compilation (of a string literal), the translation function should not
 243 do any C<eval>s or C<require>s.  This restriction should be lifted in
 244 a future version of Perl.
 245
 246 =cut