lib/charnames.pm

   1 package charnames;
   2 use bytes ();           # for $bytes::hint_bits
   3 use warnings();
   4 $charnames::hint_bits = 0x20000;
   5
   6 my $txt;
   7
   8 # This is not optimized in any way yet
   9 sub charnames {
  10   $name = shift;
  11   $txt = do "unicode/Name.pl" unless $txt;
  12   my @off;
  13   if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
  14     @off = ($-[0], $+[0]);
  15   }
  16   unless (@off) {
  17     if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
  18       my ($script, $cname) = ($1,$2);
  19       my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  20       if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
  21         @off = ($-[0], $+[0]);
  22       }
  23     }
  24   }
  25   unless (@off) {
  26     my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  27     for ( @{$^H{charnames_scripts}} ) {
  28       (@off = ($-[0], $+[0])), last
  29         if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
  30     }
  31   }
  32   die "Unknown charname '$name'" unless @off;
  33
  34   my $ord = hex substr $txt, $off[0] - 4, 4;
  35   if ($^H & $bytes::hint_bits) {        # "use bytes" in effect?
  36     use bytes;
  37     return chr $ord if $ord <= 255;
  38     my $hex = sprintf '%X=0%o', $ord, $ord;
  39     my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
  40     die "Character 0x$hex with name '$fname' is above 0xFF";
  41   }
  42   return chr $ord;
  43 }
  44
  45 sub import {
  46   shift;
  47   die "`use charnames' needs explicit imports list" unless @_;
  48   $^H |= $charnames::hint_bits;
  49   $^H{charnames} = \&charnames ;
  50   my %h;
  51   @h{@_} = (1) x @_;
  52   $^H{charnames_full} = delete $h{':full'};
  53   $^H{charnames_short} = delete $h{':short'};
  54   $^H{charnames_scripts} = [map uc, keys %h];
  55   if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
  56         $txt = do "unicode/Name.pl" unless $txt;
  57     for (@{$^H{charnames_scripts}}) {
  58         warnings::warn('utf8',  "No such script: '$_'") unless
  59             $txt =~ m/\t\t$_ (?:CAPITAL |SMALL )?LETTER /;
  60         }
  61   }
  62 }
  63
  64
  65 1;
  66 __END__
  67
  68 =head1 NAME
  69
  70 charnames - define character names for C<\N{named}> string literal escape.
  71
  72 =head1 SYNOPSIS
  73
  74   use charnames ':full';
  75   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
  76
  77   use charnames ':short';
  78   print "\N{greek:Sigma} is an upper-case sigma.\n";
  79
  80   use charnames qw(cyrillic greek);
  81   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
  82
  83 =head1 DESCRIPTION
  84
  85 Pragma C<use charnames> supports arguments C<:full>, C<:short> and
  86 script names.  If C<:full> is present, for expansion of
  87 C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
  88 standard Unicode names of chars.  If C<:short> is present, and
  89 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
  90 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
  91 with script name arguments, then for C<\N{CHARNAME}}> the name
  92 C<CHARNAME> is looked up as a letter in the given scripts (in the
  93 specified order).
  94
  95 For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
  96 this pragma looks for the names
  97
  98   SCRIPTNAME CAPITAL LETTER CHARNAME
  99   SCRIPTNAME SMALL LETTER CHARNAME
 100   SCRIPTNAME LETTER CHARNAME
 101
 102 in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
 103 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
 104 ignored.
 105
 106 =head1 CUSTOM TRANSLATORS
 107
 108 The mechanism of translation of C<\N{...}> escapes is general and not
 109 hardwired into F<charnames.pm>.  A module can install custom
 110 translations (inside the scope which C<use>s the module) with the
 111 following magic incantation:
 112
 113     use charnames ();           # for $charnames::hint_bits
 114     sub import {
 115         shift;
 116         $^H |= $charnames::hint_bits;
 117         $^H{charnames} = \&translator;
 118     }
 119
 120 Here translator() is a subroutine which takes C<CHARNAME> as an
 121 argument, and returns text to insert into the string instead of the
 122 C<\N{CHARNAME}> escape.  Since the text to insert should be different
 123 in C<bytes> mode and out of it, the function should check the current
 124 state of C<bytes>-flag as in:
 125
 126     use bytes ();                       # for $bytes::hint_bits
 127     sub translator {
 128         if ($^H & $bytes::hint_bits) {
 129             return bytes_translator(@_);
 130         }
 131         else {
 132             return utf8_translator(@_);
 133         }
 134     }
 135
 136 =head1 BUGS
 137
 138 Since evaluation of the translation function happens in a middle of
 139 compilation (of a string literal), the translation function should not
 140 do any C<eval>s or C<require>s.  This restriction should be lifted in
 141 a future version of Perl.
 142
 143 =cut