lib/charnames.pm

   1 package charnames;
   2 use bytes ();           # for $bytes::hint_bits
   3 $charnames::hint_bits = 0x20000;
   4
   5 my $fname = 'unicode/UnicodeData-Latest.txt';
   6 my $txt;
   7
   8 # This is not optimized in any way yet
   9 sub charnames {
  10   $name = shift;
  11   $txt = do "unicode/Name.pl" unless $txt;
  12   my @off;
  13   if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
  14     @off = ($-[0], $+[0]);
  15   }
  16   unless (@off) {
  17     if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
  18       my ($script, $cname) = ($1,$2);
  19       my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  20       if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
  21         @off = ($-[0], $+[0]);
  22       }
  23     }
  24   }
  25   unless (@off) {
  26     my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  27     for ( @{$^H{charnames_scripts}} ) {
  28       (@off = ($-[0], $+[0])), last
  29         if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
  30     }
  31   }
  32   die "Unknown charname '$name'" unless @off;
  33
  34   my $ord = hex substr $txt, $off[0] - 4, 4;
  35   if ($^H & $bytes::hint_bits) {        # "use bytes" in effect?
  36     use bytes;
  37     return chr $ord if $ord <= 255;
  38     my $hex = sprintf '%X=0%o', $ord, $ord;
  39     my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
  40     die "Character 0x$hex with name '$fname' is above 0xFF";
  41   }
  42   return chr $ord;
  43 }
  44
  45 sub import {
  46   shift;
  47   die "`use charnames' needs explicit imports list" unless @_;
  48   $^H |= $charnames::hint_bits;
  49   $^H{charnames} = \&charnames ;
  50   my %h;
  51   @h{@_} = (1) x @_;
  52   $^H{charnames_full} = delete $h{':full'};
  53   $^H{charnames_short} = delete $h{':short'};
  54   $^H{charnames_scripts} = [map uc, keys %h];
  55 }
  56
  57
  58 1;
  59 __END__
  60
  61 =head1 NAME
  62
  63 charnames - define character names for C<\N{named}> string literal escape.
  64
  65 =head1 SYNOPSIS
  66
  67   use charnames ':full';
  68   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
  69
  70   use charnames ':short';
  71   print "\N{greek:Sigma} is an upper-case sigma.\n";
  72
  73   use charnames qw(cyrillic greek);
  74   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
  75
  76 =head1 DESCRIPTION
  77
  78 Pragma C<use charnames> supports arguments C<:full>, C<:short> and
  79 script names.  If C<:full> is present, for expansion of
  80 C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
  81 standard Unicode names of chars.  If C<:short> is present, and
  82 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
  83 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
  84 with script name arguments, then for C<\N{CHARNAME}}> the name
  85 C<CHARNAME> is looked up as a letter in the given scripts (in the
  86 specified order).
  87
  88 For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
  89 this pragma looks for the names
  90
  91   SCRIPTNAME CAPITAL LETTER CHARNAME
  92   SCRIPTNAME SMALL LETTER CHARNAME
  93   SCRIPTNAME LETTER CHARNAME
  94
  95 in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
  96 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
  97 ignored.
  98
  99 =head1 CUSTOM TRANSLATORS
 100
 101 The mechanism of translation of C<\N{...}> escapes is general and not
 102 hardwired into F<charnames.pm>.  A module can install custom
 103 translations (inside the scope which C<use>s the module) with the
 104 following magic incantation:
 105
 106     use charnames ();           # for $charnames::hint_bits
 107     sub import {
 108         shift;
 109         $^H |= $charnames::hint_bits;
 110         $^H{charnames} = \&translator;
 111     }
 112
 113 Here translator() is a subroutine which takes C<CHARNAME> as an
 114 argument, and returns text to insert into the string instead of the
 115 C<\N{CHARNAME}> escape.  Since the text to insert should be different
 116 in C<bytes> mode and out of it, the function should check the current
 117 state of C<bytes>-flag as in:
 118
 119     use bytes ();                       # for $bytes::hint_bits
 120     sub translator {
 121         if ($^H & $bytes::hint_bits) {
 122             return bytes_translator(@_);
 123         }
 124         else {
 125             return utf8_translator(@_);
 126         }
 127     }
 128
 129 =head1 BUGS
 130
 131 Since evaluation of the translation function happens in a middle of
 132 compilation (of a string literal), the translation function should not
 133 do any C<eval>s or C<require>s.  This restriction should be lifted in
 134 a future version of Perl.
 135
 136 =cut