lib/charnames.pm

   1 package charnames;
   2
   3 our $VERSION = '1.00';
   4
   5 use bytes ();           # for $bytes::hint_bits
   6 use warnings();
   7 $charnames::hint_bits = 0x20000;
   8
   9 my $txt;
  10
  11 # This is not optimized in any way yet
  12 sub charnames {
  13   $name = shift;
  14   $txt = do "unicode/Name.pl" unless $txt;
  15   my @off;
  16   if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
  17     @off = ($-[0], $+[0]);
  18   }
  19   unless (@off) {
  20     if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
  21       my ($script, $cname) = ($1,$2);
  22       my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  23       if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
  24         @off = ($-[0], $+[0]);
  25       }
  26     }
  27   }
  28   unless (@off) {
  29     my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  30     for ( @{$^H{charnames_scripts}} ) {
  31       (@off = ($-[0], $+[0])), last
  32         if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
  33     }
  34   }
  35   die "Unknown charname '$name'" unless @off;
  36
  37   my $hexlen = 0;
  38   $hexlen++ while substr($txt, $off[0] - $hexlen - 1, 1) =~ /[0-9a-f]/;
  39   my $ord = hex substr $txt, $off[0] - $hexlen, $hexlen;
  40   if ($^H & $bytes::hint_bits) {        # "use bytes" in effect?
  41     use bytes;
  42     return chr $ord if $ord <= 255;
  43     my $hex = sprintf '%X=0%o', $ord, $ord;
  44     my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
  45     die "Character 0x$hex with name '$fname' is above 0xFF";
  46   }
  47   return chr $ord;
  48 }
  49
  50 sub import {
  51   shift;
  52   die "`use charnames' needs explicit imports list" unless @_;
  53   $^H |= $charnames::hint_bits;
  54   $^H{charnames} = \&charnames ;
  55   my %h;
  56   @h{@_} = (1) x @_;
  57   $^H{charnames_full} = delete $h{':full'};
  58   $^H{charnames_short} = delete $h{':short'};
  59   $^H{charnames_scripts} = [map uc, keys %h];
  60   if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
  61         $txt = do "unicode/Name.pl" unless $txt;
  62     for (@{$^H{charnames_scripts}}) {
  63         warnings::warn('utf8',  "No such script: '$_'") unless
  64             $txt =~ m/\t\t$_ (?:CAPITAL |SMALL )?LETTER /;
  65         }
  66   }
  67 }
  68
  69
  70 1;
  71 __END__
  72
  73 =head1 NAME
  74
  75 charnames - define character names for C<\N{named}> string literal escape.
  76
  77 =head1 SYNOPSIS
  78
  79   use charnames ':full';
  80   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
  81
  82   use charnames ':short';
  83   print "\N{greek:Sigma} is an upper-case sigma.\n";
  84
  85   use charnames qw(cyrillic greek);
  86   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
  87
  88 =head1 DESCRIPTION
  89
  90 Pragma C<use charnames> supports arguments C<:full>, C<:short> and
  91 script names.  If C<:full> is present, for expansion of
  92 C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
  93 standard Unicode names of chars.  If C<:short> is present, and
  94 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
  95 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
  96 with script name arguments, then for C<\N{CHARNAME}}> the name
  97 C<CHARNAME> is looked up as a letter in the given scripts (in the
  98 specified order).
  99
 100 For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
 101 this pragma looks for the names
 102
 103   SCRIPTNAME CAPITAL LETTER CHARNAME
 104   SCRIPTNAME SMALL LETTER CHARNAME
 105   SCRIPTNAME LETTER CHARNAME
 106
 107 in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
 108 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
 109 ignored.
 110
 111 =head1 CUSTOM TRANSLATORS
 112
 113 The mechanism of translation of C<\N{...}> escapes is general and not
 114 hardwired into F<charnames.pm>.  A module can install custom
 115 translations (inside the scope which C<use>s the module) with the
 116 following magic incantation:
 117
 118     use charnames ();           # for $charnames::hint_bits
 119     sub import {
 120         shift;
 121         $^H |= $charnames::hint_bits;
 122         $^H{charnames} = \&translator;
 123     }
 124
 125 Here translator() is a subroutine which takes C<CHARNAME> as an
 126 argument, and returns text to insert into the string instead of the
 127 C<\N{CHARNAME}> escape.  Since the text to insert should be different
 128 in C<bytes> mode and out of it, the function should check the current
 129 state of C<bytes>-flag as in:
 130
 131     use bytes ();                       # for $bytes::hint_bits
 132     sub translator {
 133         if ($^H & $bytes::hint_bits) {
 134             return bytes_translator(@_);
 135         }
 136         else {
 137             return utf8_translator(@_);
 138         }
 139     }
 140
 141 =head1 BUGS
 142
 143 Since evaluation of the translation function happens in a middle of
 144 compilation (of a string literal), the translation function should not
 145 do any C<eval>s or C<require>s.  This restriction should be lifted in
 146 a future version of Perl.
 147
 148 =cut