[p5sagit/p5-mst-13.2.git] / lib / charnames.pm

package charnames;
use strict;
use warnings;
use Carp;
our $VERSION = '1.02';

use bytes ();		# for $bytes::hint_bits
$charnames::hint_bits = 0x20000;

my %alias1 = (
		# Icky 3.2 names with parentheses.
		'LINE FEED'		=> 'LINE FEED (LF)',
		'FORM FEED'		=> 'FORM FEED (FF)',
		'CARRIAGE RETURN'	=> 'CARRIAGE RETURN (CR)',
		'NEXT LINE'		=> 'NEXT LINE (NEL)',
		# Convenience.
		'LF'			=> 'LINE FEED (LF)',
		'FF'			=> 'FORM FEED (FF)',
		'CR'			=> 'CARRIAGE RETURN (CR)',
		'NEL'			=> 'NEXT LINE (NEL)',
	        # More convenience.  For futher convencience,
	        # it is suggested some way using using the NamesList
		# aliases is implemented.
	        'ZWNJ'			=> 'ZERO WIDTH NON-JOINER',
	        'ZWJ'			=> 'ZERO WIDTH JOINER',
		'BOM'			=> 'BYTE ORDER MARK',
	    );

my %alias2 = (
		# Pre-3.2 compatibility (only for the first 256 characters).
		'HORIZONTAL TABULATION'	=> 'CHARACTER TABULATION',
		'VERTICAL TABULATION'	=> 'LINE TABULATION',
		'FILE SEPARATOR'	=> 'INFORMATION SEPARATOR FOUR',
		'GROUP SEPARATOR'	=> 'INFORMATION SEPARATOR THREE',
		'RECORD SEPARATOR'	=> 'INFORMATION SEPARATOR TWO',
		'UNIT SEPARATOR'	=> 'INFORMATION SEPARATOR ONE',
		'PARTIAL LINE DOWN'	=> 'PARTIAL LINE FORWARD',
		'PARTIAL LINE UP'	=> 'PARTIAL LINE BACKWARD',
	    );

my %alias3 = (
		# User defined aliasses. Even more convenient :)
	    );
my $txt;

sub alias (@)
{
  @_ or return %alias3;
  my $alias = ref $_[0] ? $_[0] : { @_ };
  @alias3{keys %$alias} = values %$alias;
} # alias

sub alias_file ($)
{
  my $arg = shift;
  my $file = -f $arg ? $arg : "unicore/${arg}_alias.pl";
  if (my @alias = do $file) {
    alias (@alias);
    return (1);
  }
  0;
} # alias_file

# This is not optimized in any way yet
sub charnames
{
  my $name = shift;

  if (exists $alias1{$name}) {
    $name = $alias1{$name};
  }
  elsif (exists $alias2{$name}) {
    require warnings;
    warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
    $name = $alias2{$name};
  }
  elsif (exists $alias3{$name}) {
    $name = $alias3{$name};
  }

  my $ord;
  my @off;
  my $fname;

  if ($name eq "BYTE ORDER MARK") {
    $fname = $name;
    $ord = 0xFEFF;
  } else {
    ## Suck in the code/name list as a big string.
    ## Lines look like:
    ##     "0052\t\tLATIN CAPITAL LETTER R\n"
    $txt = do "unicore/Name.pl" unless $txt;

    ## @off will hold the index into the code/name string of the start and
    ## end of the name as we find it.

    ## If :full, look for the the name exactly
    if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
      @off = ($-[0], $+[0]);
    }

    ## If we didn't get above, and :short allowed, look for the short name.
    ## The short name is like "greek:Sigma"
    unless (@off) {
      if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
	my ($script, $cname) = ($1, $2);
	my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
	if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
	  @off = ($-[0], $+[0]);
	}
      }
    }

    ## If we still don't have it, check for the name among the loaded
    ## scripts.
    if (not @off) {
      my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
      for my $script (@{$^H{charnames_scripts}}) {
	if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
	  @off = ($-[0], $+[0]);
	  last;
	}
      }
    }

    ## If we don't have it by now, give up.
    unless (@off) {
      carp "Unknown charname '$name'";
      return "\x{FFFD}";
    }

    ##
    ## Now know where in the string the name starts.
    ## The code, in hex, is before that.
    ##
    ## The code can be 4-6 characters long, so we've got to sort of
    ## go look for it, just after the newline that comes before $off[0].
    ##
    ## This would be much easier if unicore/Name.pl had info in
    ## a name/code order, instead of code/name order.
    ##
    ## The +1 after the rindex() is to skip past the newline we're finding,
    ## or, if the rindex() fails, to put us to an offset of zero.
    ##
    my $hexstart = rindex($txt, "\n", $off[0]) + 1;

    ## we know where it starts, so turn into number -
    ## the ordinal for the char.
    $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
  }

  if ($^H & $bytes::hint_bits) {	# "use bytes" in effect?
    use bytes;
    return chr $ord if $ord <= 255;
    my $hex = sprintf "%04x", $ord;
    if (not defined $fname) {
      $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
    }
    croak "Character 0x$hex with name '$fname' is above 0xFF";
  }

  no warnings 'utf8'; # allow even illegal characters
  return pack "U", $ord;
} # charnames

sub import
{
  shift; ## ignore class name

  if (not @_) {
    carp("`use charnames' needs explicit imports list");
  }
  $^H |= $charnames::hint_bits;
  $^H{charnames} = \&charnames ;

  ##
  ## fill %h keys with our @_ args.
  ##
  my ($promote, %h, @args) = (0);
  while (@_ and $_ = shift) {
    if ($_ eq ":alias" && @_) {
      my $alias = shift;
      if (ref $alias) {
	ref $alias eq "HASH" or
	  die "Only HASH reference supported as argument to :alias";
	alias ($alias);
	next;
      }
      if ($alias =~ m{:(\w+)$} and $1 ne "full" && $1 ne "short") {
	alias_file ($1) and $promote = 1, next;
      }
      alias_file ($alias) and next;
    }
    push @args, $_;
  }
  @args == 0 && $promote and @args = (":full");
  @h{@args} = (1) x @args;

  $^H{charnames_full} = delete $h{':full'};
  $^H{charnames_short} = delete $h{':short'};
  $^H{charnames_scripts} = [map uc, keys %h];

  ##
  ## If utf8? warnings are enabled, and some scripts were given,
  ## see if at least we can find one letter of each script.
  ##
  if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
    $txt = do "unicore/Name.pl" unless $txt;

    for my $script (@{$^H{charnames_scripts}}) {
      if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
	warnings::warn('utf8',  "No such script: '$script'");
      }
    }
  }
} # import

require Unicode::UCD; # for Unicode::UCD::_getcode()

my %viacode;

sub viacode
{
  if (@_ != 1) {
    carp "charnames::viacode() expects one argument";
    return ()
  }

  my $arg = shift;
  my $code = Unicode::UCD::_getcode($arg);

  my $hex;

  if (defined $code) {
    $hex = sprintf "%04X", $arg;
  } else {
    carp("unexpected arg \"$arg\" to charnames::viacode()");
    return;
  }

  if ($code > 0x10FFFF) {
    carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
    return;
  }

  return $viacode{$hex} if exists $viacode{$hex};

  $txt = do "unicore/Name.pl" unless $txt;

  if ($txt =~ m/^$hex\t\t(.+)/m) {
    return $viacode{$hex} = $1;
  } else {
    return;
  }
} # viacode

my %vianame;

sub vianame
{
  if (@_ != 1) {
    carp "charnames::vianame() expects one name argument";
    return ()
  }

  my $arg = shift;

  return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;

  return $vianame{$arg} if exists $vianame{$arg};

  $txt = do "unicore/Name.pl" unless $txt;

  my $pos = index $txt, "\t\t$arg\n";
  if ($[ <= $pos) {
    my $posLF = rindex $txt, "\n", $pos;
    (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
    return $vianame{$arg} = hex $code;

    # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
    # then $posLF + 1 equals to $[ (at the beginning of $txt).
    # Otherwise $posLF is the position of "\n";
    # then $posLF + 1 must be the position of the next to "\n"
    # (the beginning of the line).
    # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
    # "10300\t", "100000", etc. So we can get the code via removing TAB.
  } else {
    return;
  }
} # vianame


1;
__END__

=head1 NAME

charnames - define character names for C<\N{named}> string literal escapes

=head1 SYNOPSIS

  use charnames ':full';
  print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";

  use charnames ':short';
  print "\N{greek:Sigma} is an upper-case sigma.\n";

  use charnames qw(cyrillic greek);
  print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";

  use charnames ":full", ":alias" => {
    e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
    };
  print "\N{e_ACUTE} is a small letter e with an acute.\n";

  print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
  printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"

=head1 DESCRIPTION

Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
names and customized aliases.  If C<:full> is present, for expansion of
C<\N{CHARNAME}> string C<CHARNAME> is first looked in the list of
standard Unicode names of chars.  If C<:short> is present, and
C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
with script name arguments, then for C<\N{CHARNAME}> the name
C<CHARNAME> is looked up as a letter in the given scripts (in the
specified order). Customized aliases are explained in L</CUSTOM ALIASES>.

For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
this pragma looks for the names

  SCRIPTNAME CAPITAL LETTER CHARNAME
  SCRIPTNAME SMALL LETTER CHARNAME
  SCRIPTNAME LETTER CHARNAME

in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
is ignored.

Note that C<\N{...}> is compile-time, it's a special form of string
constant used inside double-quoted strings: in other words, you cannot
use variables inside the C<\N{...}>.  If you want similar run-time
functionality, use charnames::vianame().

For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
as of Unicode 3.1, there are no official Unicode names but you can use
instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).  In
Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
has been updated, see L</ALIASES>.  Also note that the U+UU80, U+0081,
U+0084, and U+0099 do not have names even in ISO 6429.

Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
is the Unicode smiley face, or "\N{WHITE SMILING FACE}".

=head1 CUSTOM TRANSLATORS

The mechanism of translation of C<\N{...}> escapes is general and not
hardwired into F<charnames.pm>.  A module can install custom
translations (inside the scope which C<use>s the module) with the
following magic incantation:

    use charnames ();		# for $charnames::hint_bits
    sub import {
	shift;
	$^H |= $charnames::hint_bits;
	$^H{charnames} = \&translator;
    }

Here translator() is a subroutine which takes C<CHARNAME> as an
argument, and returns text to insert into the string instead of the
C<\N{CHARNAME}> escape.  Since the text to insert should be different
in C<bytes> mode and out of it, the function should check the current
state of C<bytes>-flag as in:

    use bytes ();			# for $bytes::hint_bits
    sub translator {
	if ($^H & $bytes::hint_bits) {
	    return bytes_translator(@_);
	}
	else {
	    return utf8_translator(@_);
	}
    }

=head1 CUSTOM ALIASES

This version of charnames supports three mechanisms of adding local
or customized aliases to standard Unicode naming conventions (:full)

=head2 Anonymous hashes

    use charnames ":full", ":alias" => {
        e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
        };
    my $str = "\N{e_ACUTE}";

=head2 Alias file

    use charnames ":full", ":alias" => "pro";

    will try to read "unicore/pro_alias.pl" from the @INC path. This
    file should return a list in plain perl:

    (
    A_GRAVE         => "LATIN CAPITAL LETTER A WITH GRAVE",
    A_CIRCUM        => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
    A_DIAERES       => "LATIN CAPITAL LETTER A WITH DIAERESIS",
    A_TILDE         => "LATIN CAPITAL LETTER A WITH TILDE",
    A_BREVE         => "LATIN CAPITAL LETTER A WITH BREVE",
    A_RING          => "LATIN CAPITAL LETTER A WITH RING ABOVE",
    A_MACRON        => "LATIN CAPITAL LETTER A WITH MACRON",
    );

=head2 Alias shortcut

    use charnames ":alias" => ":pro";

    works exactly the same as the alias pairs, only this time,
    ":full" is inserted automatically as first argument (if no
    other argument is given).

=head1 charnames::viacode(code)

Returns the full name of the character indicated by the numeric code.
The example

    print charnames::viacode(0x2722);

prints "FOUR TEARDROP-SPOKED ASTERISK".

Returns undef if no name is known for the code.

This works only for the standard names, and does not yet apply
to custom translators.

Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
SPACE", not "BYTE ORDER MARK".

=head1 charnames::vianame(name)

Returns the code point indicated by the name.
The example

    printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");

prints "2722".

Returns undef if the name is unknown.

This works only for the standard names, and does not yet apply
to custom translators.

=head1 ALIASES

A few aliases have been defined for convenience: instead of having
to use the official names

    LINE FEED (LF)
    FORM FEED (FF)
    CARRIAGE RETURN (CR)
    NEXT LINE (NEL)

(yes, with parentheses) one can use

    LINE FEED
    FORM FEED
    CARRIAGE RETURN
    NEXT LINE
    LF
    FF
    CR
    NEL

One can also use

    BYTE ORDER MARK
    BOM

and

    ZWNJ
    ZWJ

for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.

For backward compatibility one can use the old names for
certain C0 and C1 controls

    old                         new

    HORIZONTAL TABULATION       CHARACTER TABULATION
    VERTICAL TABULATION         LINE TABULATION
    FILE SEPARATOR              INFORMATION SEPARATOR FOUR
    GROUP SEPARATOR             INFORMATION SEPARATOR THREE
    RECORD SEPARATOR            INFORMATION SEPARATOR TWO
    UNIT SEPARATOR              INFORMATION SEPARATOR ONE
    PARTIAL LINE DOWN           PARTIAL LINE FORWARD
    PARTIAL LINE UP             PARTIAL LINE BACKWARD

but the old names in addition to giving the character
will also give a warning about being deprecated.

=head1 ILLEGAL CHARACTERS

If you ask by name for a character that does not exist, a warning is
given and the Unicode I<replacement character> "\x{FFFD}" is returned.

If you ask by code for a character that does not exist, no warning is
given and C<undef> is returned.  (Though if you ask for a code point
past U+10FFFF you do get a warning.)

=head1 BUGS

Since evaluation of the translation function happens in a middle of
compilation (of a string literal), the translation function should not
do any C<eval>s or C<require>s.  This restriction should be lifted in
a future version of Perl.

=cut
Commit	Line	Data
423cee85	1	package charnames;
b177ca84	2	use strict;
	3	use warnings;
	4	use Carp;
35c0985d	5	our $VERSION = '1.02';
b75c8c73	6
d5448623	7	use bytes (); # for $bytes::hint_bits
d5448623	8	$charnames::hint_bits = 0x20000;
423cee85	9
52ea3e69	10	my %alias1 = (
	11	# Icky 3.2 names with parentheses.
	12	'LINE FEED' => 'LINE FEED (LF)',
	13	'FORM FEED' => 'FORM FEED (FF)',
	14	'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
	15	'NEXT LINE' => 'NEXT LINE (NEL)',
	16	# Convenience.
	17	'LF' => 'LINE FEED (LF)',
	18	'FF' => 'FORM FEED (FF)',
eb380778	19	'CR' => 'CARRIAGE RETURN (CR)',
51e9e896	20	'NEL' => 'NEXT LINE (NEL)',
24b5d5cc	21	# More convenience. For futher convencience,
	22	# it is suggested some way using using the NamesList
	23	# aliases is implemented.
	24	'ZWNJ' => 'ZERO WIDTH NON-JOINER',
	25	'ZWJ' => 'ZERO WIDTH JOINER',
52ea3e69	26	'BOM' => 'BYTE ORDER MARK',
	27	);
	28
	29	my %alias2 = (
	30	# Pre-3.2 compatibility (only for the first 256 characters).
	31	'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
	32	'VERTICAL TABULATION' => 'LINE TABULATION',
	33	'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
	34	'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
	35	'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
	36	'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
	37	'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
	38	'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
	39	);
	40
35c0985d	41	my %alias3 = (
	42	# User defined aliasses. Even more convenient :)
	43	);
423cee85	44	my $txt;
423cee85	45
35c0985d	46	sub alias (@)
	47	{
	48	@_ or return %alias3;
	49	my $alias = ref $_[0] ? $_[0] : { @_ };
	50	@alias3{keys %$alias} = values %$alias;
	51	} # alias
	52
	53	sub alias_file ($)
	54	{
	55	my $arg = shift;
	56	my $file = -f $arg ? $arg : "unicore/${arg}_alias.pl";
	57	if (my @alias = do $file) {
	58	alias (@alias);
	59	return (1);
	60	}
	61	0;
	62	} # alias_file
	63
423cee85	64	# This is not optimized in any way yet
b177ca84	65	sub charnames
	66	{
	67	my $name = shift;
	68
52ea3e69	69	if (exists $alias1{$name}) {
35c0985d	70	$name = $alias1{$name};
52ea3e69	71	}
35c0985d	72	elsif (exists $alias2{$name}) {
	73	require warnings;
	74	warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
	75	$name = $alias2{$name};
	76	}
	77	elsif (exists $alias3{$name}) {
	78	$name = $alias3{$name};
52ea3e69	79	}
b177ca84	80
52ea3e69	81	my $ord;
423cee85	82	my @off;
52ea3e69	83	my $fname;
	84
	85	if ($name eq "BYTE ORDER MARK") {
35c0985d	86	$fname = $name;
35c0985d	87	$ord = 0xFEFF;
52ea3e69	88	} else {
35c0985d	89	## Suck in the code/name list as a big string.
	90	## Lines look like:
	91	## "0052\t\tLATIN CAPITAL LETTER R\n"
	92	$txt = do "unicore/Name.pl" unless $txt;
	93
	94	## @off will hold the index into the code/name string of the start and
	95	## end of the name as we find it.
	96
	97	## If :full, look for the the name exactly
	98	if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
	99	@off = ($-[0], $+[0]);
	100	}
	101
	102	## If we didn't get above, and :short allowed, look for the short name.
	103	## The short name is like "greek:Sigma"
	104	unless (@off) {
	105	if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
	106	my ($script, $cname) = ($1, $2);
	107	my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
	108	if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
52ea3e69	109	@off = ($-[0], $+[0]);
35c0985d	110	}
423cee85	111	}
35c0985d	112	}
b177ca84	113
35c0985d	114	## If we still don't have it, check for the name among the loaded
	115	## scripts.
	116	if (not @off) {
	117	my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
	118	for my $script (@{$^H{charnames_scripts}}) {
	119	if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
	120	@off = ($-[0], $+[0]);
	121	last;
	122	}
52ea3e69	123	}
35c0985d	124	}
	125
	126	## If we don't have it by now, give up.
	127	unless (@off) {
	128	carp "Unknown charname '$name'";
	129	return "\x{FFFD}";
	130	}
	131
	132	##
	133	## Now know where in the string the name starts.
	134	## The code, in hex, is before that.
	135	##
	136	## The code can be 4-6 characters long, so we've got to sort of
	137	## go look for it, just after the newline that comes before $off[0].
	138	##
	139	## This would be much easier if unicore/Name.pl had info in
	140	## a name/code order, instead of code/name order.
	141	##
	142	## The +1 after the rindex() is to skip past the newline we're finding,
	143	## or, if the rindex() fails, to put us to an offset of zero.
	144	##
	145	my $hexstart = rindex($txt, "\n", $off[0]) + 1;
	146
	147	## we know where it starts, so turn into number -
	148	## the ordinal for the char.
	149	$ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
423cee85	150	}
b177ca84	151
d5448623	152	if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab	153	use bytes;
d41ff1b8	154	return chr $ord if $ord <= 255;
f0175764	155	my $hex = sprintf "%04x", $ord;
52ea3e69	156	if (not defined $fname) {
35c0985d	157	$fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
52ea3e69	158	}
f0175764	159	croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85	160	}
f0175764	161
52ea3e69	162	no warnings 'utf8'; # allow even illegal characters
bfa383d6	163	return pack "U", $ord;
35c0985d	164	} # charnames
423cee85	165
b177ca84	166	sub import
	167	{
	168	shift; ## ignore class name
	169
35c0985d	170	if (not @_) {
35c0985d	171	carp("`use charnames' needs explicit imports list");
b177ca84	172	}
d5448623	173	$^H \|= $charnames::hint_bits;
423cee85	174	$^H{charnames} = \&charnames ;
b177ca84	175
	176	##
	177	## fill %h keys with our @_ args.
	178	##
35c0985d	179	my ($promote, %h, @args) = (0);
	180	while (@_ and $_ = shift) {
	181	if ($_ eq ":alias" && @_) {
	182	my $alias = shift;
	183	if (ref $alias) {
	184	ref $alias eq "HASH" or
	185	die "Only HASH reference supported as argument to :alias";
	186	alias ($alias);
	187	next;
	188	}
	189	if ($alias =~ m{:(\w+)$} and $1 ne "full" && $1 ne "short") {
	190	alias_file ($1) and $promote = 1, next;
	191	}
	192	alias_file ($alias) and next;
	193	}
	194	push @args, $_;
	195	}
	196	@args == 0 && $promote and @args = (":full");
	197	@h{@args} = (1) x @args;
b177ca84	198
423cee85	199	$^H{charnames_full} = delete $h{':full'};
	200	$^H{charnames_short} = delete $h{':short'};
	201	$^H{charnames_scripts} = [map uc, keys %h];
b177ca84	202
	203	##
	204	## If utf8? warnings are enabled, and some scripts were given,
	205	## see if at least we can find one letter of each script.
	206	##
35c0985d	207	if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
	208	$txt = do "unicore/Name.pl" unless $txt;
	209
	210	for my $script (@{$^H{charnames_scripts}}) {
	211	if (not $txt =~ m/\t\t$script (?:CAPITAL \|SMALL )?LETTER /) {
	212	warnings::warn('utf8', "No such script: '$script'");
b177ca84	213	}
35c0985d	214	}
bd62941a	215	}
35c0985d	216	} # import
423cee85	217
f0175764	218	require Unicode::UCD; # for Unicode::UCD::_getcode()
f0175764	219
4e2cda5d	220	my %viacode;
4e2cda5d	221
b177ca84	222	sub viacode
b177ca84	223	{
35c0985d	224	if (@_ != 1) {
	225	carp "charnames::viacode() expects one argument";
	226	return ()
	227	}
f0175764	228
35c0985d	229	my $arg = shift;
35c0985d	230	my $code = Unicode::UCD::_getcode($arg);
b177ca84	231
35c0985d	232	my $hex;
f0175764	233
35c0985d	234	if (defined $code) {
	235	$hex = sprintf "%04X", $arg;
	236	} else {
	237	carp("unexpected arg \"$arg\" to charnames::viacode()");
	238	return;
	239	}
b177ca84	240
35c0985d	241	if ($code > 0x10FFFF) {
	242	carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
	243	return;
	244	}
f0175764	245
35c0985d	246	return $viacode{$hex} if exists $viacode{$hex};
4e2cda5d	247
35c0985d	248	$txt = do "unicore/Name.pl" unless $txt;
b177ca84	249
35c0985d	250	if ($txt =~ m/^$hex\t\t(.+)/m) {
	251	return $viacode{$hex} = $1;
	252	} else {
	253	return;
	254	}
	255	} # viacode
daf0d493	256
4e2cda5d	257	my %vianame;
4e2cda5d	258
daf0d493	259	sub vianame
daf0d493	260	{
35c0985d	261	if (@_ != 1) {
	262	carp "charnames::vianame() expects one name argument";
	263	return ()
	264	}
daf0d493	265
35c0985d	266	my $arg = shift;
daf0d493	267
35c0985d	268	return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
dbc0d4f2	269
35c0985d	270	return $vianame{$arg} if exists $vianame{$arg};
4e2cda5d	271
35c0985d	272	$txt = do "unicore/Name.pl" unless $txt;
daf0d493	273
35c0985d	274	my $pos = index $txt, "\t\t$arg\n";
	275	if ($[ <= $pos) {
	276	my $posLF = rindex $txt, "\n", $pos;
	277	(my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
	278	return $vianame{$arg} = hex $code;
	279
	280	# If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
	281	# then $posLF + 1 equals to $[ (at the beginning of $txt).
	282	# Otherwise $posLF is the position of "\n";
	283	# then $posLF + 1 must be the position of the next to "\n"
	284	# (the beginning of the line).
	285	# substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
	286	# "10300\t", "100000", etc. So we can get the code via removing TAB.
	287	} else {
	288	return;
	289	}
	290	} # vianame
b177ca84	291
423cee85	292
	293	1;
	294	__END__
	295
	296	=head1 NAME
	297
274085e3	298	charnames - define character names for C<\N{named}> string literal escapes
423cee85	299
	300	=head1 SYNOPSIS
	301
	302	use charnames ':full';
4a2d328f	303	print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85	304
423cee85	305	use charnames ':short';
4a2d328f	306	print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85	307
423cee85	308	use charnames qw(cyrillic greek);
4a2d328f	309	print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85	310
35c0985d	311	use charnames ":full", ":alias" => {
	312	e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
	313	};
	314	print "\N{e_ACUTE} is a small letter e with an acute.\n";
	315
a23c04e4	316	print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
a23c04e4	317	printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84	318
423cee85	319	=head1 DESCRIPTION
423cee85	320
35c0985d	321	Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
35c0985d	322	names and customized aliases. If C<:full> is present, for expansion of
a191c821	323	C<\N{CHARNAME}> string C<CHARNAME> is first looked in the list of
423cee85	324	standard Unicode names of chars. If C<:short> is present, and
	325	C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
	326	as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821	327	with script name arguments, then for C<\N{CHARNAME}> the name
423cee85	328	C<CHARNAME> is looked up as a letter in the given scripts (in the
35c0985d	329	specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
423cee85	330
423cee85	331	For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623	332	this pragma looks for the names
423cee85	333
	334	SCRIPTNAME CAPITAL LETTER CHARNAME
	335	SCRIPTNAME SMALL LETTER CHARNAME
	336	SCRIPTNAME LETTER CHARNAME
	337
	338	in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493	339	then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
	340	is ignored.
	341
	342	Note that C<\N{...}> is compile-time, it's a special form of string
	343	constant used inside double-quoted strings: in other words, you cannot
4e2cda5d	344	use variables inside the C<\N{...}>. If you want similar run-time
daf0d493	345	functionality, use charnames::vianame().
423cee85	346
301a3cda	347	For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2	348	as of Unicode 3.1, there are no official Unicode names but you can use
	349	instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
	350	Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
	351	has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081,
	352	U+0084, and U+0099 do not have names even in ISO 6429.
	353
	354	Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
	355	is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda	356
423cee85	357	=head1 CUSTOM TRANSLATORS
423cee85	358
d5448623	359	The mechanism of translation of C<\N{...}> escapes is general and not
423cee85	360	hardwired into F<charnames.pm>. A module can install custom
d5448623	361	translations (inside the scope which C<use>s the module) with the
423cee85	362	following magic incantation:
423cee85	363
d5448623	364	use charnames (); # for $charnames::hint_bits
	365	sub import {
	366	shift;
	367	$^H \|= $charnames::hint_bits;
	368	$^H{charnames} = \&translator;
	369	}
423cee85	370
	371	Here translator() is a subroutine which takes C<CHARNAME> as an
	372	argument, and returns text to insert into the string instead of the
4a2d328f	373	C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623	374	in C<bytes> mode and out of it, the function should check the current
	375	state of C<bytes>-flag as in:
	376
	377	use bytes (); # for $bytes::hint_bits
	378	sub translator {
	379	if ($^H & $bytes::hint_bits) {
	380	return bytes_translator(@_);
	381	}
	382	else {
	383	return utf8_translator(@_);
	384	}
423cee85	385	}
423cee85	386
35c0985d	387	=head1 CUSTOM ALIASES
	388
	389	This version of charnames supports three mechanisms of adding local
	390	or customized aliases to standard Unicode naming conventions (:full)
	391
	392	=head2 Anonymous hashes
	393
	394	use charnames ":full", ":alias" => {
	395	e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
	396	};
	397	my $str = "\N{e_ACUTE}";
	398
	399	=head2 Alias file
	400
	401	use charnames ":full", ":alias" => "pro";
	402
	403	will try to read "unicore/pro_alias.pl" from the @INC path. This
	404	file should return a list in plain perl:
	405
	406	(
	407	A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
	408	A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
	409	A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
	410	A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
	411	A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
	412	A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
	413	A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
	414	);
	415
	416	=head2 Alias shortcut
	417
	418	use charnames ":alias" => ":pro";
	419
	420	works exactly the same as the alias pairs, only this time,
	421	":full" is inserted automatically as first argument (if no
	422	other argument is given).
	423
b177ca84	424	=head1 charnames::viacode(code)
	425
	426	Returns the full name of the character indicated by the numeric code.
	427	The example
	428
	429	print charnames::viacode(0x2722);
	430
	431	prints "FOUR TEARDROP-SPOKED ASTERISK".
	432
daf0d493	433	Returns undef if no name is known for the code.
daf0d493	434
35c0985d	435	This works only for the standard names, and does not yet apply
daf0d493	436	to custom translators.
daf0d493	437
274085e3	438	Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
	439	SPACE", not "BYTE ORDER MARK".
	440
eb6a2339	441	=head1 charnames::vianame(name)
daf0d493	442
	443	Returns the code point indicated by the name.
	444	The example
	445
	446	printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
	447
	448	prints "2722".
	449
eb6a2339	450	Returns undef if the name is unknown.
b177ca84	451
35c0985d	452	This works only for the standard names, and does not yet apply
b177ca84	453	to custom translators.
b177ca84	454
52ea3e69	455	=head1 ALIASES
	456
	457	A few aliases have been defined for convenience: instead of having
	458	to use the official names
	459
	460	LINE FEED (LF)
	461	FORM FEED (FF)
	462	CARRIAGE RETURN (CR)
	463	NEXT LINE (NEL)
	464
	465	(yes, with parentheses) one can use
	466
	467	LINE FEED
	468	FORM FEED
	469	CARRIAGE RETURN
	470	NEXT LINE
	471	LF
	472	FF
	473	CR
	474	NEL
	475
	476	One can also use
	477
	478	BYTE ORDER MARK
	479	BOM
	480
24b5d5cc	481	and
	482
	483	ZWNJ
	484	ZWJ
	485
	486	for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
52ea3e69	487
	488	For backward compatibility one can use the old names for
	489	certain C0 and C1 controls
	490
	491	old new
	492
	493	HORIZONTAL TABULATION CHARACTER TABULATION
	494	VERTICAL TABULATION LINE TABULATION
	495	FILE SEPARATOR INFORMATION SEPARATOR FOUR
	496	GROUP SEPARATOR INFORMATION SEPARATOR THREE
	497	RECORD SEPARATOR INFORMATION SEPARATOR TWO
	498	UNIT SEPARATOR INFORMATION SEPARATOR ONE
	499	PARTIAL LINE DOWN PARTIAL LINE FORWARD
	500	PARTIAL LINE UP PARTIAL LINE BACKWARD
	501
	502	but the old names in addition to giving the character
	503	will also give a warning about being deprecated.
	504
f0175764	505	=head1 ILLEGAL CHARACTERS
f0175764	506
00d835f2	507	If you ask by name for a character that does not exist, a warning is
	508	given and the Unicode I<replacement character> "\x{FFFD}" is returned.
	509
	510	If you ask by code for a character that does not exist, no warning is
	511	given and C<undef> is returned. (Though if you ask for a code point
	512	past U+10FFFF you do get a warning.)
f0175764	513
423cee85	514	=head1 BUGS
	515
	516	Since evaluation of the translation function happens in a middle of
	517	compilation (of a string literal), the translation function should not
	518	do any C<eval>s or C<require>s. This restriction should be lifted in
	519	a future version of Perl.
	520
	521	=cut