Can't get #17492 to work with -Uuseperlio otherwise (either

[p5sagit/p5-mst-13.2.git] / lib / charnames.pm
diff --git a/lib/charnames.pm b/lib/charnames.pm

index 2217f6f..fc3ad8b 100644 (file)
--- a/lib/charnames.pm
+++ b/lib/charnames.pm
@@ -16,8 +16,13 @@ my %alias1 = (
                # Convenience.
                'LF'                    => 'LINE FEED (LF)',
                'FF'                    => 'FORM FEED (FF)',
-               'CR'                    => 'CARRIAGE RETURN (LF)',
+               'CR'                    => 'CARRIAGE RETURN (CR)',
                'NEL'                   => 'NEXT LINE (NEL)',
+               # More convenience.  For further convenience,
+               # it is suggested that some way of using the NamesList
+               # aliases be implemented.
+               'ZWNJ'                  => 'ZERO WIDTH NON-JOINER',
+               'ZWJ'                   => 'ZERO WIDTH JOINER',
                'BOM'                   => 'BYTE ORDER MARK',
            );
 
@@ -55,7 +60,7 @@ sub charnames
 
   if ($name eq "BYTE ORDER MARK") {
       $fname = $name;
-      $ord = 0xFFFE;
+      $ord = 0xFEFF;
   } else {
       ## Suck in the code/name list as a big string.
       ## Lines look like:
@@ -104,7 +109,7 @@ sub charnames
       
       ##
       ## Now know where in the string the name starts.
-      ## The code, in hex, is befor that.
+      ## The code, in hex, is before that.
       ##
       ## The code can be 4-6 characters long, so we've got to sort of
       ## go look for it, just after the newline that comes before $off[0].
@@ -181,7 +186,7 @@ my %viacode;
 sub viacode
 {
     if (@_ != 1) {
-        carp "charnames::viacode() expects one numeric argument";
+        carp "charnames::viacode() expects one argument";
         return ()
     }
 
@@ -198,8 +203,8 @@ sub viacode
     }
 
     if ($code > 0x10FFFF) {
-       carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)";
-       return "\x{FFFD}";
+       carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
+       return;
     }
 
     return $viacode{$hex} if exists $viacode{$hex};
@@ -224,12 +229,25 @@ sub vianame
 
     my $arg = shift;
 
+    return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
+
     return $vianame{$arg} if exists $vianame{$arg};
 
     $txt = do "unicore/Name.pl" unless $txt;
 
-    if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
-        return $vianame{$arg} = hex $1;
+    my $pos = index $txt, "\t\t$arg\n";
+    if ($[ <= $pos) {
+       my $posLF = rindex $txt, "\n", $pos;
+       (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
+       return $vianame{$arg} = hex $code;
+
+       # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
+       # then $posLF + 1 equals to $[ (at the beginning of $txt).
+       # Otherwise $posLF is the position of "\n";
+       # then $posLF + 1 must be the position of the next to "\n"
+       # (the beginning of the line).
+       # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
+       # "10300\t", "100000", etc. So we can get the code via removing TAB.
     } else {
         return;
     }
@@ -241,7 +259,7 @@ __END__
 
 =head1 NAME
 
-charnames - define character names for C<\N{named}> string literal escapes.
+charnames - define character names for C<\N{named}> string literal escapes
 
 =head1 SYNOPSIS
 
@@ -254,18 +272,18 @@ charnames - define character names for C<\N{named}> string literal escapes.
   use charnames qw(cyrillic greek);
   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
 
-  print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
-  printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330"
+  print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
+  printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
 
 =head1 DESCRIPTION
 
 Pragma C<use charnames> supports arguments C<:full>, C<:short> and
 script names.  If C<:full> is present, for expansion of
-C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
+C<\N{CHARNAME}> string C<CHARNAME> is first looked up in the list of
 standard Unicode names of chars.  If C<:short> is present, and
 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
-with script name arguments, then for C<\N{CHARNAME}}> the name
+with script name arguments, then for C<\N{CHARNAME}> the name
 C<CHARNAME> is looked up as a letter in the given scripts (in the
 specified order).
 
@@ -286,11 +304,14 @@ use variables inside the C<\N{...}>.  If you want similar run-time
 functionality, use charnames::vianame().
 
 For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
-as of Unicode 3.1, there are no official Unicode names but you can
-use instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).
-In Unicode 3.2 some naming changes will happen since ISO 6429 has been
-updated.  Also note that the U+UU80, U+0081, U+0084, and U+0099
-do not have names even in ISO 6429.
+as of Unicode 3.1, there are no official Unicode names but you can use
+instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).  In
+Unicode 3.2 (as of Perl 5.8) some naming changes take place since ISO
+6429 has been updated; see L</ALIASES>.  Also note that U+0080, U+0081,
+U+0084, and U+0099 do not have names even in ISO 6429.
+
+Because the Unicode standard uses the "U+HHHH" notation, you can too:
+"\N{U+263a}" is the Unicode smiley face, as is "\N{WHITE SMILING FACE}".
 
 =head1 CUSTOM TRANSLATORS
 
@@ -333,10 +354,13 @@ prints "FOUR TEARDROP-SPOKED ASTERISK".
 
 Returns undef if no name is known for the code.
 
-This works only for the standard names, and does not yet aply 
+This works only for the standard names, and does not yet apply 
 to custom translators.
 
-=head1 charnames::vianame(code)
+Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
+SPACE", not "BYTE ORDER MARK".
+
+=head1 charnames::vianame(name)
 
 Returns the code point indicated by the name.
 The example
@@ -345,9 +369,9 @@ The example
 
 prints "2722".
 
-Returns undef if no name is known for the name.
+Returns undef if the name is unknown.
 
-This works only for the standard names, and does not yet aply 
+This works only for the standard names, and does not yet apply 
 to custom translators.
 
 =head1 ALIASES
@@ -376,7 +400,12 @@ One can also use
     BYTE ORDER MARK
     BOM
 
-though that is of course not a legal character as such.
+and
+
+    ZWNJ
+    ZWJ
+
+for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
 
 For backward compatibility one can use the old names for
 certain C0 and C1 controls
@@ -397,9 +426,12 @@ will also give a warning about being deprecated.
 
 =head1 ILLEGAL CHARACTERS
 
-If you ask for a character that is illegal (like the byte order mark
-U+FFFE, or the U+FFFF) does not exist, a warning is given and the
-special Unicode I<replacement character> "\x{FFFD}" is returned.
+If you ask by name for a character that does not exist, a warning is
+given and the Unicode I<replacement character> "\x{FFFD}" is returned.
+
+If you ask by code for a character that does not exist, no warning is
+given and C<undef> is returned.  (Though if you ask for a code point
+past U+10FFFF you do get a warning.)
 
 =head1 BUGS