X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Fcharnames.pm;h=0acae61f2eefa241555b31198ce230558dfc1fd9;hb=906bad61c3bd0dd3a56a226b2981e4d14d5e71c9;hp=f0a4446b4dceaf1d1640e2e4966e8766e390922d;hpb=e5c3f8982a1650ad4c25a05c41a9038ce21a512c;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/charnames.pm b/lib/charnames.pm index f0a4446..0acae61 100644 --- a/lib/charnames.pm +++ b/lib/charnames.pm @@ -1,12 +1,10 @@ package charnames; use strict; use warnings; -use Carp; use File::Spec; -our $VERSION = '1.03'; +our $VERSION = '1.07'; use bytes (); # for $bytes::hint_bits -$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH my %alias1 = ( # Icky 3.2 names with parentheses. @@ -44,6 +42,16 @@ my %alias3 = ( ); my $txt; +sub croak +{ + require Carp; goto &Carp::croak; +} # croak + +sub carp +{ + require Carp; goto &Carp::carp; +} # carp + sub alias (@) { @_ or return %alias3; @@ -159,7 +167,7 @@ sub charnames ## we know where it starts, so turn into number - ## the ordinal for the char. - $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); + $ord = CORE::hex substr($txt, $hexstart, $off[0] - $hexstart); } if ($^H & $bytes::hint_bits) { # "use bytes" in effect? @@ -183,7 +191,6 @@ sub import if (not @_) { carp("`use charnames' needs explicit imports list"); } - $^H |= $charnames::hint_bits; $^H{charnames} = \&charnames ; ## @@ -238,43 +245,32 @@ sub import } } # import -# this comes actually from Unicode::UCD, but it avoids the -# overhead of loading it -sub _getcode { - my $arg = shift; - - if ($arg =~ /^[1-9]\d*$/) { - return $arg; - } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { - return hex($1); - } - - return; -} - my %viacode; sub viacode { if (@_ != 1) { carp "charnames::viacode() expects one argument"; - return () + return; } my $arg = shift; - my $code = _getcode($arg); + # this comes actually from Unicode::UCD, where it is the named + # function _getcode (), but it avoids the overhead of loading it my $hex; - - if (defined $code) { + if ($arg =~ /^[1-9]\d*$/) { $hex = sprintf "%04X", $arg; + } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { + $hex = $1; } else { carp("unexpected arg \"$arg\" to charnames::viacode()"); return; } - if ($code > 0x10FFFF) { - carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex; + # checking the length first is slightly faster + if (length($hex) > 5 && hex($hex) > 0x10FFFF) { + carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)"; return; } @@ -282,11 +278,9 @@ sub viacode $txt = do "unicore/Name.pl" unless $txt; - if ($txt =~ m/^$hex\t\t(.+)/m) { - return $viacode{$hex} = $1; - } else { - return; - } + return unless $txt =~ m/^$hex\t\t(.+)/m; + + $viacode{$hex} = $1; } # viacode my %vianame; @@ -300,7 +294,7 @@ sub vianame my $arg = shift; - return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; + return chr CORE::hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; return $vianame{$arg} if exists $vianame{$arg}; @@ -310,7 +304,7 @@ sub vianame if ($[ <= $pos) { my $posLF = rindex $txt, "\n", $pos; (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d; - return $vianame{$arg} = hex $code; + return $vianame{$arg} = CORE::hex $code; # If $pos is at the 1st line, $posLF must be $[ - 1 (not found); # then $posLF + 1 equals to $[ (at the beginning of $txt). @@ -390,35 +384,55 @@ U+0084, and U+0099 do not have names even in ISO 6429. Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" is the Unicode smiley face, or "\N{WHITE SMILING FACE}". -=head1 CUSTOM TRANSLATORS +=head1 ALIASES -The mechanism of translation of C<\N{...}> escapes is general and not -hardwired into F. A module can install custom -translations (inside the scope which Cs the module) with the -following magic incantation: +A few aliases have been defined for convenience: instead of having +to use the official names - use charnames (); # for $charnames::hint_bits - sub import { - shift; - $^H |= $charnames::hint_bits; - $^H{charnames} = \&translator; - } + LINE FEED (LF) + FORM FEED (FF) + CARRIAGE RETURN (CR) + NEXT LINE (NEL) -Here translator() is a subroutine which takes C as an -argument, and returns text to insert into the string instead of the -C<\N{CHARNAME}> escape. Since the text to insert should be different -in C mode and out of it, the function should check the current -state of C-flag as in: +(yes, with parentheses) one can use - use bytes (); # for $bytes::hint_bits - sub translator { - if ($^H & $bytes::hint_bits) { - return bytes_translator(@_); - } - else { - return utf8_translator(@_); - } - } + LINE FEED + FORM FEED + CARRIAGE RETURN + NEXT LINE + LF + FF + CR + NEL + +One can also use + + BYTE ORDER MARK + BOM + +and + + ZWNJ + ZWJ + +for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. + +For backward compatibility one can use the old names for +certain C0 and C1 controls + + old new + + HORIZONTAL TABULATION CHARACTER TABULATION + VERTICAL TABULATION LINE TABULATION + FILE SEPARATOR INFORMATION SEPARATOR FOUR + GROUP SEPARATOR INFORMATION SEPARATOR THREE + RECORD SEPARATOR INFORMATION SEPARATOR TWO + UNIT SEPARATOR INFORMATION SEPARATOR ONE + PARTIAL LINE DOWN PARTIAL LINE FORWARD + PARTIAL LINE UP PARTIAL LINE BACKWARD + +but the old names in addition to giving the character +will also give a warning about being deprecated. =head1 CUSTOM ALIASES @@ -488,55 +502,33 @@ Returns undef if the name is unknown. This works only for the standard names, and does not yet apply to custom translators. -=head1 ALIASES - -A few aliases have been defined for convenience: instead of having -to use the official names - - LINE FEED (LF) - FORM FEED (FF) - CARRIAGE RETURN (CR) - NEXT LINE (NEL) - -(yes, with parentheses) one can use - - LINE FEED - FORM FEED - CARRIAGE RETURN - NEXT LINE - LF - FF - CR - NEL - -One can also use - - BYTE ORDER MARK - BOM - -and - - ZWNJ - ZWJ - -for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. +=head1 CUSTOM TRANSLATORS -For backward compatibility one can use the old names for -certain C0 and C1 controls +The mechanism of translation of C<\N{...}> escapes is general and not +hardwired into F. A module can install custom +translations (inside the scope which Cs the module) with the +following magic incantation: - old new + sub import { + shift; + $^H{charnames} = \&translator; + } - HORIZONTAL TABULATION CHARACTER TABULATION - VERTICAL TABULATION LINE TABULATION - FILE SEPARATOR INFORMATION SEPARATOR FOUR - GROUP SEPARATOR INFORMATION SEPARATOR THREE - RECORD SEPARATOR INFORMATION SEPARATOR TWO - UNIT SEPARATOR INFORMATION SEPARATOR ONE - PARTIAL LINE DOWN PARTIAL LINE FORWARD - PARTIAL LINE UP PARTIAL LINE BACKWARD +Here translator() is a subroutine which takes C as an +argument, and returns text to insert into the string instead of the +C<\N{CHARNAME}> escape. Since the text to insert should be different +in C mode and out of it, the function should check the current +state of C-flag as in: -but the old names in addition to giving the character -will also give a warning about being deprecated. + use bytes (); # for $bytes::hint_bits + sub translator { + if ($^H & $bytes::hint_bits) { + return bytes_translator(@_); + } + else { + return utf8_translator(@_); + } + } =head1 ILLEGAL CHARACTERS @@ -549,6 +541,11 @@ past U+10FFFF you do get a warning.) =head1 BUGS +Unicode standard named sequences are not recognized, such as +C +(which should mean C with an additional +C). + Since evaluation of the translation function happens in a middle of compilation (of a string literal), the translation function should not do any Cs or Cs. This restriction should be lifted in