X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Fcharnames.pm;h=9f9526b9654f5c441afb36b58ea98324572dbfae;hb=ff504b36b0f6467f64b463fd17fb34f640855abc;hp=4f7fdeb6ed1812d8affd935dfc941afdce903809;hpb=b8effcb5f9b63a03069dd25d50d5af552626f11a;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/charnames.pm b/lib/charnames.pm index 4f7fdeb..9f9526b 100644 --- a/lib/charnames.pm +++ b/lib/charnames.pm @@ -1,12 +1,10 @@ package charnames; use strict; use warnings; -use Carp; use File::Spec; -our $VERSION = '1.03'; +our $VERSION = '1.06'; use bytes (); # for $bytes::hint_bits -$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH my %alias1 = ( # Icky 3.2 names with parentheses. @@ -44,6 +42,16 @@ my %alias3 = ( ); my $txt; +sub croak +{ + require Carp; goto &Carp::croak; +} # croak + +sub carp +{ + require Carp; goto &Carp::carp; +} # carp + sub alias (@) { @_ or return %alias3; @@ -159,7 +167,7 @@ sub charnames ## we know where it starts, so turn into number - ## the ordinal for the char. - $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); + $ord = CORE::hex substr($txt, $hexstart, $off[0] - $hexstart); } if ($^H & $bytes::hint_bits) { # "use bytes" in effect? @@ -183,15 +191,14 @@ sub import if (not @_) { carp("`use charnames' needs explicit imports list"); } - $^H |= $charnames::hint_bits; $^H{charnames} = \&charnames ; ## ## fill %h keys with our @_ args. ## my ($promote, %h, @args) = (0); - while (@_ and $_ = shift) { - if ($_ eq ":alias") { + while (my $arg = shift) { + if ($arg eq ":alias") { @_ or croak ":alias needs an argument in charnames"; my $alias = shift; @@ -210,11 +217,11 @@ sub import alias_file ($alias); next; } - if (m/^:/ and ! ($_ eq ":full" || $_ eq ":short")) { - warn "unsupported special '$_' in charnames"; + if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) { + warn "unsupported special '$arg' in charnames"; next; } - push @args, $_; + push @args, $arg; } @args == 0 && $promote and @args = (":full"); @h{@args} = (1) x @args; @@ -238,43 +245,32 @@ sub import } } # import -# this comes actually from Unicode::UCD, but it avoids the -# overhead of loading it -sub _getcode { - my $arg = shift; - - if ($arg =~ /^[1-9]\d*$/) { - return $arg; - } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { - return hex($1); - } - - return; -} - my %viacode; sub viacode { if (@_ != 1) { carp "charnames::viacode() expects one argument"; - return () + return; } my $arg = shift; - my $code = _getcode($arg); + # this comes actually from Unicode::UCD, where it is the named + # function _getcode (), but it avoids the overhead of loading it my $hex; - - if (defined $code) { + if ($arg =~ /^[1-9]\d*$/) { $hex = sprintf "%04X", $arg; + } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { + $hex = $1; } else { carp("unexpected arg \"$arg\" to charnames::viacode()"); return; } - if ($code > 0x10FFFF) { - carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex; + # checking the length first is slightly faster + if (length($hex) > 5 && hex($hex) > 0x10FFFF) { + carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)"; return; } @@ -282,11 +278,9 @@ sub viacode $txt = do "unicore/Name.pl" unless $txt; - if ($txt =~ m/^$hex\t\t(.+)/m) { - return $viacode{$hex} = $1; - } else { - return; - } + return unless $txt =~ m/^$hex\t\t(.+)/m; + + $viacode{$hex} = $1; } # viacode my %vianame; @@ -300,7 +294,7 @@ sub vianame my $arg = shift; - return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; + return chr CORE::hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; return $vianame{$arg} if exists $vianame{$arg}; @@ -310,7 +304,7 @@ sub vianame if ($[ <= $pos) { my $posLF = rindex $txt, "\n", $pos; (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d; - return $vianame{$arg} = hex $code; + return $vianame{$arg} = CORE::hex $code; # If $pos is at the 1st line, $posLF must be $[ - 1 (not found); # then $posLF + 1 equals to $[ (at the beginning of $txt). @@ -390,35 +384,55 @@ U+0084, and U+0099 do not have names even in ISO 6429. Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" is the Unicode smiley face, or "\N{WHITE SMILING FACE}". -=head1 CUSTOM TRANSLATORS +=head1 ALIASES -The mechanism of translation of C<\N{...}> escapes is general and not -hardwired into F. A module can install custom -translations (inside the scope which Cs the module) with the -following magic incantation: +A few aliases have been defined for convenience: instead of having +to use the official names - use charnames (); # for $charnames::hint_bits - sub import { - shift; - $^H |= $charnames::hint_bits; - $^H{charnames} = \&translator; - } + LINE FEED (LF) + FORM FEED (FF) + CARRIAGE RETURN (CR) + NEXT LINE (NEL) -Here translator() is a subroutine which takes C as an -argument, and returns text to insert into the string instead of the -C<\N{CHARNAME}> escape. Since the text to insert should be different -in C mode and out of it, the function should check the current -state of C-flag as in: +(yes, with parentheses) one can use - use bytes (); # for $bytes::hint_bits - sub translator { - if ($^H & $bytes::hint_bits) { - return bytes_translator(@_); - } - else { - return utf8_translator(@_); - } - } + LINE FEED + FORM FEED + CARRIAGE RETURN + NEXT LINE + LF + FF + CR + NEL + +One can also use + + BYTE ORDER MARK + BOM + +and + + ZWNJ + ZWJ + +for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. + +For backward compatibility one can use the old names for +certain C0 and C1 controls + + old new + + HORIZONTAL TABULATION CHARACTER TABULATION + VERTICAL TABULATION LINE TABULATION + FILE SEPARATOR INFORMATION SEPARATOR FOUR + GROUP SEPARATOR INFORMATION SEPARATOR THREE + RECORD SEPARATOR INFORMATION SEPARATOR TWO + UNIT SEPARATOR INFORMATION SEPARATOR ONE + PARTIAL LINE DOWN PARTIAL LINE FORWARD + PARTIAL LINE UP PARTIAL LINE BACKWARD + +but the old names in addition to giving the character +will also give a warning about being deprecated. =head1 CUSTOM ALIASES @@ -488,55 +502,33 @@ Returns undef if the name is unknown. This works only for the standard names, and does not yet apply to custom translators. -=head1 ALIASES - -A few aliases have been defined for convenience: instead of having -to use the official names - - LINE FEED (LF) - FORM FEED (FF) - CARRIAGE RETURN (CR) - NEXT LINE (NEL) - -(yes, with parentheses) one can use - - LINE FEED - FORM FEED - CARRIAGE RETURN - NEXT LINE - LF - FF - CR - NEL - -One can also use - - BYTE ORDER MARK - BOM - -and - - ZWNJ - ZWJ - -for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. +=head1 CUSTOM TRANSLATORS -For backward compatibility one can use the old names for -certain C0 and C1 controls +The mechanism of translation of C<\N{...}> escapes is general and not +hardwired into F. A module can install custom +translations (inside the scope which Cs the module) with the +following magic incantation: - old new + sub import { + shift; + $^H{charnames} = \&translator; + } - HORIZONTAL TABULATION CHARACTER TABULATION - VERTICAL TABULATION LINE TABULATION - FILE SEPARATOR INFORMATION SEPARATOR FOUR - GROUP SEPARATOR INFORMATION SEPARATOR THREE - RECORD SEPARATOR INFORMATION SEPARATOR TWO - UNIT SEPARATOR INFORMATION SEPARATOR ONE - PARTIAL LINE DOWN PARTIAL LINE FORWARD - PARTIAL LINE UP PARTIAL LINE BACKWARD +Here translator() is a subroutine which takes C as an +argument, and returns text to insert into the string instead of the +C<\N{CHARNAME}> escape. Since the text to insert should be different +in C mode and out of it, the function should check the current +state of C-flag as in: -but the old names in addition to giving the character -will also give a warning about being deprecated. + use bytes (); # for $bytes::hint_bits + sub translator { + if ($^H & $bytes::hint_bits) { + return bytes_translator(@_); + } + else { + return utf8_translator(@_); + } + } =head1 ILLEGAL CHARACTERS