From: Jarkko Hietaniemi Date: Thu, 28 Mar 2002 16:20:03 +0000 (+0000) Subject: Support Unicode 3.1 names, names without the (XX), and BOM. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=52ea3e69a0eb35af2d24bda5dabccf9b9600bfe4;p=p5sagit%2Fp5-mst-13.2.git Support Unicode 3.1 names, names without the (XX), and BOM. p4raw-id: //depot/perl@15585 --- diff --git a/lib/charnames.pm b/lib/charnames.pm index 0241534..6471d18 100644 --- a/lib/charnames.pm +++ b/lib/charnames.pm @@ -7,6 +7,32 @@ our $VERSION = '1.01'; use bytes (); # for $bytes::hint_bits $charnames::hint_bits = 0x20000; +my %alias1 = ( + # Icky 3.2 names with parentheses. + 'LINE FEED' => 'LINE FEED (LF)', + 'FORM FEED' => 'FORM FEED (FF)', + 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)', + 'NEXT LINE' => 'NEXT LINE (NEL)', + # Convenience. + 'LF' => 'LINE FEED (LF)', + 'FF' => 'FORM FEED (FF)', + 'CR' => 'CARRIAGE RETURN (LF)', + 'NEL' => 'NEXT LINE (NEXT LINE)', + 'BOM' => 'BYTE ORDER MARK', + ); + +my %alias2 = ( + # Pre-3.2 compatibility (only for the first 256 characters). + 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION', + 'VERTICAL TABULATION' => 'LINE TABULATION', + 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR', + 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE', + 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO', + 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE', + 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD', + 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD', + ); + my $txt; # This is not optimized in any way yet @@ -14,78 +40,99 @@ sub charnames { my $name = shift; - ## Suck in the code/name list as a big string. - ## Lines look like: - ## "0052\t\tLATIN CAPITAL LETTER R\n" - $txt = do "unicore/Name.pl" unless $txt; + if (exists $alias1{$name}) { + $name = $alias1{$name}; + } + if (exists $alias2{$name}) { + require warnings; + warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead}); + $name = $alias2{$name}; + } - ## @off will hold the index into the code/name string of the start and - ## end of the name as we find it. + my $ord; my @off; + my $fname; + + if ($name eq "BYTE ORDER MARK") { + $fname = $name; + $ord = 0xFFFE; + } else { + ## Suck in the code/name list as a big string. + ## Lines look like: + ## "0052\t\tLATIN CAPITAL LETTER R\n" + $txt = do "unicore/Name.pl" unless $txt; - ## If :full, look for the the name exactly - if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) { - @off = ($-[0], $+[0]); - } - - ## If we didn't get above, and :short allowed, look for the short name. - ## The short name is like "greek:Sigma" - unless (@off) { - if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { - my ($script, $cname) = ($1,$2); - my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); - if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) { - @off = ($-[0], $+[0]); + ## @off will hold the index into the code/name string of the start and + ## end of the name as we find it. + + ## If :full, look for the the name exactly + if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { + @off = ($-[0], $+[0]); } - } - } - ## If we still don't have it, check for the name among the loaded - ## scripts. - if (not @off) - { - my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); - for my $script ( @{$^H{charnames_scripts}} ) + ## If we didn't get above, and :short allowed, look for the short name. + ## The short name is like "greek:Sigma" + unless (@off) { + if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { + my ($script, $cname) = ($1,$2); + my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); + if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { + @off = ($-[0], $+[0]); + } + } + } + + ## If we still don't have it, check for the name among the loaded + ## scripts. + if (not @off) { - if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) { - @off = ($-[0], $+[0]); - last; - } + my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); + for my $script ( @{$^H{charnames_scripts}} ) + { + if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { + @off = ($-[0], $+[0]); + last; + } + } } + + ## If we don't have it by now, give up. + unless (@off) { + carp "Unknown charname '$name'"; + return "\x{FFFD}"; + } + + ## + ## Now know where in the string the name starts. + ## The code, in hex, is befor that. + ## + ## The code can be 4-6 characters long, so we've got to sort of + ## go look for it, just after the newline that comes before $off[0]. + ## + ## This would be much easier if unicore/Name.pl had info in + ## a name/code order, instead of code/name order. + ## + ## The +1 after the rindex() is to skip past the newline we're finding, + ## or, if the rindex() fails, to put us to an offset of zero. + ## + my $hexstart = rindex($txt, "\n", $off[0]) + 1; + + ## we know where it starts, so turn into number - + ## the ordinal for the char. + $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); } - ## If we don't have it by now, give up. - unless (@off) { - carp "Unknown charname '$name'"; - return "\x{FFFD}"; - } - - ## - ## Now know where in the string the name starts. - ## The code, in hex, is befor that. - ## - ## The code can be 4-6 characters long, so we've got to sort of - ## go look for it, just after the newline that comes before $off[0]. - ## - ## This would be much easier if unicore/Name.pl had info in - ## a name/code order, instead of code/name order. - ## - ## The +1 after the rindex() is to skip past the newline we're finding, - ## or, if the rindex() fails, to put us to an offset of zero. - ## - my $hexstart = rindex($txt, "\n", $off[0]) + 1; - - ## we know where it starts, so turn into number - the ordinal for the char. - my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); - if ($^H & $bytes::hint_bits) { # "use bytes" in effect? use bytes; return chr $ord if $ord <= 255; my $hex = sprintf "%04x", $ord; - my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; + if (not defined $fname) { + $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; + } croak "Character 0x$hex with name '$fname' is above 0xFF"; } + no warnings 'utf8'; # allow even illegal characters return pack "U", $ord; } @@ -303,10 +350,56 @@ Returns undef if no name is known for the name. This works only for the standard names, and does not yet aply to custom translators. +=head1 ALIASES + +A few aliases have been defined for convenience: instead of having +to use the official names + + LINE FEED (LF) + FORM FEED (FF) + CARRIAGE RETURN (CR) + NEXT LINE (NEL) + +(yes, with parentheses) one can use + + LINE FEED + FORM FEED + CARRIAGE RETURN + NEXT LINE + LF + FF + CR + NEL + +One can also use + + BYTE ORDER MARK + BOM + +though that is of course not a legal character as such. + +For backward compatibility one can use the old names for +certain C0 and C1 controls + + old new + + HORIZONTAL TABULATION CHARACTER TABULATION + VERTICAL TABULATION LINE TABULATION + FILE SEPARATOR INFORMATION SEPARATOR FOUR + GROUP SEPARATOR INFORMATION SEPARATOR THREE + RECORD SEPARATOR INFORMATION SEPARATOR TWO + UNIT SEPARATOR INFORMATION SEPARATOR ONE + PARTIAL LINE DOWN PARTIAL LINE FORWARD + PARTIAL LINE UP PARTIAL LINE BACKWARD + +but the old names in addition to giving the character +will also give a warning about being deprecated. + =head1 ILLEGAL CHARACTERS -If you ask for a character that does not exist, a warning is given -and the special Unicode I "\x{FFFD}" is returned. +If you ask for a character that is illegal (like the byte order mark +U+FFFE, or the U+FFFF) does not exist, a warning is given and the +special Unicode I "\x{FFFD}" is returned. =head1 BUGS diff --git a/lib/charnames.t b/lib/charnames.t index c800128..a8a063f 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -1,15 +1,18 @@ #!./perl +my @WARN; + BEGIN { unless(grep /blib/, @INC) { chdir 't' if -d 't'; @INC = '../lib'; } + $SIG{__WARN__} = sub { push @WARN, @_ }; } $| = 1; -print "1..25\n"; +print "1..34\n"; use charnames ':full'; @@ -169,6 +172,36 @@ print "ok 24\n"; print "not " unless "\N{NULL}" eq "\c@"; print "ok 25\n"; -# TODO: support 3.1 names, BOM. Generic aliasing? +print "not " unless "\N{LINE FEED (LF)}" eq "\n"; +print "ok 26\n"; + +print "not " unless "\N{LINE FEED}" eq "\n"; +print "ok 27\n"; + +print "not " unless "\N{LF}" eq "\n"; +print "ok 28\n"; + +print "not " unless "\N{BYTE ORDER MARK}" eq chr(0xFFFE); +print "ok 29\n"; + +print "not " unless "\N{BOM}" eq chr(0xFFFE); +print "ok 30\n"; + +{ + use warnings 'deprecated'; + + print "not " unless "\N{HORIZONTAL TABULATION}" eq "\t"; + print "ok 31\n"; + + print "not " unless grep { /"HORIZONTAL TABULATION" is deprecated/ } @WARN; + print "ok 32\n"; + no warnings 'deprecated'; + + print "not " unless "\N{VERTICAL TABULATION}" eq "\013"; + print "ok 33\n"; + + print "not " if grep { /"VERTICAL TABULATION" is deprecated/ } @WARN; + print "ok 34\n"; +} diff --git a/pp_pack.c b/pp_pack.c index 67f53e7..1c5ee31 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -2039,8 +2039,12 @@ Perl_pack_cat(pTHX_ SV *cat, char *pat, register char *patend, register SV **beg fromstr = NEXTFROM; auint = UNI_TO_NATIVE(SvUV(fromstr)); SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1); - SvCUR_set(cat, (char*)uvchr_to_utf8((U8*)SvEND(cat),auint) - - SvPVX(cat)); + SvCUR_set(cat, + (char*)uvchr_to_utf8_flags((U8*)SvEND(cat), + auint, + ckWARN(WARN_UTF8) ? + 0 : UNICODE_ALLOW_ANY) + - SvPVX(cat)); } *SvEND(cat) = '\0'; break;