From: Rafael Garcia-Suarez Date: Sun, 20 Dec 2009 15:23:36 +0000 (+0100) Subject: Introduce C<use feature "unicode_strings"> X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1863b87966ed39b042c45e12d1b4e0b90b9cc071;p=p5sagit%2Fp5-mst-13.2.git Introduce C<use feature "unicode_strings"> This turns on the unicode semantics for uc/lc/ucfirst/lcfirst operations on strings without the UTF8 bit set but with characters whose ordinals are higher than 127. This replaces the "legacy" pragma experiment. Note that currently this feature sets both a bit in $^H and a (unused) key in %^H. The bit in $^H could be replaced by a flag on the uc/lc/etc op. It's probably not feasible to test a key in %^H in pp_uc and friends each time we want to know which semantics to apply. --- diff --git a/MANIFEST b/MANIFEST index 76a5568..2d79cbc 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3454,6 +3454,7 @@ lib/ExtUtils/XSSymSet.pm on VMS, manage linker symbols when building extensions lib/fastcwd.pl a faster but more dangerous getcwd lib/feature.pm Pragma to enable new syntax lib/feature.t See if features work +lib/feature/unicode_strings.t See if feature "unicode_strings" work lib/File/Basename.pm Emulate the basename program lib/File/Basename.t See if File::Basename works lib/File/CheckTree.pm Perl module supporting wholesale file mode validation @@ -3492,8 +3493,6 @@ lib/importenv.pl Perl routine to get environment into variables lib/integer.pm For "use integer" lib/integer.t For "use integer" testing lib/Internals.t For Internals::* testing -lib/legacy.pm Pragma to preserve legacy behavior -lib/legacy.t For "use legacy" testing lib/less.pm For "use less" lib/less.t See if less support works lib/locale.pm For "use locale" diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index c4a1a79..bb5e61b 100755 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -1941,6 +1941,7 @@ use File::Glob qw(:case); lib/exceptions.pl lib/fastcwd.pl lib/feature.{pm,t} + lib/feature/ lib/filetest.{pm,t} lib/find.pl lib/finddepth.pl diff --git 
a/lib/feature.pm b/lib/feature.pm index 915b5c7..649ccb3 100644 --- a/lib/feature.pm +++ b/lib/feature.pm @@ -1,19 +1,24 @@ package feature; -our $VERSION = '1.13'; +our $VERSION = '1.14'; # (feature name) => (internal name, used in %^H) my %feature = ( - switch => 'feature_switch', - say => "feature_say", - state => "feature_state", + switch => 'feature_switch', + say => "feature_say", + state => "feature_state", + unicode_strings => "feature_unicode", ); +# This gets set (for now) in $^H as well as in %^H, +# for runtime speed of the uc/lc/ucfirst/lcfirst functions. +our $hint_uni8bit = 0x00000800; + # NB. the latest bundle must be loaded by the -E switch (see toke.c) my %feature_bundle = ( "5.10" => [qw(switch say state)], - "5.11" => [qw(switch say state)], + "5.11" => [qw(switch say state unicode_strings)], ); # special case @@ -43,9 +48,9 @@ feature - Perl pragma to enable new syntactic features It is usually impossible to add new syntax to Perl without breaking some existing programs. This pragma provides a way to minimize that -risk. New syntactic constructs can be enabled by C<use feature 'foo'>, -and will be parsed only when the appropriate feature pragma is in -scope. +risk. New syntactic constructs, or new semantic meanings to older +constructs, can be enabled by C<use feature 'foo'>, and will be parsed +only when the appropriate feature pragma is in scope. =head2 Lexical effect @@ -95,6 +100,80 @@ variables. See L<perlsub/"Persistent Private Variables"> for details. +=head2 the 'unicode_strings' feature + +C<use feature 'unicode_strings'> tells the compiler to treat +strings with codepoints larger than 127 as Unicode. It is available +starting with Perl 5.11.3. + +In greater detail: + +This feature modifies the semantics for the 128 characters on ASCII +systems that have the 8th bit set. (See L</EBCDIC platforms> below for +EBCDIC systems.) By default, unless C<use locale> is specified, or the +scalar containing such a character is known by Perl to be encoded in UTF8, +the semantics are essentially that the characters have an ordinal number, +and that's it. 
They are caseless, and aren't anything: they're not +controls, not letters, not punctuation, ..., not anything. + +This behavior stems from when Perl did not support Unicode, and ASCII was the +only known character set outside of C<use locale>. In order to not +possibly break pre-Unicode programs, these characters have retained their old +non-meanings, except when it is clear to Perl that Unicode is what is meant, +for example by calling utf8::upgrade() on a scalar, or if the scalar also +contains characters that are only available in Unicode. Then these 128 +characters take on their Unicode meanings. + +The problem with this behavior is that a scalar that encodes these characters +has a different meaning depending on if it is stored as utf8 or not. +In general, the internal storage method should not affect the +external behavior. + +The behavior is known to have effects on these areas: + +=over 4 + +=item * + +Changing the case of a scalar, that is, using C<uc()>, C<lc()>, C<ucfirst()>, +and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression +substitutions. + +=item * + +Using caseless (C</i>) regular expression matching + +=item * + +Matching a number of properties in regular expressions, such as C<\w> + +=item * + +User-defined case change mappings. You can create a C<ToUpper()> function, for +example, which overrides Perl's built-in case mappings. The scalar must be +encoded in utf8 for your function to actually be invoked. + +=back + +B<This lack of semantics for these characters is currently the default,> +outside of C<use locale>. See below for EBCDIC. + +To turn on B<case changing semantics only> for these characters, use +C<use feature 'unicode_strings'>. + +The other old (legacy) behaviors regarding these characters are currently +unaffected by this pragma. + +=head4 EBCDIC platforms + +On EBCDIC platforms, the situation is somewhat different. The legacy +semantics are whatever the underlying semantics of the native C language +library are. Each of the three EBCDIC encodings currently known by Perl is an +isomorph of the Latin-1 character set. That means every character in Latin-1 +has a corresponding EBCDIC equivalent, and vice-versa. 
Specifying C<use feature 'unicode_strings'> currently makes sure that all EBCDIC characters have the same +B<case changing only> semantics as their corresponding Latin-1 characters. + =head1 FEATURE BUNDLES It's possible to load a whole slew of features in one go, using @@ -164,6 +243,7 @@ sub import { unknown_feature($name); } $^H{$feature{$name}} = 1; + $^H |= $hint_uni8bit if $name eq 'unicode_strings'; } } @@ -173,6 +253,7 @@ sub unimport { # A bare C<no feature> should disable *all* features if (!@_) { delete @^H{ values(%feature) }; + $^H &= ~ $hint_uni8bit; return; } @@ -194,6 +275,7 @@ sub unimport { } else { delete $^H{$feature{$name}}; + $^H &= ~ $hint_uni8bit if $name eq 'unicode_strings'; } } } diff --git a/lib/legacy.t b/lib/feature/unicode_strings.t similarity index 98% rename from lib/legacy.t rename to lib/feature/unicode_strings.t index 1f0cce9..dce34bd 100644 --- a/lib/legacy.t +++ b/lib/feature/unicode_strings.t @@ -84,7 +84,7 @@ for my $prefix (\%empty, \%posix, \%cyrillic, \%latin1) { my $cp = sprintf "U+%04X", $i; # First try using latin1 (Unicode) semantics. - no legacy "unicode8bit"; + use feature "unicode_strings"; my $phrase = 'with uni8bit'; my $char = chr($i); @@ -112,7 +112,7 @@ for my $prefix (\%empty, \%posix, \%cyrillic, \%latin1) { } # Then try with posix semantics. - use legacy "unicode8bit"; + no feature "unicode_strings"; $phrase = 'no uni8bit'; # These don't contribute anything in this case. 
diff --git a/lib/legacy.pm b/lib/legacy.pm deleted file mode 100755 index 1ea7c07..0000000 --- a/lib/legacy.pm +++ /dev/null @@ -1,199 +0,0 @@ -package legacy; - -our $VERSION = '1.00'; - -$unicode8bit::hint_not_uni8bit = 0x00000800; - -my %legacy_bundle = ( - "5.10" => [qw(unicode8bit)], - "5.11" => [qw(unicode8bit)], -); - -my %legacy = ( 'unicode8bit' => '0' ); - -=head1 NAME - -legacy - Perl pragma to preserve legacy behaviors or enable new non-default behaviors - -=head1 SYNOPSIS - - use legacy ':5.10'; # Keeps semantics the same as in perl 5.10 - - use legacy qw(unicode8bit); - - no legacy; - - no legacy qw(unicode8bit); - -=head1 DESCRIPTION - -Some programs may rely on behaviors that for others are problematic or -even wrong. A new version of Perl may change behaviors from past ones, -and when it is viewed that the old way of doing things may be required -to still be supported, the new behavior will be able to be turned off by using -this pragma. - -Additionally, a new behavior may be supported in a new version of Perl, but -for whatever reason the default remains the old one. This pragma can enable -the new behavior. - -Like other pragmas (C<use feature>, for example), C<use legacy 'foo'> will -only make the legacy behavior for "foo" available from that point to the end of -the enclosing block. - -=head2 B<use legacy> - -Preserve the old way of doing things when a new version of Perl is -released that would otherwise change the behavior. - -The one current possibility is: - -=head3 unicode8bit - -Use legacy semantics for the 128 characters on ASCII systems that have the 8th -bit set. (See L</EBCDIC platforms> below for EBCDIC systems.) Unless -C<use locale> is specified, or the scalar containing such a character is -known by Perl to be encoded in UTF8, the semantics are essentially that the -characters have an ordinal number, and that's it. They are caseless, and -aren't anything: they're not controls, not letters, not punctuation, ..., not -anything. 
- -This behavior stems from when Perl did not support Unicode, and ASCII was the -only known character set outside of C<use locale>. In order to not -possibly break pre-Unicode programs, these characters have retained their old -non-meanings, except when it is clear to Perl that Unicode is what is meant, -for example by calling utf8::upgrade() on a scalar, or if the scalar also -contains characters that are only available in Unicode. Then these 128 -characters take on their Unicode meanings. - -The problem with this behavior is that a scalar that encodes these characters -has a different meaning depending on if it is stored as utf8 or not. -In general, the internal storage method should not affect the -external behavior. - -The behavior is known to have effects on these areas: - -=over 4 - -=item * - -Changing the case of a scalar, that is, using C<uc()>, C<lc()>, C<ucfirst()>, -and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression -substitutions. - -=item * - -Using caseless (C</i>) regular expression matching - -=item * - -Matching a number of properties in regular expressions, such as C<\w> - -=item * - -User-defined case change mappings. You can create a C<ToUpper()> function, for -example, which overrides Perl's built-in case mappings. The scalar must be -encoded in utf8 for your function to actually be invoked. - -=back - -B<This lack of semantics for these characters is currently the default,> -outside of C<use locale>. See below for EBCDIC. -To turn on B<case changing semantics only> for these characters, use -C<no legacy>. -The other legacy behaviors regarding these characters are currently -unaffected by this pragma. - -=head4 EBCDIC platforms - -On EBCDIC platforms, the situation is somewhat different. The legacy -semantics are whatever the underlying semantics of the native C language -library are. Each of the three EBCDIC encodings currently known by Perl is an -isomorph of the Latin-1 character set. That means every character in Latin-1 -has a corresponding EBCDIC equivalent, and vice-versa. 
Specifying C<no legacy> currently makes sure that all EBCDIC characters have the same -B<case changing only> semantics as their corresponding Latin-1 characters. - -=head2 B<no legacy> - -Turn on a new behavior in a version of Perl that understands -it but has it turned off by default. For example, C<no legacy 'foo'> turns on -behavior C<foo> in the lexical scope of the pragma. C<no legacy> -without any modifier turns on all new behaviors known to the pragma. - -=head1 LEGACY BUNDLES - -It's possible to turn off all new behaviors past a given release by -using a I<legacy bundle>, which is the name of the release prefixed with -a colon, to distinguish it from an individual legacy behavior. - -Specifying sub-versions such as the C<0> in C<5.10.0> in legacy bundles has -no effect: legacy bundles are guaranteed to be the same for all sub-versions. - -Legacy bundles are not allowed with C<no legacy>. - -=cut - -sub import { - my $class = shift; - if (@_ == 0) { - croak("No legacy behaviors specified"); - } - while (@_) { - my $name = shift(@_); - if (substr($name, 0, 1) eq ":") { - my $v = substr($name, 1); - if (!exists $legacy_bundle{$v}) { - $v =~ s/^([0-9]+)\.([0-9]+).[0-9]+$/$1.$2/; - if (!exists $legacy_bundle{$v}) { - unknown_legacy_bundle(substr($name, 1)); - } - } - unshift @_, @{$legacy_bundle{$v}}; - next; - } - $^H |= $unicode8bit::hint_not_uni8bit; # The only valid thing as of yet - } -} - - -sub unimport { - my $class = shift; - - # A bare C<no legacy> should disable *all* legacy behaviors - if (!@_) { - unshift @_, keys(%legacy); - } - - while (@_) { - my $name = shift; - if (substr($name, 0, 1) eq ":") { - croak(sprintf('Legacy bundles (%s) are not allowed in "no legacy"', - $name)); - } - if (!exists($legacy{$name})) { - unknown_legacy($name); - } - else { - $^H &= ~ $unicode8bit::hint_not_uni8bit; # The only valid thing now - } - } -} - -sub unknown_legacy { - my $legacy = shift; - croak(sprintf('Legacy "%s" is not supported by Perl %vd', $legacy, $^V)); -} - -sub unknown_legacy_bundle { - my $legacy = shift; - croak(sprintf('Legacy bundle "%s" is not supported 
by Perl %vd', - $legacy, $^V)); -} - -sub croak { - require Carp; - Carp::croak(@_); -} - -1; diff --git a/perl.h b/perl.h index adff169..5988e78 100644 --- a/perl.h +++ b/perl.h @@ -4773,7 +4773,7 @@ enum { /* pass one of these to get_vtbl */ #define HINT_BLOCK_SCOPE 0x00000100 #define HINT_STRICT_SUBS 0x00000200 /* strict pragma */ #define HINT_STRICT_VARS 0x00000400 /* strict pragma */ -#define HINT_NOT_UNI_8_BIT 0x00000800 /* unicode8bit pragma */ +#define HINT_UNI_8_BIT 0x00000800 /* unicode_strings feature */ /* The HINT_NEW_* constants are used by the overload pragma */ #define HINT_NEW_INTEGER 0x00001000 diff --git a/t/lib/feature/bundle b/t/lib/feature/bundle index a869c75..11fde32 100644 --- a/t/lib/feature/bundle +++ b/t/lib/feature/bundle @@ -7,6 +7,25 @@ say "Hello", "world"; EXPECT Helloworld ######## +# Standard feature bundle, no 5.11 +use feature ":5.10"; +say ord uc chr 233; +EXPECT +233 +######## +# Standard feature bundle, 5.11 +use feature ":5.11"; +say ord uc chr 233; +EXPECT +201 +######## +# Standard feature bundle, 5.11 +use feature ":5.11"; +use utf8; +say ord "\ué"; # this is utf8 +EXPECT +201 +######## # more specific: 5.10.0 maps to 5.10 use feature ":5.10.0"; say "Hello", "world"; diff --git a/t/uni/overload.t b/t/uni/overload.t index da9b07b..7bf4841 100644 --- a/t/uni/overload.t +++ b/t/uni/overload.t @@ -35,7 +35,7 @@ package main; # These tests are based on characters 128-255 not having latin1, and hence # Unicode, semantics -use legacy 'unicode8bit'; +# no feature "unicode_strings"; # Bug 34297 foreach my $t ("ASCII", "B\366se") { diff --git a/toke.c b/toke.c index db9eca3..19241c4 100644 --- a/toke.c +++ b/toke.c @@ -583,7 +583,7 @@ S_missingterm(pTHX_ char *s) ((0 != (PL_hints & HINT_LOCALIZE_HH)) \ && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name))) /* The longest string we pass in. 
*/ -#define MAX_FEATURE_LEN (sizeof("switch")-1) +#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1) /* * S_feature_is_enabled diff --git a/utf8.h b/utf8.h index 8fef274..9eed545 100644 --- a/utf8.h +++ b/utf8.h @@ -206,7 +206,7 @@ encoded character. #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES) -#define IN_UNI_8_BIT ( (! (CopHINTS_get(PL_curcop) & HINT_NOT_UNI_8_BIT)) \ +#define IN_UNI_8_BIT ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT) \ && ! IN_LOCALE_RUNTIME && ! IN_BYTES) #define UTF8_ALLOW_EMPTY 0x0001