From: Karl Williamson Date: Fri, 13 Nov 2009 05:40:21 +0000 (-0700) Subject: add code for Unicode semantics for non-utf8 latin1 chars X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=00f254e235ff10d6223aa9a402ad5b7a85689829;p=p5sagit%2Fp5-mst-13.2.git add code for Unicode semantics for non-utf8 latin1 chars --- diff --git a/MANIFEST b/MANIFEST index 93042f6..22db6a3 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3476,6 +3476,7 @@ lib/integer.pm For "use integer" lib/integer.t For "use integer" testing lib/Internals.t For Internals::* testing lib/legacy.pm Pragma to preserve legacy behavior +lib/legacy.t For "use legacy" testing lib/less.pm For "use less" lib/less.t See if less support works lib/locale.pm For "use locale" diff --git a/handy.h b/handy.h index 9ec64e0..848cc0e 100644 --- a/handy.h +++ b/handy.h @@ -429,7 +429,7 @@ Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) alphanumeric character (including underscore) or digit. =for apidoc Am|bool|isALPHA|char ch -Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) alphabetic character. =for apidoc Am|bool|isSPACE|char ch @@ -479,7 +479,9 @@ US-ASCII (Basic Latin) range are viewed as not having any case. # define isPUNCT(c) ispunct(c) # define isXDIGIT(c) isxdigit(c) # define toUPPER(c) toupper(c) +# define toUPPER_LATIN1_MOD(c) UNI_TO_NATIVE(PL_mod_latin1_uc[(U8) NATIVE_TO_UNI(c)]) # define toLOWER(c) tolower(c) +# define toLOWER_LATIN1(c) UNI_TO_NATIVE(PL_latin1_lc[(U8) NATIVE_TO_UNI(c)]) #else # define isUPPER(c) ((c) >= 'A' && (c) <= 'Z') # define isLOWER(c) ((c) >= 'a' && (c) <= 'z') @@ -490,6 +492,15 @@ US-ASCII (Basic Latin) range are viewed as not having any case. # define isPRINT(c) (((c) >= 32 && (c) < 127)) # define isPUNCT(c) (((c) >= 33 && (c) <= 47) || ((c) >= 58 && (c) <= 64) || ((c) >= 91 && (c) <= 96) || ((c) >= 123 && (c) <= 126)) # define isXDIGIT(c) (isDIGIT(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F')) + +/* Use table lookup for speed */ +# define toLOWER_LATIN1(c) (PL_latin1_lc[(U8) c]) + +/* Modified uc. Is correct uc except for three non-ascii chars which are + * all mapped to one of them, and these need special handling */ +# define toUPPER_LATIN1_MOD(c) (PL_mod_latin1_uc[(U8) c]) + +/* ASCII casing. */ # define toUPPER(c) (isLOWER(c) ? (c) - ('a' - 'A') : (c)) # define toLOWER(c) (isUPPER(c) ? (c) + ('a' - 'A') : (c)) #endif diff --git a/lib/legacy.pm b/lib/legacy.pm index a1f21a6..3ffea69 100755 --- a/lib/legacy.pm +++ b/lib/legacy.pm @@ -2,7 +2,7 @@ package legacy; our $VERSION = '1.00'; -$unicode8bit::hint_bits = 0x00000800; +$unicode8bit::hint_uni8bit = 0x00000800; my %legacy_bundle = ( "5.10" => [qw(unicode8bit)], @@ -20,21 +20,19 @@ behaviors use legacy ':5.10'; # Keeps semantics the same as in perl 5.10 - no legacy; - -=cut + use legacy qw(unicode8bit); - #no legacy qw(unicode8bit); + no legacy; -=pod + no legacy qw(unicode8bit); =head1 DESCRIPTION Some programs may rely on behaviors that for others are problematic or even wrong. A new version of Perl may change behaviors from past ones, and when it is viewed that the old way of doing things may be required -to still be supported, that behavior will be added to the list recognized -by this pragma to allow that. +to still be supported, the new behavior will be able to be turned off by using +this pragma. Additionally, a new behavior may be supported in a new version of Perl, but for whatever reason the default remains the old one. This pragma can enable @@ -44,24 +42,92 @@ Like other pragmas (C, for example), C will only make the legacy behavior for "foo" available from that point to the end of the enclosing block. -B - =head2 B Preserve the old way of doing things when a new version of Perl is -released that changes things +released that would otherwise change the behavior. + +The one current possibility is: + +=head3 unicode8bit + +THIS IS SUBJECT TO CHANGE + +Use legacy semantics for the 128 characters on ASCII systems that have the 8th +bit set. (See L below for EBCDIC systems.) Unless +C> is specified, or the scalar containing such a character is +known by Perl to be encoded in UTF8, the semantics are essentially that the +characters have an ordinal number, and that's it. They are caseless, and +aren't anything: they're not controls, not letters, not punctuation, ..., not +anything. + +This behavior stems from when Perl did not support Unicode, and ASCII was the +only known character set outside of C>. In order to not +possibly break pre_Unicode programs, these characters have retained their old +non-meanings, except when it is clear to Perl that Unicode is what is meant, +for example by calling utf::upgrade() on a scalar, or if the scalar also +contains characters that are only available in Unicode. Then these 128 +characters take on their Unicode meanings. + +The problem with this behavior is that a scalar that encodes these characters +has a different meaning depending on if it is stored as utf8 or not. +In general, the internal storage method should not affect the +external behavior. + +The behavior is known to have effects on these areas: + +=over 4 + +=item + +Changing the case of a scalar, that is, using C, +C, +C, +and C, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression substitutions. + +=item + +Using caseless (C) regular expression matching + +=item + +Matching a number of properties in regular expressions, such as C<\w> + +=item + +User-defined case change mappings. You can create a C function, for +example, which overrides Perl's built-in case mappings. The scalar must be +encoded in utf8 for your function to actually be invoked. + +=back + +B +outside of C. See below for EBCDIC. +To turn on B for these characters, use +C>. +The other legacy behaviors regarding these characters are currently +unaffected by this pragma. + +=head4 EBCDIC platforms + +On EBCDIC platforms, the situation is somewhat different. The legacy +semantics are whatever the underlying semantics of the native C language +library are. Each of the three EBCDIC encodings currently known by Perl is an +isomorph of the Latin-1 character set. That means every character in Latin-1 +has a corresponding EBCDIC equivalent, and vice-versa. Specifying C> currently makes sure that all EBCDIC characters have the same +B semantics as their corresponding Latin-1 characters. =head2 B Turn on a new behavior in a version of Perl that understands it but has it turned off by default. For example, C turns on -behavior C in the lexical scope of the pragma. Simply C -turns on all new behaviors known to the pragma. +behavior C in the lexical scope of the pragma. C +without any modifier turns on all new behaviors known to the pragma. =head1 LEGACY BUNDLES -It's possible to turn off all new behaviors past a given release by +It's possible to turn off all new behaviors past a given release by using a I, which is the name of the release prefixed with a colon, to distinguish it from an individual legacy behavior. @@ -93,7 +159,7 @@ sub import { if (!exists $legacy{$name}) { unknown_legacy($name); } - $^H &= ~$unicode8bit::hint_bits; # The only thing it could be as of yet + $^H &= ~$unicode8bit::hint_uni8bit; # The only valid thing as of yet } } @@ -116,7 +182,7 @@ sub unimport { unknown_legacy($name); } else { - $^H |= $unicode8bit::hint_bits; # The only thing it could be as of yet + $^H |= $unicode8bit::hint_uni8bit; # The only valid thing as of yet } } } diff --git a/lib/legacy.t b/lib/legacy.t new file mode 100644 index 0000000..1d332b7 --- /dev/null +++ b/lib/legacy.t @@ -0,0 +1,146 @@ +use warnings; +use strict; + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require './test.pl'; +} + +#use Test::More; + +#plan("no_plan"); +plan(13312); + +# First compute the case mappings without resorting to the functions we're +# testing. + +# Initialize the arrays so each $i maps to itself. +my @posix_to_upper; +for my $i (0 .. 255) { + $posix_to_upper[$i] = chr($i); +} +my @posix_to_lower += my @posix_to_title += my @latin1_to_upper += my @latin1_to_lower += my @latin1_to_title += @posix_to_upper; + +# Override the elements in the to_lower arrays that have different lower case +# mappings with those mappings. +for my $i (0x41 .. 0x5A) { + $posix_to_lower[$i] = chr(ord($posix_to_lower[$i]) + 32); + $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32); +} + +# Same for upper and title +for my $i (0x61 .. 0x7A) { + $posix_to_upper[$i] = chr(ord($posix_to_upper[$i]) - 32); + $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32); + $posix_to_title[$i] = chr(ord($posix_to_title[$i]) - 32); + $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32); +} + +# And the same for those in the latin1 range +for my $i (0xC0 .. 0xD6, 0xD8 .. 0xDE) { + $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32); +} +for my $i (0xE0 .. 0xF6, 0xF8 .. 0xFE) { + $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32); + $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32); +} + +# Override the abnormal cases. +$latin1_to_upper[0xB5] = chr(0x39C); +$latin1_to_title[0xB5] = chr(0x39C); +$latin1_to_upper[0xDF] = 'SS'; +$latin1_to_title[0xDF] = 'Ss'; +$latin1_to_upper[0xFF] = chr(0x178); +$latin1_to_title[0xFF] = chr(0x178); + +my $repeat = 25; # Length to make strings. + +# Create hashes of strings in several ranges, both for uc and lc. +my %posix; +$posix{'uc'} = 'A' x $repeat; +$posix{'lc'} = 'a' x $repeat ; + +my %cyrillic; +$cyrillic{'uc'} = chr(0x42F) x $repeat; +$cyrillic{'lc'} = chr(0x44F) x $repeat; + +my %latin1; +$latin1{'uc'} = chr(0xD8) x $repeat; +$latin1{'lc'} = chr(0xF8) x $repeat; + +my %empty; +$empty{'lc'} = $empty{'uc'} = ""; + +# Loop so prefix each character being tested with nothing, and the various +# strings; then loop for suffixes of those strings as well. +for my $prefix (\%empty, \%posix, \%cyrillic, \%latin1) { + for my $suffix (\%empty, \%posix, \%cyrillic, \%latin1) { + for my $i (0 .. 255) { # For each possible posix or latin1 character + my $cp = sprintf "%02X", $i; + + # First try using latin1 (Unicode) semantics. + no legacy "unicode8bit"; + + my $phrase = 'with unicode'; + my $char = chr($i); + my $pre_lc = $prefix->{'lc'}; + my $pre_uc = $prefix->{'uc'}; + my $post_lc = $suffix->{'lc'}; + my $post_uc = $suffix->{'uc'}; + my $to_upper = $pre_lc . $char . $post_lc; + my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc; + my $to_lower = $pre_uc . $char . $post_uc; + my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc; + + is (uc($to_upper), $expected_upper, + + # The names are commented out for now to avoid 'wide character + # in print' messages. + ); #"$cp: $phrase: uc('$to_upper') eq '$expected_upper'"); + is (lc($to_lower), $expected_lower, + ); #"$cp: $phrase: lc('$to_lower') eq '$expected_lower'"); + + if ($pre_uc eq "") { # Title case if null prefix. + my $expected_title = $latin1_to_title[$i] . $post_lc; + is (ucfirst($to_upper), $expected_title, + ); #"$cp: $phrase: ucfirst('$to_upper') eq '$expected_title'"); + my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc; + is (lcfirst($to_lower), $expected_lcfirst, + ); #"$cp: $phrase: lcfirst('$to_lower') eq '$expected_lcfirst'"); + } + + # Then try with posix semantics. + use legacy "unicode8bit"; + $phrase = 'no unicode'; + + # These don't contribute anything in this case. + next if $suffix == \%cyrillic; + next if $suffix == \%latin1; + next if $prefix == \%cyrillic; + next if $prefix == \%latin1; + + $expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc; + $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc; + + is (uc($to_upper), $expected_upper, + ); #"$cp: $phrase: uc('$to_upper') eq '$expected_upper'"); + is (lc($to_lower), $expected_lower, + ); #"$cp: $phrase: lc('$to_lower') eq '$expected_lower'"); + + if ($pre_uc eq "") { + my $expected_title = $posix_to_title[$i] . $post_lc; + is (ucfirst($to_upper), $expected_title, + ); #"$cp: $phrase: ucfirst('$to_upper') eq '$expected_title'"); + my $expected_lcfirst = $posix_to_lower[$i] . $post_uc; + is (lcfirst($to_lower), $expected_lcfirst, + ); #"$cp: $phrase: lcfirst('$to_lower') eq '$expected_lcfirst'"); + } + } + } +} diff --git a/perl.h b/perl.h index 874d0c3..fe6b7fc 100644 --- a/perl.h +++ b/perl.h @@ -28,7 +28,7 @@ #ifdef VOIDUSED # undef VOIDUSED -#endif +#endif #define VOIDUSED 1 #ifdef PERL_MICRO @@ -270,13 +270,13 @@ #define CALLREG_PACKAGE(rx) \ CALL_FPTR(RX_ENGINE(rx)->qr_package)(aTHX_ (rx)) -#if defined(USE_ITHREADS) +#if defined(USE_ITHREADS) #define CALLREGDUPE(prog,param) \ Perl_re_dup(aTHX_ (prog),(param)) #define CALLREGDUPE_PVT(prog,param) \ (prog ? CALL_FPTR(RX_ENGINE(prog)->dupe)(aTHX_ (prog),(param)) \ - : (REGEXP *)NULL) + : (REGEXP *)NULL) #endif @@ -310,7 +310,7 @@ # define PERL_UNUSED_DECL # endif #endif - + /* gcc -Wall: * for silencing unused variables that are actually used most of the time, * but we cannot quite get rid of, such as "ax" in PPCODE+noargs xsubs @@ -947,7 +947,7 @@ EXTERN_C int usleep(unsigned int); #define PERL_DEFAULT_DO_EXEC3_IMPLEMENTATION #endif -/* Cannot include embed.h here on Win32 as win32.h has not +/* Cannot include embed.h here on Win32 as win32.h has not yet been included and defines some config variables e.g. HAVE_INTERP_INTERN */ #if !defined(PERL_FOR_X2P) && !(defined(WIN32)||defined(VMS)) @@ -1198,7 +1198,7 @@ EXTERN_C int usleep(unsigned int); #endif /* In Tru64 use the 4.4BSD struct msghdr, not the 4.3 one. - * This is important for using IPv6. + * This is important for using IPv6. * For OSF/1 3.2, however, defining _SOCKADDR_LEN would be * a bad idea since it breaks send() and recv(). */ #if defined(__osf__) && defined(__alpha) && !defined(_SOCKADDR_LEN) && !defined(DEC_OSF1_3_X) @@ -2780,7 +2780,7 @@ freeing any remaining Perl interpreters. # define HASATTRIBUTE_WARN_UNUSED_RESULT # endif #endif -#endif /* #ifndef PERL_MICRO */ +#endif /* #ifndef PERL_MICRO */ /* USE_5005THREADS needs to be after unixish.h as includes * which defines NSIG - which will stop inclusion of @@ -2871,7 +2871,7 @@ typedef pthread_key_t perl_key; /* This is complicated. The child processes return a true native VMS status which must be saved. But there is an assumption in Perl that the UNIX child status has some relationship to errno values, so - Perl tries to translate it to text in some of the tests. + Perl tries to translate it to text in some of the tests. In order to get the string translation correct, for the error, errno must be EVMSERR, but that generates a different text message than what the test programs are expecting. So an errno value must @@ -3131,16 +3131,16 @@ typedef pthread_key_t perl_key; # define PERL_SET_THX(t) PERL_SET_CONTEXT(t) #endif -/* +/* This replaces the previous %_ "hack" by the "%p" hacks. All that is required is that the perl source does not - use "%-p" or "%-p" or "%p" formats. - These formats will still work in perl code. + use "%-p" or "%-p" or "%p" formats. + These formats will still work in perl code. See comments in sv.c for futher details. Robin Barker 2005-07-14 - No longer use %1p for VDf = %vd. RMB 2007-10-19 + No longer use %1p for VDf = %vd. RMB 2007-10-19 */ #ifndef SVf_ @@ -3162,7 +3162,7 @@ typedef pthread_key_t perl_key; #define SVfARG(p) ((void*)(p)) #ifdef PERL_CORE -/* not used; but needed for backward compatibilty with XS code? - RMB */ +/* not used; but needed for backward compatibilty with XS code? - RMB */ # undef VDf #else # ifndef VDf @@ -3171,7 +3171,7 @@ typedef pthread_key_t perl_key; #endif #ifdef PERL_CORE -/* not used; but needed for backward compatibilty with XS code? - RMB */ +/* not used; but needed for backward compatibilty with XS code? - RMB */ # undef UVf #else # ifndef UVf @@ -3251,7 +3251,7 @@ typedef pthread_key_t perl_key; #ifdef PRINTF_FORMAT_NULL_OK # define __attribute__format__null_ok__(x,y,z) __attribute__format__(x,y,z) #else -# define __attribute__format__null_ok__(x,y,z) +# define __attribute__format__null_ok__(x,y,z) #endif #ifdef HAS_BUILTIN_EXPECT @@ -3354,7 +3354,7 @@ typedef struct crypt_data { /* straight from /usr/include/crypt.h */ #endif #ifdef __LIBCATAMOUNT__ -#undef HAS_PASSWD /* unixish.h but not unixish enough. */ +#undef HAS_PASSWD /* unixish.h but not unixish enough. */ #undef HAS_GROUP #define FAKE_BIT_BUCKET #endif @@ -4345,9 +4345,85 @@ EXTCONST unsigned char PL_fold[] = { 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 }; -#endif /* !EBCDIC */ -#else +#endif /* !EBCDIC, but still in DOINIT */ + +/* If these tables are accessed through ebcdic, the access will be converted to + * latin1 first */ +EXTCONST unsigned char PL_latin1_lc[] = { /* lowercasing */ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', + 'x', 'y', 'z', 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192+32, 193+32, 194+32, 195+32, 196+32, 197+32, 198+32, 199+32, + 200+32, 201+32, 202+32, 203+32, 204+32, 205+32, 206+32, 207+32, + 208+32, 209+32, 210+32, 211+32, 212+32, 213+32, 214+32, 215, + 216+32, 217+32, 218+32, 219+32, 220+32, 221+32, 222+32, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255 +}; + +/* upper and title case of latin1 characters, modified so that the three tricky + * ones are mapped to 255 (which is one of the three) */ +EXTCONST unsigned char PL_mod_latin1_uc[] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', + 'X', 'Y', 'Z', 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 255 /*micro*/, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 255 /*sharp s*/, + 224-32, 225-32, 226-32, 227-32, 228-32, 229-32, 230-32, 231-32, + 232-32, 233-32, 234-32, 235-32, 236-32, 237-32, 238-32, 239-32, + 240-32, 241-32, 242-32, 243-32, 244-32, 245-32, 246-32, 247, + 248-32, 249-32, 250-32, 251-32, 252-32, 253-32, 254-32, 255 +}; +#else /* ! DOINIT */ EXTCONST unsigned char PL_fold[]; +EXTCONST unsigned char PL_mod_latin1_uc[]; +EXTCONST unsigned char PL_latin1_lc[]; #endif #ifndef PERL_GLOBAL_STRUCT /* or perlvars.h */ @@ -5970,8 +6046,8 @@ extern void moncontrol(int); #define NO_ENV_ARRAY_IN_MAIN #endif -/* These are used by Perl_pv_escape() and Perl_pv_pretty() - * are here so that they are available throughout the core +/* These are used by Perl_pv_escape() and Perl_pv_pretty() + * are here so that they are available throughout the core * NOTE that even though some are for _escape and some for _pretty * there must not be any clashes as the flags from _pretty are * passed straight through to _escape. @@ -5985,7 +6061,7 @@ extern void moncontrol(int); #define PERL_PV_ESCAPE_FIRSTCHAR 0x0008 -#define PERL_PV_ESCAPE_UNI 0x0100 +#define PERL_PV_ESCAPE_UNI 0x0100 #define PERL_PV_ESCAPE_UNI_DETECT 0x0200 #define PERL_PV_ESCAPE_ALL 0x1000 diff --git a/pp.c b/pp.c index 7641b54..eaeb89f 100644 --- a/pp.c +++ b/pp.c @@ -3525,22 +3525,97 @@ PP(pp_crypt) #endif } +/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So + * most comments below say UTF-8, when in fact they mean UTF-EBCDIC as well */ + +/* Both the characters below can be stored in two UTF-8 bytes. In UTF-8 the max + * character that 2 bytes can hold is U+07FF, and in UTF-EBCDIC it is U+03FF. + * See http://www.unicode.org/unicode/reports/tr16 */ +#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */ +#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRON */ + +/* Below are several macros that generate code */ +/* Generates code to store a unicode codepoint c that is known to occupy + * exactly two UTF-8 and UTF-EBCDIC bytes; it is stored into p and p+1. */ +#define STORE_UNI_TO_UTF8_TWO_BYTE(p, c) \ + STMT_START { \ + *(p) = UTF8_TWO_BYTE_HI(c); \ + *((p)+1) = UTF8_TWO_BYTE_LO(c); \ + } STMT_END + +/* Like STORE_UNI_TO_UTF8_TWO_BYTE, but advances p to point to the next + * available byte after the two bytes */ +#define CAT_UNI_TO_UTF8_TWO_BYTE(p, c) \ + STMT_START { \ + *(p)++ = UTF8_TWO_BYTE_HI(c); \ + *((p)++) = UTF8_TWO_BYTE_LO(c); \ + } STMT_END + +/* Generates code to store the upper case of latin1 character l which is known + * to have its upper case be non-latin1 into the two bytes p and p+1. There + * are only two characters that fit this description, and this macro knows + * about them, and that the upper case values fit into two UTF-8 or UTF-EBCDIC + * bytes */ +#define STORE_NON_LATIN1_UC(p, l) \ +STMT_START { \ + if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \ + STORE_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); \ + } else { /* Must be the following letter */ \ + STORE_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU); \ + } \ +} STMT_END + +/* Like STORE_NON_LATIN1_UC, but advances p to point to the next available byte + * after the character stored */ +#define CAT_NON_LATIN1_UC(p, l) \ +STMT_START { \ + if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \ + CAT_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); \ + } else { \ + CAT_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU); \ + } \ +} STMT_END + +/* Generates code to add the two UTF-8 bytes (probably u) that are the upper + * case of l into p and p+1. u must be the result of toUPPER_LATIN1_MOD(l), + * and must require two bytes to store it. Advances p to point to the next + * available position */ +#define CAT_TWO_BYTE_UNI_UPPER_MOD(p, l, u) \ +STMT_START { \ + if ((u) != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \ + CAT_UNI_TO_UTF8_TWO_BYTE((p), (u)); /* not special, just save it */ \ + } else if (l == LATIN_SMALL_LETTER_SHARP_S) { \ + *(p)++ = 'S'; *(p)++ = 'S'; /* upper case is 'SS' */ \ + } else {/* else is one of the other two special cases */ \ + CAT_NON_LATIN1_UC((p), (l)); \ + } \ +} STMT_END + PP(pp_ucfirst) { + /* Actually is both lcfirst() and ucfirst(). Only the first character + * changes. This means that possibly we can change in-place, ie., just + * take the source and change that one character and store it back, but not + * if read-only etc, or if the length changes */ + dVAR; dSP; SV *source = TOPs; - STRLEN slen; + STRLEN slen; /* slen is the byte length of the whole SV. */ STRLEN need; SV *dest; - bool inplace = TRUE; - bool doing_utf8; + bool inplace; /* ? Convert first char only, in-place */ + bool doing_utf8 = FALSE; /* ? using utf8 */ + bool convert_source_to_utf8 = FALSE; /* ? need to convert */ const int op_type = PL_op->op_type; const U8 *s; U8 *d; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN ulen; - STRLEN tculen; + STRLEN ulen; /* ulen is the byte length of the original Unicode character + * stored as UTF-8 at s. */ + STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or + * lowercased) character stored in tmpbuf. May be either + * UTF-8 or not, but in either case is the number of bytes */ SvGETMAGIC(source); if (SvOK(source)) { @@ -3552,25 +3627,187 @@ PP(pp_ucfirst) slen = 0; } - if (slen && DO_UTF8(source) && UTF8_IS_START(*s)) { + /* We may be able to get away with changing only the first character, in + * place, but not if read-only, etc. Later we may discover more reasons to + * not convert in-place. */ + inplace = SvPADTMP(source) && !SvREADONLY(source) && SvTEMP(source); + + /* First calculate what the changed first character should be. This affects + * whether we can just swap it out, leaving the rest of the string unchanged, + * or even if have to convert the dest to UTF-8 when the source isn't */ + + if (! slen) { /* If empty */ + need = 1; /* still need a trailing NUL */ + } + else if (DO_UTF8(source)) { /* Is the source utf8? */ doing_utf8 = TRUE; - utf8_to_uvchr(s, &ulen); - if (op_type == OP_UCFIRST) { - toTITLE_utf8(s, tmpbuf, &tculen); - } else { - toLOWER_utf8(s, tmpbuf, &tculen); + +/* TODO: This is #ifdefd out because it has hard-coded the standard mappings, + * and doesn't allow for the user to specify their own. When code is added to + * detect if there is a user-defined mapping in force here, and if so to use + * that, then the code below can be compiled. The detection would be a good + * thing anyway, as currently the user-defined mappings only work on utf8 + * strings, and thus depend on the chosen internal storage method, which is a + * bad thing */ +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + if (UTF8_IS_INVARIANT(*s)) { + + /* An invariant source character is either ASCII or, in EBCDIC, an + * ASCII equivalent or a caseless C1 control. In both these cases, + * the lower and upper cases of any character are also invariants + * (and title case is the same as upper case). So it is safe to + * use the simple case change macros which avoid the overhead of + * the general functions. Note that if perl were to be extended to + * do locale handling in UTF-8 strings, this wouldn't be true in, + * for example, Lithuanian or Turkic. */ + *tmpbuf = (op_type == OP_LCFIRST) ? toLOWER(*s) : toUPPER(*s); + tculen = ulen = 1; + need = slen + 1; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + U8 chr; + + /* Similarly, if the source character isn't invariant but is in the + * latin1 range (or EBCDIC equivalent thereof), we have the case + * changes compiled into perl, and can avoid the overhead of the + * general functions. In this range, the characters are stored as + * two UTF-8 bytes, and it so happens that any changed-case version + * is also two bytes (in both ASCIIish and EBCDIC machines). */ + tculen = ulen = 2; + need = slen + 1; + + /* Convert the two source bytes to a single Unicode code point + * value, change case and save for below */ + chr = UTF8_ACCUMULATE(*s, *(s+1)); + if (op_type == OP_LCFIRST) { /* lower casing is easy */ + U8 lower = toLOWER_LATIN1(chr); + STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, lower); + } + else { /* ucfirst */ + U8 upper = toUPPER_LATIN1_MOD(chr); + + /* Most of the latin1 range characters are well-behaved. Their + * title and upper cases are the same, and are also in the + * latin1 range. The macro above returns their upper (hence + * title) case, and all that need be done is to save the result + * for below. However, several characters are problematic, and + * have to be handled specially. The MOD in the macro name + * above means that these tricky characters all get mapped to + * the single character LATIN_SMALL_LETTER_Y_WITH_DIAERESIS. + * This mapping saves some tests for the majority of the + * characters */ + + if (upper != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { + + /* Not tricky. Just save it. */ + STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, upper); + } + else if (chr == LATIN_SMALL_LETTER_SHARP_S) { + + /* This one is tricky because it is two characters long, + * though the UTF-8 is still two bytes, so the stored + * length doesn't change */ + *tmpbuf = 'S'; /* The UTF-8 is 'Ss' */ + *(tmpbuf + 1) = 's'; + } + else { + + /* The other two have their title and upper cases the same, + * but are tricky because the changed-case characters + * aren't in the latin1 range. They, however, do fit into + * two UTF-8 bytes */ + STORE_NON_LATIN1_UC(tmpbuf, chr); + } + } } - /* If the two differ, we definately cannot do inplace. */ - inplace = (ulen == tculen); - need = slen + 1 - ulen + tculen; - } else { - doing_utf8 = FALSE; - need = slen + 1; + else { +#endif /* end of dont want to break user-defined casing */ + + /* Here, can't short-cut the general case */ + + utf8_to_uvchr(s, &ulen); + if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen); + else toLOWER_utf8(s, tmpbuf, &tculen); + + /* we can't do in-place if the length changes. */ + if (ulen != tculen) inplace = FALSE; + need = slen + 1 - ulen + tculen; +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + } +#endif } + else { /* Non-zero length, non-UTF-8, Need to consider locale and if + * latin1 is treated as caseless. Note that a locale takes + * precedence */ + tculen = 1; /* Most characters will require one byte, but this will + * need to be overridden for the tricky ones */ + need = slen + 1; + + if (op_type == OP_LCFIRST) { + + /* lower case the first letter: no trickiness for any character */ + *tmpbuf = (IN_LOCALE_RUNTIME) ? toLOWER_LC(*s) : + ((IN_UNI_8_BIT) ? toLOWER_LATIN1(*s) : toLOWER(*s)); + } + /* is ucfirst() */ + else if (IN_LOCALE_RUNTIME) { + *tmpbuf = toUPPER_LC(*s); /* This would be a bug if any locales + * have upper and title case different + */ + } + else if (! IN_UNI_8_BIT) { + *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or + * on EBCDIC machines whatever the + * native function does */ + } + else { /* is ucfirst non-UTF-8, not in locale, and cased latin1 */ + *tmpbuf = toUPPER_LATIN1_MOD(*s); + + /* tmpbuf now has the correct title case for all latin1 characters + * except for the several ones that have tricky handling. All + * of these are mapped by the MOD to the letter below. */ + if (*tmpbuf == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { + + /* The length is going to change, with all three of these, so + * can't replace just the first character */ + inplace = FALSE; + + /* We use the original to distinguish between these tricky + * cases */ + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + /* Two character title case 'Ss', but can remain non-UTF-8 */ + need = slen + 2; + *tmpbuf = 'S'; + *(tmpbuf + 1) = 's'; /* Assert: length(tmpbuf) >= 2 */ + tculen = 2; + } + else { + + /* The other two tricky ones have their title case outside + * latin1. It is the same as their upper case. */ + doing_utf8 = TRUE; + STORE_NON_LATIN1_UC(tmpbuf, *s); + + /* The UTF-8 and UTF-EBCDIC lengths of both these characters + * and their upper cases is 2. */ + tculen = ulen = 2; + + /* The entire result will have to be in UTF-8. Assume worst + * case sizing in conversion. (all latin1 characters occupy + * at most two bytes in utf8) */ + convert_source_to_utf8 = TRUE; + need = slen * 2 + 1; + } + } /* End of is one of the three special chars */ + } /* End of use Unicode (Latin1) semantics */ + } /* End of changing the case of the first character */ - if (SvPADTMP(source) && !SvREADONLY(source) && inplace && SvTEMP(source)) { - /* We can convert in place. */ + /* Here, have the first character's changed case stored in tmpbuf. Ready to + * generate the result */ + if (inplace) { + /* We can convert in place. This means we change just the first + * character without disturbing the rest; no need to grow */ dest = source; s = d = (U8*)SvPV_force_nomg(source, slen); } else { @@ -3578,53 +3815,83 @@ PP(pp_ucfirst) dest = TARG; + /* Here, we can't convert in place; we earlier calculated how much + * space we will need, so grow to accommodate that */ SvUPGRADE(dest, SVt_PV); d = (U8*)SvGROW(dest, need); (void)SvPOK_only(dest); SETs(dest); - - inplace = FALSE; } if (doing_utf8) { - if(!inplace) { - /* slen is the byte length of the whole SV. - * ulen is the byte length of the original Unicode character - * stored as UTF-8 at s. - * tculen is the byte length of the freshly titlecased (or - * lowercased) Unicode character stored as UTF-8 at tmpbuf. - * We first set the result to be the titlecased (/lowercased) - * character, and then append the rest of the SV data. */ - sv_setpvn(dest, (char*)tmpbuf, tculen); - if (slen > ulen) - sv_catpvn(dest, (char*)(s + ulen), slen - ulen); + if (! inplace) { + if (! convert_source_to_utf8) { + + /* Here both source and dest are in UTF-8, but have to create + * the entire output. We initialize the result to be the + * title/lower cased first character, and then append the rest + * of the string. */ + sv_setpvn(dest, (char*)tmpbuf, tculen); + if (slen > ulen) { + sv_catpvn(dest, (char*)(s + ulen), slen - ulen); + } + } + else { + const U8 *const send = s + slen; + + /* Here the dest needs to be in UTF-8, but the source isn't, + * except we earlier UTF-8'd the first character of the source + * into tmpbuf. First put that into dest, and then append the + * rest of the source, converting it to UTF-8 as we go. */ + + /* Assert tculen is 2 here because the only two characters that + * get to this part of the code have 2-byte UTF-8 equivalents */ + *d++ = *tmpbuf; + *d++ = *(tmpbuf + 1); + s++; /* We have just processed the 1st char */ + + for (; s < send; s++) { + d = uvchr_to_utf8(d, *s); + } + *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + } SvUTF8_on(dest); } - else { + else { /* in-place UTF-8. Just overwrite the first character */ Copy(tmpbuf, d, tculen, U8); SvCUR_set(dest, need - 1); } } - else { - if (*s) { + else { /* Neither source nor dest are in or need to be UTF-8 */ + if (slen) { if (IN_LOCALE_RUNTIME) { TAINT; SvTAINTED_on(dest); - *d = (op_type == OP_UCFIRST) - ? toUPPER_LC(*s) : toLOWER_LC(*s); } - else - *d = (op_type == OP_UCFIRST) ? toUPPER(*s) : toLOWER(*s); - } else { - /* See bug #39028 */ + if (inplace) { /* in-place, only need to change the 1st char */ + *d = *tmpbuf; + } + else { /* Not in-place */ + + /* Copy the case-changed character(s) from tmpbuf */ + Copy(tmpbuf, d, tculen, U8); + d += tculen - 1; /* Code below expects d to point to final + * character stored */ + } + } + else { /* empty source */ + /* See bug #39028: Don't taint if empty */ *d = *s; } + /* In a "use bytes" we don't treat the source as UTF-8, but, still want + * the destination to retain that flag */ if (SvUTF8(source)) SvUTF8_on(dest); - if (!inplace) { + if (!inplace) { /* Finish the rest of the string, unchanged */ /* This will copy the trailing NUL */ Copy(s + 1, d + 1, slen, U8); SvCUR_set(dest, need - 1); @@ -3636,7 +3903,7 @@ PP(pp_ucfirst) /* There's so much setup/teardown code common between uc and lc, I wonder if it would be worth merging the two, and just having a switch outside each - of the three tight loops. */ + of the three tight loops. There is less and less commonality though */ PP(pp_uc) { dVAR; @@ -3651,9 +3918,16 @@ PP(pp_uc) SvGETMAGIC(source); if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) - && SvTEMP(source) && !DO_UTF8(source)) { - /* We can convert in place. */ - + && SvTEMP(source) && !DO_UTF8(source) + && (IN_LOCALE_RUNTIME || ! IN_UNI_8_BIT)) { + + /* We can convert in place. The reason we can't if in UNI_8_BIT is to + * make the loop tight, so we overwrite the source with the dest before + * looking at it, and we need to look at the original source + * afterwards. There would also need to be code added to handle + * switching to not in-place in midstream if we run into characters + * that change the length. + */ dest = source; s = d = (U8*)SvPV_force_nomg(source, len); min = len + 1; @@ -3693,48 +3967,209 @@ PP(pp_uc) const U8 *const send = s + len; U8 tmpbuf[UTF8_MAXBYTES+1]; +/* This is ifdefd out because it needs more work and thought. It isn't clear + * that we should do it. These are hard-coded rules from the Unicode standard, + * and may change. 5.2 gives new guidance on the iota subscript, for example, + * which has not been checked against this; and secondly it may be that we are + * passed a subset of the context, via a \U...\E, for example, and its not + * clear what the best approach is to that */ +#ifdef CONTEXT_DEPENDENT_CASING + bool in_iota_subscript = FALSE; +#endif + while (s < send) { - const STRLEN u = UTF8SKIP(s); - STRLEN ulen; - - toUPPER_utf8(s, tmpbuf, &ulen); - if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { - /* If the eventually required minimum size outgrows - * the available space, we need to grow. */ - const UV o = d - (U8*)SvPVX_const(dest); - - /* If someone uppercases one million U+03B0s we SvGROW() one - * million times. Or we could try guessing how much to - allocate without allocating too much. Such is life. */ - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; +#ifdef CONTEXT_DEPENDENT_CASING + if (in_iota_subscript && ! is_utf8_mark(s)) { + /* A non-mark. Time to output the iota subscript */ +#define GREEK_CAPITAL_LETTER_IOTA 0x0399 +#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345 + + CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA); + in_iota_subscript = FALSE; + } +#endif + + +/* See comments at the first instance in this file of this ifdef */ +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + + /* If the UTF-8 character is invariant, then it is in the range + * known by the standard macro; result is only one byte long */ + if (UTF8_IS_INVARIANT(*s)) { + *d++ = toUPPER(*s); + s++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + + /* Likewise, if it fits in a byte, its case change is in our + * table */ + U8 orig = UTF8_ACCUMULATE(*s, *(s+1)); + U8 upper = toUPPER_LATIN1_MOD(orig); + CAT_TWO_BYTE_UNI_UPPER_MOD(d, orig, upper); + s += 2; + } + else { +#else + { +#endif + + /* Otherwise, need the general UTF-8 case. Get the changed + * case value and copy it to the output buffer */ + + const STRLEN u = UTF8SKIP(s); + STRLEN ulen; + +#ifndef CONTEXT_DEPENDENT_CASING + toUPPER_utf8(s, tmpbuf, &ulen); +#else + const UV uv = toUPPER_utf8(s, tmpbuf, &ulen); + if (uv == GREEK_CAPITAL_LETTER_IOTA && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI) { + in_iota_subscript = TRUE; + } + else { +#endif + if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { + /* If the eventually required minimum size outgrows + * the available space, we need to grow. */ + const UV o = d - (U8*)SvPVX_const(dest); + + /* If someone uppercases one million U+03B0s we + * SvGROW() one million times. Or we could try + * guessing how much to allocate without allocating too + * much. Such is life. See corresponding comment in lc code + * for another option */ + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + Copy(tmpbuf, d, ulen, U8); + d += ulen; +#ifdef CONTEXT_DEPENDENT_CASING + } +#endif + s += u; } - Copy(tmpbuf, d, ulen, U8); - d += ulen; - s += u; } +#ifdef CONTEXT_DEPENDENT_CASING + if (in_iota_subscript) CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA); +#endif SvUTF8_on(dest); *d = '\0'; SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); - } else { + } else { /* Not UTF-8 */ if (len) { const U8 *const send = s + len; + + /* Use locale casing if in locale; regular style if not treating + * latin1 as having case; otherwise the latin1 casing. Do the + * whole thing in a tight loop, for speed, */ if (IN_LOCALE_RUNTIME) { TAINT; SvTAINTED_on(dest); for (; s < send; d++, s++) *d = toUPPER_LC(*s); } - else { - for (; s < send; d++, s++) + else if (! IN_UNI_8_BIT) { + for (; s < send; d++, s++) { *d = toUPPER(*s); + } } - } + else { + for (; s < send; d++, s++) { + *d = toUPPER_LATIN1_MOD(*s); + if (*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) continue; + + /* The mainstream case is the tight loop above. To avoid + * extra tests in that, all three characters that require + * special handling are mapped by the MOD to the one tested + * just above. + * Use the source to distinguish between the three cases */ + + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + + /* uc() of this requires 2 characters, but they are + * ASCII. If not enough room, grow the string */ + if (SvLEN(dest) < ++min) { + const UV o = d - (U8*)SvPVX_const(dest); + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + *d++ = 'S'; *d = 'S'; /* upper case is 'SS' */ + continue; /* Back to the tight loop; still in ASCII */ + } + + /* The other two special handling characters have their + * upper cases outside the latin1 range, hence need to be + * in UTF-8, so the whole result needs to be in UTF-8. So, + * here we are somewhere in the middle of processing a + * non-UTF-8 string, and realize that we will have to convert + * the whole thing to UTF-8. What to do? There are + * several possibilities. The simplest to code is to + * convert what we have so far, set a flag, and continue on + * in the loop. The flag would be tested each time through + * the loop, and if set, the next character would be + * converted to UTF-8 and stored. But, I (khw) didn't want + * to slow down the mainstream case at all for this fairly + * rare case, so I didn't want to add a test that didn't + * absolutely have to be there in the loop, besides the + * possibility that it would get too complicated for + * optimizers to deal with. Another possibility is to just + * give up, convert the source to UTF-8, and restart the + * function that way. Another possibility is to convert + * both what has already been processed and what is yet to + * come separately to UTF-8, then jump into the loop that + * handles UTF-8. But the most efficient time-wise of the + * ones I could think of is what follows, and turned out to + * not require much extra code. */ + + /* Convert what we have so far into UTF-8, telling the + * function that we know it should be converted, and to + * allow extra space for what we haven't processed yet. + * Assume the worst case space requirements for converting + * what we haven't processed so far: that it will require + * two bytes for each remaining source character, plus the + * NUL at the end. This may cause the string pointer to + * move, so re-find it. */ + + len = d - (U8*)SvPVX_const(dest); + SvCUR_set(dest, len); + len = sv_utf8_upgrade_flags_grow(dest, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + (send -s) * 2 + 1); + d = (U8*)SvPVX(dest) + len; + + /* And append the current character's upper case in UTF-8 */ + CAT_NON_LATIN1_UC(d, *s); + + /* Now process the remainder of the source, converting to + * upper and UTF-8. If a resulting byte is invariant in + * UTF-8, output it as-is, otherwise convert to UTF-8 and + * append it to the output. */ + + s++; + for (; s < send; s++) { + U8 upper = toUPPER_LATIN1_MOD(*s); + if UTF8_IS_INVARIANT(upper) { + *d++ = upper; + } + else { + CAT_TWO_BYTE_UNI_UPPER_MOD(d, *s, upper); + } + } + + /* Here have processed the whole source; no need to continue + * with the outer loop. Each character has been converted + * to upper case and converted to UTF-8 */ + + break; + } /* End of processing all latin1-style chars */ + } /* End of processing all chars */ + } /* End of source is not empty */ + if (source != dest) { - *d = '\0'; + *d = '\0'; /* Here d points to 1 after last char, add NUL */ SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); } - } + } /* End of isn't utf8 */ SvSETMAGIC(dest); RETURN; } @@ -3754,8 +4189,9 @@ PP(pp_lc) if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) && SvTEMP(source) && !DO_UTF8(source)) { - /* We can convert in place. */ + /* We can convert in place, as lowercasing anything in the latin1 range + * (or else DO_UTF8 would have been on) doesn't lengthen it */ dest = source; s = d = (U8*)SvPV_force_nomg(source, len); min = len + 1; @@ -3796,56 +4232,148 @@ PP(pp_lc) U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; while (s < send) { - const STRLEN u = UTF8SKIP(s); - STRLEN ulen; - const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); +/* See comments at the first instance in this file of this ifdef */ +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + if (UTF8_IS_INVARIANT(*s)) { -#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */ - if (uv == GREEK_CAPITAL_LETTER_SIGMA) { - NOOP; - /* - * Now if the sigma is NOT followed by - * /$ignorable_sequence$cased_letter/; - * and it IS preceded by /$cased_letter$ignorable_sequence/; - * where $ignorable_sequence is [\x{2010}\x{AD}\p{Mn}]* - * and $cased_letter is [\p{Ll}\p{Lo}\p{Lt}] - * then it should be mapped to 0x03C2, - * (GREEK SMALL LETTER FINAL SIGMA), - * instead of staying 0x03A3. - * "should be": in other words, this is not implemented yet. - * See lib/unicore/SpecialCasing.txt. + /* Invariant characters use the standard mappings compiled in. */ + *d++ = toLOWER(*s); + s++; } - if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { - /* If the eventually required minimum size outgrows - * the available space, we need to grow. */ - const UV o = d - (U8*)SvPVX_const(dest); - - /* If someone lowercases one million U+0130s we SvGROW() one - * million times. Or we could try guessing how much to - allocate without allocating too much. Such is life. */ - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + + /* As do the ones in the Latin1 range */ + U8 lower = toLOWER_LATIN1(UTF8_ACCUMULATE(*s, *(s+1))); + CAT_UNI_TO_UTF8_TWO_BYTE(d, lower); + s += 2; } - Copy(tmpbuf, d, ulen, U8); - d += ulen; - s += u; - } + else { +#endif + /* Here, is utf8 not in Latin-1 range, have to go out and get + * the mappings from the tables. */ + + const STRLEN u = UTF8SKIP(s); + STRLEN ulen; + +/* See comments at the first instance in this file of this ifdef */ +#ifndef CONTEXT_DEPENDENT_CASING + toLOWER_utf8(s, tmpbuf, &ulen); +#else + /* Here is context dependent casing, not compiled in currently; + * needs more thought and work */ + + const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); + + /* If the lower case is a small sigma, it may be that we need + * to change it to a final sigma. This happens at the end of + * a word that contains more than just this character, and only + * when we started with a capital sigma. */ + if (uv == UNICODE_GREEK_SMALL_LETTER_SIGMA && + s > send - len && /* Makes sure not the first letter */ + utf8_to_uvchr(s, 0) == UNICODE_GREEK_CAPITAL_LETTER_SIGMA + ) { + + /* We use the algorithm in: + * http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf (C + * is a CAPITAL SIGMA): If C is preceded by a sequence + * consisting of a cased letter and a case-ignorable + * sequence, and C is not followed by a sequence consisting + * of a case ignorable sequence and then a cased letter, + * then when lowercasing C, C becomes a final sigma */ + + /* To determine if this is the end of a word, need to peek + * ahead. Look at the next character */ + const U8 *peek = s + u; + + /* Skip any case ignorable characters */ + while (peek < send && is_utf8_case_ignorable(peek)) { + peek += UTF8SKIP(peek); + } + + /* If we reached the end of the string without finding any + * non-case ignorable characters, or if the next such one + * is not-cased, then we have met the conditions for it + * being a final sigma with regards to peek ahead, and so + * must do peek behind for the remaining conditions. (We + * know there is stuff behind to look at since we tested + * above that this isn't the first letter) */ + if (peek >= send || ! is_utf8_cased(peek)) { + peek = utf8_hop(s, -1); + + /* Here are at the beginning of the first character + * before the original upper case sigma. Keep backing + * up, skipping any case ignorable characters */ + while (is_utf8_case_ignorable(peek)) { + peek = utf8_hop(peek, -1); + } + + /* Here peek points to the first byte of the closest + * non-case-ignorable character before the capital + * sigma. If it is cased, then by the Unicode + * algorithm, we should use a small final sigma instead + * of what we have */ + if (is_utf8_cased(peek)) { + STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, + UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA); + } + } + } + else { /* Not a context sensitive mapping */ +#endif /* End of commented out context sensitive */ + if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { + + /* If the eventually required minimum size outgrows + * the available space, we need to grow. */ + const UV o = d - (U8*)SvPVX_const(dest); + + /* If someone lowercases one million U+0130s we + * SvGROW() one million times. Or we could try + * guessing how much to allocate without allocating too + * much. Such is life. Another option would be to + * grow an extra byte or two more each time we need to + * grow, which would cut down the million to 500K, with + * little waste */ + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } +#ifdef CONTEXT_DEPENDENT_CASING + } +#endif + /* Copy the newly lowercased letter to the output buffer we're + * building */ + Copy(tmpbuf, d, ulen, U8); + d += ulen; + s += u; +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + } +#endif + } /* End of looping through the source string */ SvUTF8_on(dest); *d = '\0'; SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); - } else { + } else { /* Not utf8 */ if (len) { const U8 *const send = s + len; + + /* Use locale casing if in locale; regular style if not treating + * latin1 as having case; otherwise the latin1 casing. Do the + * whole thing in a tight loop, for speed, */ if (IN_LOCALE_RUNTIME) { TAINT; SvTAINTED_on(dest); for (; s < send; d++, s++) *d = toLOWER_LC(*s); } - else { - for (; s < send; d++, s++) + else if (! IN_UNI_8_BIT) { + for (; s < send; d++, s++) { *d = toLOWER(*s); + } + } + else { + for (; s < send; d++, s++) { + *d = toLOWER_LATIN1(*s); + } } } if (source != dest) { diff --git a/utf8.h b/utf8.h index 1c8e06b..7c205d1 100644 --- a/utf8.h +++ b/utf8.h @@ -207,6 +207,7 @@ encoded character. #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES) +#define IN_UNI_8_BIT (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT && ! IN_LOCALE_RUNTIME && ! IN_BYTES) #define UTF8_ALLOW_EMPTY 0x0001 #define UTF8_ALLOW_CONTINUATION 0x0002