From: Jarkko Hietaniemi Date: Sun, 29 Apr 2001 02:04:46 +0000 (+0000) Subject: In character classes one couldn't have 0x80..0xff characters X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3568d8383f3d0b22eb07927391114af2a91b06ed;p=p5sagit%2Fp5-mst-13.2.git In character classes one couldn't have 0x80..0xff characters at the left hand side if there were 0x100.. characters in the character class. p4raw-id: //depot/perl@9901 --- diff --git a/doop.c b/doop.c index d7baecc..755cbfd 100644 --- a/doop.c +++ b/doop.c @@ -344,7 +344,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ } while (s < send) { - if ((uv = swash_fetch(rv, s)) < none) { + if ((uv = swash_fetch(rv, s, TRUE)) < none) { s += UTF8SKIP(s); matches++; d = uvuni_to_utf8(d, uv); @@ -423,7 +423,7 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ send = s + len; while (s < send) { - if ((uv = swash_fetch(rv, s)) < none || uv == extra) + if ((uv = swash_fetch(rv, s, TRUE)) < none || uv == extra) matches++; s += UTF8SKIP(s); } @@ -491,7 +491,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ if (squash) { UV puv = 0xfeedface; while (s < send) { - uv = swash_fetch(rv, s); + uv = swash_fetch(rv, s, TRUE); if (d > dend) { STRLEN clen = d - dstart; @@ -546,7 +546,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else { while (s < send) { - uv = swash_fetch(rv, s); + uv = swash_fetch(rv, s, TRUE); if (d > dend) { STRLEN clen = d - dstart; STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; diff --git a/embed.h b/embed.h index 887e9eb..dd5d658 100644 --- a/embed.h +++ b/embed.h @@ -2216,7 +2216,7 @@ #define sv_vsetpvfn(a,b,c,d,e,f,g) Perl_sv_vsetpvfn(aTHX_ a,b,c,d,e,f,g) #define str_to_version(a) Perl_str_to_version(aTHX_ a) #define swash_init(a,b,c,d,e) Perl_swash_init(aTHX_ a,b,c,d,e) -#define swash_fetch(a,b) Perl_swash_fetch(aTHX_ a,b) +#define swash_fetch(a,b,c) Perl_swash_fetch(aTHX_ a,b,c) #define taint_env() Perl_taint_env(aTHX) #define taint_proper(a,b) Perl_taint_proper(aTHX_ a,b) #define to_utf8_lower(a) Perl_to_utf8_lower(aTHX_ a) diff --git a/embed.pl b/embed.pl index 9e272b8..fcaaaed 100755 --- a/embed.pl +++ b/embed.pl @@ -2089,7 +2089,7 @@ Apd |void |sv_vsetpvfn |SV* sv|const char* pat|STRLEN patlen \ Ap |NV |str_to_version |SV *sv Ap |SV* |swash_init |char* pkg|char* name|SV* listsv \ |I32 minbits|I32 none -Ap |UV |swash_fetch |SV *sv|U8 *ptr +Ap |UV |swash_fetch |SV *sv|U8 *ptr|bool do_utf8 Ap |void |taint_env Ap |void |taint_proper |const char* f|const char* s Ap |UV |to_utf8_lower |U8 *p diff --git a/objXSUB.h b/objXSUB.h index 99d9a3e..28bed78 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -579,6 +579,10 @@ #define Perl_init_stacks pPerl->Perl_init_stacks #undef init_stacks #define init_stacks Perl_init_stacks +#undef Perl_init_tm +#define Perl_init_tm pPerl->Perl_init_tm +#undef init_tm +#define init_tm Perl_init_tm #undef Perl_instr #define Perl_instr pPerl->Perl_instr #undef instr @@ -857,6 +861,10 @@ #define Perl_mg_size pPerl->Perl_mg_size #undef mg_size #define mg_size Perl_mg_size +#undef Perl_mini_mktime +#define Perl_mini_mktime pPerl->Perl_mini_mktime +#undef mini_mktime +#define mini_mktime Perl_mini_mktime #undef Perl_moreswitches #define Perl_moreswitches pPerl->Perl_moreswitches #undef moreswitches @@ -927,6 +935,10 @@ #define Perl_my_stat pPerl->Perl_my_stat #undef my_stat #define my_stat Perl_my_stat +#undef Perl_my_strftime +#define Perl_my_strftime pPerl->Perl_my_strftime +#undef my_strftime +#define my_strftime Perl_my_strftime #if defined(MYSWAP) #undef Perl_my_swap #define Perl_my_swap pPerl->Perl_my_swap diff --git a/proto.h b/proto.h index 63fc518..cc4050d 100644 --- a/proto.h +++ b/proto.h @@ -808,7 +808,7 @@ PERL_CALLCONV void Perl_sv_vcatpvfn(pTHX_ SV* sv, const char* pat, STRLEN patlen PERL_CALLCONV void Perl_sv_vsetpvfn(pTHX_ SV* sv, const char* pat, STRLEN patlen, va_list* args, SV** svargs, I32 svmax, bool *maybe_tainted); PERL_CALLCONV NV Perl_str_to_version(pTHX_ SV *sv); PERL_CALLCONV SV* Perl_swash_init(pTHX_ char* pkg, char* name, SV* listsv, I32 minbits, I32 none); -PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr); +PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8); PERL_CALLCONV void Perl_taint_env(pTHX); PERL_CALLCONV void Perl_taint_proper(pTHX_ const char* f, const char* s); PERL_CALLCONV UV Perl_to_utf8_lower(pTHX_ U8 *p); diff --git a/regcomp.c b/regcomp.c index 1cc3a98..20388f1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2799,11 +2799,12 @@ tryagain: break; case 'p': case 'P': - { /* a lovely hack--pretend we saw [\pX] instead */ + { char* oldregxend = RExC_end; char* parse_start = RExC_parse; if (RExC_parse[1] == '{') { + /* a lovely hack--pretend we saw [\pX] instead */ RExC_end = strchr(RExC_parse, '}'); if (!RExC_end) { RExC_parse += 2; @@ -3259,7 +3260,7 @@ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state) { register UV value; - register IV lastvalue = OOB_UNICODE; + register IV prevvalue = OOB_UNICODE; register IV range = 0; register regnode *ret; STRLEN numlen; @@ -3270,7 +3271,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) register char *e; char *parse_start = RExC_parse; /* MJD */ UV n; - bool dont_optimize_invert = FALSE; + bool optimize_invert = TRUE; ret = reganode(pRExC_state, ANYOF, 0); @@ -3312,8 +3313,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) rangebegin = RExC_parse; if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); + RExC_end - RExC_parse, + &numlen, 0); RExC_parse += numlen; } else @@ -3423,14 +3424,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); - if (lastvalue < 256) { - ANYOF_BITMAP_SET(ret, lastvalue); + if (prevvalue < 256) { + ANYOF_BITMAP_SET(ret, prevvalue); ANYOF_BITMAP_SET(ret, '-'); } else { ANYOF_FLAGS(ret) |= ANYOF_UNICODE; Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n%04"UVxf"\n", (UV)lastvalue, (UV) '-'); + "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); } } @@ -3438,6 +3439,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } if (!SIZE_ONLY) { + if (namedclass > OOB_NAMEDCLASS) + optimize_invert = FALSE; /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. @@ -3451,7 +3454,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: @@ -3462,7 +3464,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; case ANYOF_ALNUMC: @@ -3473,7 +3474,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; case ANYOF_NALNUMC: @@ -3484,7 +3484,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; case ANYOF_ALPHA: @@ -3495,7 +3494,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; case ANYOF_NALPHA: @@ -3506,7 +3504,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; case ANYOF_ASCII: @@ -3529,7 +3526,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } #endif /* EBCDIC */ } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; case ANYOF_NASCII: @@ -3552,7 +3548,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } #endif /* EBCDIC */ } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; case ANYOF_BLANK: @@ -3563,7 +3558,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; case ANYOF_NBLANK: @@ -3574,7 +3568,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; case ANYOF_CNTRL: @@ -3585,7 +3578,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; case ANYOF_NCNTRL: @@ -3596,7 +3588,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; case ANYOF_DIGIT: @@ -3607,7 +3598,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = '0'; value <= '9'; value++) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; case ANYOF_NDIGIT: @@ -3620,7 +3610,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = '9' + 1; value < 256; value++) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; case ANYOF_GRAPH: @@ -3631,7 +3620,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; case ANYOF_NGRAPH: @@ -3642,7 +3630,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; case ANYOF_LOWER: @@ -3653,7 +3640,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; case ANYOF_NLOWER: @@ -3664,7 +3650,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; case ANYOF_PRINT: @@ -3675,7 +3660,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; case ANYOF_NPRINT: @@ -3686,7 +3670,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; case ANYOF_PSXSPC: @@ -3697,7 +3680,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NPSXSPC: @@ -3708,7 +3690,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_PUNCT: @@ -3719,7 +3700,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; case ANYOF_NPUNCT: @@ -3730,7 +3710,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; case ANYOF_SPACE: @@ -3741,7 +3720,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isSPACE(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); break; case ANYOF_NSPACE: @@ -3752,7 +3730,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isSPACE(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); break; case ANYOF_UPPER: @@ -3763,7 +3740,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; case ANYOF_NUPPER: @@ -3774,7 +3750,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; case ANYOF_XDIGIT: @@ -3785,7 +3760,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; case ANYOF_NXDIGIT: @@ -3796,7 +3770,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; default: @@ -3810,17 +3783,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* end of namedclass \blah */ if (range) { - if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) || - ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ { + if (((prevvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) || + ((NATIVE_TO_UNI(prevvalue) > NATIVE_TO_UNI(value)) && + (PL_hints & HINT_RE_ASCIIR))) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); + range = 0; /* not a valid range */ } - range = 0; /* not a true range */ } else { - lastvalue = value; /* save the beginning of the range */ + prevvalue = value; /* save the beginning of the range */ if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && RExC_parse[1] != ']') { RExC_parse++; @@ -3843,42 +3817,45 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) /* now is the next time */ if (!SIZE_ONLY) { - if (lastvalue < 256 && value < 256) { -#ifdef EBCDIC /* EBCDIC, for example. */ - if (PL_hints & HINT_RE_ASCIIR) { - IV i; + IV i; + + if (prevvalue < 256) { + IV ceilvalue = value < 256 ? value : 255; + +#ifdef EBCDIC /* New style scheme for ranges: - * after : * use re 'asciir'; * do ranges in ASCII/Unicode space */ - for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++) - ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i)); + for (i = NATIVE_TO_ASCII(prevvalue); + i <= NATIVE_TO_ASCII(ceilvalue); + i++) + ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i)); } - else if ((isLOWER(lastvalue) && isLOWER(value)) || - (isUPPER(lastvalue) && isUPPER(value))) + else if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || + (isUPPER(prevvalue) && isUPPER(ceilvalue))) { - IV i; - if (isLOWER(lastvalue)) { - for (i = lastvalue; i <= value; i++) + if (isLOWER(prevvalue)) { + for (i = prevvalue; i <= ceilvalue; i++) if (isLOWER(i)) ANYOF_BITMAP_SET(ret, i); } else { - for (i = lastvalue; i <= value; i++) + for (i = prevvalue; i <= ceilvalue; i++) if (isUPPER(i)) ANYOF_BITMAP_SET(ret, i); } } else #endif - for ( ; lastvalue <= value; lastvalue++) - ANYOF_BITMAP_SET(ret, lastvalue); - } else { + for (i = prevvalue; i <= ceilvalue; i++) + ANYOF_BITMAP_SET(ret, i); + } + if (value > 255) { ANYOF_FLAGS(ret) |= ANYOF_UNICODE; - if (lastvalue < value) + if (prevvalue < value) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)lastvalue, (UV)value); - else + (UV)prevvalue, (UV)value); + else if (prevvalue == value) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", (UV)value); } @@ -3912,7 +3889,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (!SIZE_ONLY && !dont_optimize_invert && + if (!SIZE_ONLY && optimize_invert && /* If the only flag is inversion. */ (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) @@ -4448,7 +4425,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) for (i = 0; i <= 256; i++) { /* just the first 256 */ U8 *e = uvchr_to_utf8(s, i); - if (i < 256 && swash_fetch(sw, s)) { + if (i < 256 && swash_fetch(sw, s, TRUE)) { if (rangestart == -1) rangestart = i; } else if (rangestart != -1) { diff --git a/regexec.c b/regexec.c index e358d63..c9096f0 100644 --- a/regexec.c +++ b/regexec.c @@ -958,7 +958,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { if (tmp == !(OP(c) == BOUND ? - swash_fetch(PL_utf8_alnum, (U8*)s) : + swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) { tmp = !tmp; @@ -1001,7 +1001,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { if (tmp == !(OP(c) == NBOUND ? - swash_fetch(PL_utf8_alnum, (U8*)s) : + swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) tmp = !tmp; else if ((norun || regtry(prog, s))) @@ -1029,7 +1029,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { - if (swash_fetch(PL_utf8_alnum, (U8*)s)) { + if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -1087,7 +1087,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { - if (!swash_fetch(PL_utf8_alnum, (U8*)s)) { + if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -1145,7 +1145,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); while (s < strend) { - if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) { + if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -1203,7 +1203,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); while (s < strend) { - if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) { + if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -1261,7 +1261,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); while (s < strend) { - if (swash_fetch(PL_utf8_digit,(U8*)s)) { + if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -1319,7 +1319,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); while (s < strend) { - if (!swash_fetch(PL_utf8_digit,(U8*)s)) { + if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -2214,7 +2214,7 @@ S_regmatch(pTHX_ regnode *prog) sayNO; if (do_utf8) { if (!(OP(scan) == ALNUM - ? swash_fetch(PL_utf8_alnum, (U8*)locinput) + ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) : isALNUM_LC_utf8((U8*)locinput))) { sayNO; @@ -2237,7 +2237,7 @@ S_regmatch(pTHX_ regnode *prog) if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); if (OP(scan) == NALNUM - ? swash_fetch(PL_utf8_alnum, (U8*)locinput) + ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) : isALNUM_LC_utf8((U8*)locinput)) { sayNO; @@ -2269,7 +2269,7 @@ S_regmatch(pTHX_ regnode *prog) if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM_uni(ln); LOAD_UTF8_CHARCLASS(alnum,"a"); - n = swash_fetch(PL_utf8_alnum, (U8*)locinput); + n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8); } else { ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); @@ -2302,7 +2302,7 @@ S_regmatch(pTHX_ regnode *prog) if (UTF8_IS_CONTINUED(nextchr)) { LOAD_UTF8_CHARCLASS(space," "); if (!(OP(scan) == SPACE - ? swash_fetch(PL_utf8_space, (U8*)locinput) + ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) : isSPACE_LC_utf8((U8*)locinput))) { sayNO; @@ -2332,7 +2332,7 @@ S_regmatch(pTHX_ regnode *prog) if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); if (OP(scan) == NSPACE - ? swash_fetch(PL_utf8_space, (U8*)locinput) + ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) : isSPACE_LC_utf8((U8*)locinput)) { sayNO; @@ -2355,7 +2355,7 @@ S_regmatch(pTHX_ regnode *prog) if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); if (!(OP(scan) == DIGIT - ? swash_fetch(PL_utf8_digit, (U8*)locinput) + ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) : isDIGIT_LC_utf8((U8*)locinput))) { sayNO; @@ -2378,7 +2378,7 @@ S_regmatch(pTHX_ regnode *prog) if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); if (OP(scan) == NDIGIT - ? swash_fetch(PL_utf8_digit, (U8*)locinput) + ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) : isDIGIT_LC_utf8((U8*)locinput)) { sayNO; @@ -2394,10 +2394,12 @@ S_regmatch(pTHX_ regnode *prog) break; case CLUMP: LOAD_UTF8_CHARCLASS(mark,"~"); - if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput)) + if (locinput >= PL_regeol || + swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) sayNO; locinput += PL_utf8skip[nextchr]; - while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput)) + while (locinput < PL_regeol && + swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) locinput += UTF8SKIP(locinput); if (locinput > PL_regeol) sayNO; @@ -3623,7 +3625,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) loceol = PL_regeol; LOAD_UTF8_CHARCLASS(alnum,"a"); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_alnum, (U8*)scan)) { + swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3651,7 +3653,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) loceol = PL_regeol; LOAD_UTF8_CHARCLASS(alnum,"a"); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_alnum, (U8*)scan)) { + !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3679,7 +3681,8 @@ S_regrepeat(pTHX_ regnode *p, I32 max) loceol = PL_regeol; LOAD_UTF8_CHARCLASS(space," "); while (hardcount < max && scan < loceol && - (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { + (*scan == ' ' || + swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { scan += UTF8SKIP(scan); hardcount++; } @@ -3707,7 +3710,8 @@ S_regrepeat(pTHX_ regnode *p, I32 max) loceol = PL_regeol; LOAD_UTF8_CHARCLASS(space," "); while (hardcount < max && scan < loceol && - !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { + !(*scan == ' ' || + swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { scan += UTF8SKIP(scan); hardcount++; } @@ -3735,7 +3739,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) loceol = PL_regeol; LOAD_UTF8_CHARCLASS(digit,"0"); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_digit,(U8*)scan)) { + swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3749,7 +3753,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) loceol = PL_regeol; LOAD_UTF8_CHARCLASS(digit,"0"); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_digit,(U8*)scan)) { + !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3879,25 +3883,22 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) char flags = ANYOF_FLAGS(n); bool match = FALSE; UV c; - STRLEN len; + STRLEN len = 0; - if (do_utf8) - c = utf8_to_uvchr(p, &len); - else - c = *p; + c = do_utf8 ? utf8_to_uvchr(p, &len) : *p; if (do_utf8 || (flags & ANYOF_UNICODE)) { if (do_utf8 && !ANYOF_RUNTIME(n)) { if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) match = TRUE; } - if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256) + if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) match = TRUE; if (!match) { SV *sw = regclass_swash(n, TRUE, 0); if (sw) { - if (swash_fetch(sw, p)) + if (swash_fetch(sw, p, do_utf8)) match = TRUE; else if (flags & ANYOF_FOLD) { U8 tmpbuf[UTF8_MAXLEN+1]; @@ -3908,7 +3909,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) } else uvchr_to_utf8(tmpbuf, toLOWER_utf8(p)); - if (swash_fetch(sw, tmpbuf)) + if (swash_fetch(sw, tmpbuf, do_utf8)) match = TRUE; } } @@ -3918,7 +3919,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if (flags & ANYOF_FOLD) { - I32 f; + I32 f; if (flags & ANYOF_LOCALE) { PL_reg_flags |= RF_tainted; diff --git a/t/op/pat.t b/t/op/pat.t index 9130454..1be7234 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -5,7 +5,8 @@ # that does fit that format, add it to op/re_tests, not here. $| = 1; -print "1..587\n"; + +print "1..615\n"; BEGIN { chdir 't' if -d 't'; @@ -1302,6 +1303,7 @@ print "ok 247\n"; { # the second half of 20001028.003 + my $X = ''; $X =~ s/^/chr(1488)/e; print "not " unless length $X == 1 && ord($X) == 1488; print "ok 260\n"; @@ -1353,10 +1355,11 @@ print "ok 247\n"; "\0" => 'Cc', ); - for my $char (keys %s) { + for my $char (map { s/^\S+ //; $_ } + sort map { sprintf("%06x", ord($_))." $_" } keys %s) { my $class = $s{$char}; - my $code = sprintf("%04x", ord($char)); - printf "# 0x$code\n"; + my $code = sprintf("%06x", ord($char)); + printf "#\n# 0x$code\n#\n"; print "# IsAlpha\n"; if ($class =~ /^[LM]/) { print "not " unless $char =~ /\p{IsAlpha}/; @@ -1382,7 +1385,7 @@ print "ok 247\n"; print "ok $test\n"; $test++; } print "# IsASCII\n"; - if ($code <= 127) { + if ($code le '00007f') { print "not " unless $char =~ /\p{IsASCII}/; print "ok $test\n"; $test++; print "not " if $char =~ /\P{IsASCII}/; @@ -1583,3 +1586,104 @@ EOT print "not " unless ord($x) == 0x12345678 && length($x) == 1; print "ok 587\n"; } + +{ + my $x = "\x7f"; + + print "not " if $x =~ /[\x80-\xff]/; + print "ok 588\n"; + + print "not " if $x =~ /[\x80-\x{100}]/; + print "ok 589\n"; + + print "not " if $x =~ /[\x{100}]/; + print "ok 590\n"; + + print "not " if $x =~ /\p{InLatin1Supplement}/; + print "ok 591\n"; + + print "not " unless $x =~ /\P{InLatin1Supplement}/; + print "ok 592\n"; + + print "not " if $x =~ /\p{InLatinExtendedA}/; + print "ok 593\n"; + + print "not " unless $x =~ /\P{InLatinExtendedA}/; + print "ok 594\n"; +} + +{ + my $x = "\x80"; + + print "not " unless $x =~ /[\x80-\xff]/; + print "ok 595\n"; + + print "not " unless $x =~ /[\x80-\x{100}]/; + print "ok 596\n"; + + print "not " if $x =~ /[\x{100}]/; + print "ok 597\n"; + + print "not " unless $x =~ /\p{InLatin1Supplement}/; + print "ok 598\n"; + + print "not " if $x =~ /\P{InLatin1Supplement}/; + print "ok 599\n"; + + print "not " if $x =~ /\p{InLatinExtendedA}/; + print "ok 600\n"; + + print "not " unless $x =~ /\P{InLatinExtendedA}/; + print "ok 601\n"; +} + +{ + my $x = "\xff"; + + print "not " unless $x =~ /[\x80-\xff]/; + print "ok 602\n"; + + print "not " unless $x =~ /[\x80-\x{100}]/; + print "ok 603\n"; + + print "not " if $x =~ /[\x{100}]/; + print "ok 604\n"; + + print "not " unless $x =~ /\p{InLatin1Supplement}/; + print "ok 605\n"; + + print "not " if $x =~ /\P{InLatin1Supplement}/; + print "ok 606\n"; + + print "not " if $x =~ /\p{InLatinExtendedA}/; + print "ok 607\n"; + + print "not " unless $x =~ /\P{InLatinExtendedA}/; + print "ok 608\n"; +} + +{ + my $x = "\x{100}"; + + print "not " if $x =~ /[\x80-\xff]/; + print "ok 609\n"; + + print "not " unless $x =~ /[\x80-\x{100}]/; + print "ok 610\n"; + + print "not " unless $x =~ /[\x{100}]/; + print "ok 611\n"; + + print "not " if $x =~ /\p{InLatin1Supplement}/; + print "ok 612\n"; + + print "not " unless $x =~ /\P{InLatin1Supplement}/; + print "ok 613\n"; + + print "not " unless $x =~ /\p{InLatinExtendedA}/; + print "ok 614\n"; + + print "not " if $x =~ /\P{InLatinExtendedA}/; + print "ok 615\n"; +} + diff --git a/utf8.c b/utf8.c index fda9920..b682cf6 100644 --- a/utf8.c +++ b/utf8.c @@ -1045,13 +1045,13 @@ Perl_is_utf8_alnum(pTHX_ U8 *p) * descendant of isalnum(3), in other words, it doesn't * contain the '_'. --jhi */ PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_alnum, p); + return swash_fetch(PL_utf8_alnum, p, TRUE); /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */ #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */ if (!PL_utf8_alnum) PL_utf8_alnum = swash_init("utf8", "", sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0); - return swash_fetch(PL_utf8_alnum, p); + return swash_fetch(PL_utf8_alnum, p, TRUE); #endif } @@ -1062,13 +1062,13 @@ Perl_is_utf8_alnumc(pTHX_ U8 *p) return FALSE; if (!PL_utf8_alnum) PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_alnum, p); + return swash_fetch(PL_utf8_alnum, p, TRUE); /* return is_utf8_alpha(p) || is_utf8_digit(p); */ #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */ if (!PL_utf8_alnum) PL_utf8_alnum = swash_init("utf8", "", sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0); - return swash_fetch(PL_utf8_alnum, p); + return swash_fetch(PL_utf8_alnum, p, TRUE); #endif } @@ -1085,7 +1085,7 @@ Perl_is_utf8_alpha(pTHX_ U8 *p) return FALSE; if (!PL_utf8_alpha) PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_alpha, p); + return swash_fetch(PL_utf8_alpha, p, TRUE); } bool @@ -1095,7 +1095,7 @@ Perl_is_utf8_ascii(pTHX_ U8 *p) return FALSE; if (!PL_utf8_ascii) PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_ascii, p); + return swash_fetch(PL_utf8_ascii, p, TRUE); } bool @@ -1105,7 +1105,7 @@ Perl_is_utf8_space(pTHX_ U8 *p) return FALSE; if (!PL_utf8_space) PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_space, p); + return swash_fetch(PL_utf8_space, p, TRUE); } bool @@ -1115,7 +1115,7 @@ Perl_is_utf8_digit(pTHX_ U8 *p) return FALSE; if (!PL_utf8_digit) PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_digit, p); + return swash_fetch(PL_utf8_digit, p, TRUE); } bool @@ -1125,7 +1125,7 @@ Perl_is_utf8_upper(pTHX_ U8 *p) return FALSE; if (!PL_utf8_upper) PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_upper, p); + return swash_fetch(PL_utf8_upper, p, TRUE); } bool @@ -1135,7 +1135,7 @@ Perl_is_utf8_lower(pTHX_ U8 *p) return FALSE; if (!PL_utf8_lower) PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_lower, p); + return swash_fetch(PL_utf8_lower, p, TRUE); } bool @@ -1145,7 +1145,7 @@ Perl_is_utf8_cntrl(pTHX_ U8 *p) return FALSE; if (!PL_utf8_cntrl) PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_cntrl, p); + return swash_fetch(PL_utf8_cntrl, p, TRUE); } bool @@ -1155,7 +1155,7 @@ Perl_is_utf8_graph(pTHX_ U8 *p) return FALSE; if (!PL_utf8_graph) PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_graph, p); + return swash_fetch(PL_utf8_graph, p, TRUE); } bool @@ -1165,7 +1165,7 @@ Perl_is_utf8_print(pTHX_ U8 *p) return FALSE; if (!PL_utf8_print) PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_print, p); + return swash_fetch(PL_utf8_print, p, TRUE); } bool @@ -1175,7 +1175,7 @@ Perl_is_utf8_punct(pTHX_ U8 *p) return FALSE; if (!PL_utf8_punct) PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_punct, p); + return swash_fetch(PL_utf8_punct, p, TRUE); } bool @@ -1185,7 +1185,7 @@ Perl_is_utf8_xdigit(pTHX_ U8 *p) return FALSE; if (!PL_utf8_xdigit) PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_xdigit, p); + return swash_fetch(PL_utf8_xdigit, p, TRUE); } bool @@ -1195,7 +1195,7 @@ Perl_is_utf8_mark(pTHX_ U8 *p) return FALSE; if (!PL_utf8_mark) PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0); - return swash_fetch(PL_utf8_mark, p); + return swash_fetch(PL_utf8_mark, p, TRUE); } UV @@ -1205,7 +1205,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p) if (!PL_utf8_toupper) PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_toupper, p); + uv = swash_fetch(PL_utf8_toupper, p, TRUE); return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); } @@ -1216,7 +1216,7 @@ Perl_to_utf8_title(pTHX_ U8 *p) if (!PL_utf8_totitle) PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_totitle, p); + uv = swash_fetch(PL_utf8_totitle, p, TRUE); return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); } @@ -1227,7 +1227,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p) if (!PL_utf8_tolower) PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_tolower, p); + uv = swash_fetch(PL_utf8_tolower, p, TRUE); return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); } @@ -1282,21 +1282,31 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) } UV -Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) +Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) { HV* hv = (HV*)SvRV(sv); - /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ - then the "swatch" is a vec() for al the chars which start - with 0xAA..0xYY - So the key in the hash is length of encoded char -1 - */ - U32 klen = UTF8SKIP(ptr) - 1; - U32 off = ptr[klen]; + U32 klen; + U32 off; STRLEN slen; STRLEN needents; U8 *tmps; U32 bit; SV *retval; + U8 tmputf8[2]; + UV c = NATIVE_TO_ASCII(*ptr); + + if (!do_utf8 && !UNI_IS_INVARIANT(c)) { + tmputf8[0] = UTF8_EIGHT_BIT_HI(c); + tmputf8[1] = UTF8_EIGHT_BIT_LO(c); + ptr = tmputf8; + } + /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ + * then the "swatch" is a vec() for al the chars which start + * with 0xAA..0xYY + * So the key in the hash (klen) is length of encoded char -1 + */ + klen = UTF8SKIP(ptr) - 1; + off = ptr[klen]; if (klen == 0) { @@ -1322,9 +1332,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) * NB: this code assumes that swatches are never modified, once generated! */ - if (hv == PL_last_swash_hv && + if (hv == PL_last_swash_hv && klen == PL_last_swash_klen && - (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) ) + (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) ) { tmps = PL_last_swash_tmps; slen = PL_last_swash_slen; @@ -1348,7 +1358,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) EXTEND(SP,3); PUSHs((SV*)sv); /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ - PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0))); + PUSHs(sv_2mortal(newSViv((klen) ? + (code_point & ~(needents - 1)) : 0))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR))