From: SADAHIRO Tomoyuki Date: Sat, 4 Nov 2006 21:53:50 +0000 (+0900) Subject: Re: [perl #40641] crash with unicode characters in regex comment X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9e08bc66da56140ed8efaea283d1b4b6053eef0b;p=p5sagit%2Fp5-mst-13.2.git Re: [perl #40641] crash with unicode characters in regex comment Message-Id: <20061104215302.3325.BQW10602@nifty.com> p4raw-id: //depot/perl@29204 --- diff --git a/MANIFEST b/MANIFEST index d79308d..b5a4017 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3587,14 +3587,16 @@ t/uni/chomp.t See if Unicode chomp works t/uni/chr.t See if Unicode chr works t/uni/class.t See if Unicode classes work (\p) t/uni/fold.t See if Unicode folding works +t/uni/greek.t See if Unicode in greek works +t/uni/latin2.t See if Unicode in latin2 works t/uni/lower.t See if Unicode casing works t/uni/overload.t See if Unicode overloading works t/uni/sprintf.t See if Unicode sprintf works t/uni/title.t See if Unicode casing works -t/uni/tr_7jis.t See if Unicode tr/// works -t/uni/tr_eucjp.t See if Unicode tr/// works -t/uni/tr_sjis.t See if Unicode tr/// works -t/uni/tr_utf8.t See if Unicode tr/// works +t/uni/tr_7jis.t See if Unicode tr/// in 7jis works +t/uni/tr_eucjp.t See if Unicode tr/// in eucjp works +t/uni/tr_sjis.t See if Unicode tr/// in sjis works +t/uni/tr_utf8.t See if Unicode tr/// in utf8 works t/uni/upper.t See if Unicode casing works t/uni/write.t See if Unicode formats work t/win32/getosversion.t Test if Win32::GetOSVersion() works diff --git a/embed.fnc b/embed.fnc index d7b3592..350b433 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1310,6 +1310,7 @@ Es |STRLEN |reguni |NN const struct RExC_state_t *state|UV uv|NN char *s Es |regnode*|regclass |NN struct RExC_state_t *state|U32 depth ERsn |I32 |regcurly |NN const char * Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op +Es |UV |reg_recode |const char value|NULLOK SV **encp Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth Es |regnode*|reg_namedseq |NN struct RExC_state_t *state|NULLOK UV *valuep Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd|U32 depth diff --git a/embed.h b/embed.h index 74adbd2..22595d5 100644 --- a/embed.h +++ b/embed.h @@ -1314,6 +1314,7 @@ #define regclass S_regclass #define regcurly S_regcurly #define reg_node S_reg_node +#define reg_recode S_reg_recode #define regpiece S_regpiece #define reg_namedseq S_reg_namedseq #define reginsert S_reginsert @@ -3513,6 +3514,7 @@ #define regclass(a,b) S_regclass(aTHX_ a,b) #define regcurly S_regcurly #define reg_node(a,b) S_reg_node(aTHX_ a,b) +#define reg_recode(a,b) S_reg_recode(aTHX_ a,b) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) #define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b) #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index f785603..c20b060 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2071,6 +2071,15 @@ recognized by Perl or by a user-supplied handler. See L. (W printf) Perl does not understand the given format conversion. See L. +=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/ + +(W regexp) The numeric escape (for example C<\xHH>) of value < 256 +didn't correspond to a single character through the conversion +from the encoding specified by the encoding pragma. +The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead. +The <-- HERE shows in the regular expression about where the +escape was discovered. + =item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/ (F) The range specified in a character class had a minimum character diff --git a/proto.h b/proto.h index b751dba..b141466 100644 --- a/proto.h +++ b/proto.h @@ -3566,6 +3566,7 @@ STATIC I32 S_regcurly(const char *) STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t *state, U8 op) __attribute__nonnull__(pTHX_1); +STATIC UV S_reg_recode(pTHX_ const char value, SV **encp); STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); diff --git a/regcomp.c b/regcomp.c index 1523fc1..00c4838 100644 --- a/regcomp.c +++ b/regcomp.c @@ -5782,6 +5782,39 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) } +/* + * reg_recode + * + * It returns the code point in utf8 for the value in *encp. + * value: a code value in the source encoding + * encp: a pointer to an Encode object + * + * If the result from Encode is not a single character, + * it returns U+FFFD (Replacement character) and sets *encp to NULL. + */ +STATIC UV +S_reg_recode(pTHX_ const char value, SV **encp) +{ + STRLEN numlen = 1; + SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); + const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp) + : SvPVX(sv); + const STRLEN newlen = SvCUR(sv); + UV uv = UNICODE_REPLACEMENT; + + if (newlen) + uv = SvUTF8(sv) + ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) + : *(U8*)s; + + if (!newlen || numlen != newlen) { + uv = UNICODE_REPLACEMENT; + if (encp) + *encp = NULL; + } + return uv; +} + /* - regatom - the lowest level @@ -6230,6 +6263,8 @@ tryagain: ender = grok_hex(p, &numlen, &flags, NULL); p += numlen; } + if (PL_encoding && ender < 0x100) + goto recode_encoding; break; case 'c': p++; @@ -6249,6 +6284,17 @@ tryagain: --p; goto loopdone; } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + recode_encoding: + { + SV* enc = PL_encoding; + ender = reg_recode((const char)(U8)ender, &enc); + if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) + vWARN(p, "Invalid escape in the specified encoding"); + RExC_utf8 = 1; + } break; case '\0': if (p >= RExC_end) @@ -6376,33 +6422,6 @@ tryagain: break; } - /* If the encoding pragma is in effect recode the text of - * any EXACT-kind nodes. */ - if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) { - const STRLEN oldlen = STR_LEN(ret); - SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); - - if (RExC_utf8) - SvUTF8_on(sv); - if (sv_utf8_downgrade(sv, TRUE)) { - const char * const s = sv_recode_to_utf8(sv, PL_encoding); - const STRLEN newlen = SvCUR(sv); - - if (SvUTF8(sv)) - RExC_utf8 = 1; - if (!SIZE_ONLY) { - GET_RE_DEBUG_FLAGS_DECL; - DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", - (int)oldlen, STRING(ret), - (int)newlen, s)); - Copy(s, STRING(ret), newlen, char); - STR_LEN(ret) += newlen - oldlen; - RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); - } else - RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); - } - } - return(ret); } @@ -6773,6 +6792,8 @@ parseit: value = grok_hex(RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; } + if (PL_encoding && value < 0x100) + goto recode_encoding; break; case 'c': value = UCHARAT(RExC_parse++); @@ -6780,13 +6801,24 @@ parseit: break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - { - I32 flags = 0; - numlen = 3; - value = grok_oct(--RExC_parse, &numlen, &flags, NULL); - RExC_parse += numlen; - break; - } + { + I32 flags = 0; + numlen = 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + } + recode_encoding: + { + SV* enc = PL_encoding; + value = reg_recode((const char)(U8)value, &enc); + if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) + vWARN(RExC_parse, + "Invalid escape in the specified encoding"); + break; + } default: if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) vWARN2(RExC_parse, diff --git a/t/uni/greek.t b/t/uni/greek.t new file mode 100644 index 0000000..a8102f3 --- /dev/null +++ b/t/uni/greek.t @@ -0,0 +1,119 @@ +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + @INC = '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + unless (PerlIO::Layer->find('perlio')){ + print "1..0 # Skip: PerlIO required\n"; + exit 0; + } + if ($ENV{PERL_CORE_MINITEST}) { + print "1..0 # Skip: no dynamic loading on miniperl, no Encode\n"; + exit 0; + } + $| = 1; + require './test.pl'; +} + +plan tests => 72; + +use encoding "greek"; # iso 8859-7 + +# U+0391, \xC1, \301, GREEK CAPITAL LETTER ALPHA +# U+03B1, \xE1, \341, GREEK SMALL LETTER ALPHA + +ok("\xC1" =~ /\xC1/, '\xC1 to /\xC1/'); +ok("\x{391}" =~ /\xC1/, '\x{391} to /\xC1/'); +ok("\xC1" =~ /\x{C1}/, '\xC1 to /\x{C1}/'); +ok("\x{391}" =~ /\x{C1}/, '\x{391} to /\x{C1}/'); +ok("\xC1" =~ /\301/, '\xC1 to /\301/'); +ok("\x{391}" =~ /\301/, '\x{391} to /\301/'); +ok("\xC1" =~ /\x{391}/, '\xC1 to /\x{391}/'); +ok("\x{391}" =~ /\x{391}/, '\x{391} to /\x{391}/'); + +ok("\xC1" =~ /\xC1/i, '\xC1 to /\xC1/i'); +ok("\xE1" =~ /\xC1/i, '\xE1 to /\xC1/i'); +ok("\xC1" =~ /\xE1/i, '\xC1 to /\xE1/i'); +ok("\xE1" =~ /\xE1/i, '\xE1 to /\xE1/i'); +ok("\xC1" =~ /\x{391}/i, '\xC1 to /\x{391}/i'); +ok("\xE1" =~ /\x{391}/i, '\xE1 to /\x{391}/i'); +ok("\xC1" =~ /\x{3B1}/i, '\xC1 to /\x{3B1}/i'); +ok("\xE1" =~ /\x{3B1}/i, '\xE1 to /\x{3B1}/i'); + +ok("\xC1" =~ /[\xC1]/, '\xC1 to /[\xC1]/'); +ok("\x{391}" =~ /[\xC1]/, '\x{391} to /[\xC1]/'); +ok("\xC1" =~ /[\x{C1}]/, '\xC1 to /[\x{C1}]/'); +ok("\x{391}" =~ /[\x{C1}]/, '\x{391} to /[\x{C1}]/'); +ok("\xC1" =~ /[\301]/, '\xC1 to /[\301]/'); +ok("\x{391}" =~ /[\301]/, '\x{391} to /[\301]/'); +ok("\xC1" =~ /[\x{391}]/, '\xC1 to /[\x{391}]/'); +ok("\x{391}" =~ /[\x{391}]/, '\x{391} to /[\x{391}]/'); + +ok("\xC1" =~ /[\xC1]/i, '\xC1 to /[\xC1]/i'); +ok("\xE1" =~ /[\xC1]/i, '\xE1 to /[\xC1]/i'); +ok("\xC1" =~ /[\xE1]/i, '\xC1 to /[\xE1]/i'); +ok("\xE1" =~ /[\xE1]/i, '\xE1 to /[\xE1]/i'); +ok("\xC1" =~ /[\x{391}]/i, '\xC1 to /[\x{391}]/i'); +ok("\xE1" =~ /[\x{391}]/i, '\xE1 to /[\x{391}]/i'); +ok("\xC1" =~ /[\x{3B1}]/i, '\xC1 to /[\x{3B1}]/i'); +ok("\xE1" =~ /[\x{3B1}]/i, '\xE1 to /[\x{3B1}]/i'); + +ok("\xC1" =~ '\xC1', '\xC1 to \'\xC1\''); +ok("\xC1" =~ '\x{C1}', '\xC1 to \'\x{C1}\''); +ok("\xC1" =~ '\301', '\xC1 to \'\301\''); +ok("\xC1" =~ '\x{391}', '\xC1 to \'\x{391}\''); +ok("\xC1" =~ '[\xC1]', '\xC1 to \'[\xC1]\''); +ok("\xC1" =~ '[\x{C1}]', '\xC1 to \'[\x{C1}]\''); +ok("\xC1" =~ '[\301]', '\xC1 to \'[\301]\''); +ok("\xC1" =~ '[\x{391}]', '\xC1 to \'[\x{391}]\''); + +ok("\xC1" =~ /Á/, '\xC1 to //'); +ok("\xE1" !~ /Á/, '\xE1 to //'); +ok("\xC1" =~ /Á/i, '\xC1 to //i'); +ok("\xE1" =~ /Á/i, '\xE1 to //i'); +ok("\xC1" =~ /[Á]/, '\xC1 to /[]/'); +ok("\xE1" !~ /[Á]/, '\xE1 to /[]/'); +ok("\xC1" =~ /[Á]/i, '\xC1 to /[]/i'); +ok("\xE1" =~ /[Á]/i, '\xE1 to /[]/i'); + +ok("\xC1\xC1" =~ /Á\xC1/, '\xC1\xC1 to /\xC1/'); +ok("\xC1\xC1" =~ /\xC1Á/, '\xC1\xC1 to /\xC1/'); +ok("\xC1\xC1" =~ /Á\xC1/i, '\xC1\xC1 to /\xC1/i'); +ok("\xC1\xC1" =~ /\xC1Á/i, '\xC1\xC1 to /\xC1/i'); +ok("\xC1\xE1" =~ /Á\xC1/i, '\xC1\xE1 to /\xC1/i'); +ok("\xC1\xE1" =~ /\xC1Á/i, '\xC1\xE1 to /\xC1/i'); +ok("\xE1\xE1" =~ /Á\xC1/i, '\xE1\xE1 to /\xC1/i'); +ok("\xE1\xE1" =~ /\xC1Á/i, '\xE1\xE1 to /\xC1/i'); + +# U+038A, \xBA, GREEK CAPITAL LETTER IOTA WITH TONOS +# U+03AF, \xDF, GREEK SMALL LETTER IOTA WITH TONOS + +ok("\x{38A}" =~ /\xBA/, '\x{38A} to /\xBA/'); +ok("\x{38A}" !~ /\xDF/, '\x{38A} to /\xDF/'); +ok("\x{38A}" =~ /\xBA/i, '\x{38A} to /\xBA/i'); +ok("\x{38A}" =~ /\xDF/i, '\x{38A} to /\xDF/i'); +ok("\x{38A}" =~ /[\xBA]/, '\x{38A} to /[\xBA]/'); +ok("\x{38A}" !~ /[\xDF]/, '\x{38A} to /[\xDF]/'); +ok("\x{38A}" =~ /[\xBA]/i, '\x{38A} to /[\xBA]/i'); +ok("\x{38A}" =~ /[\xDF]/i, '\x{38A} to /[\xDF]/i'); + +# \xDF is not LATIN SMALL LETTER SHARP S + +ok("SS" !~ /\xDF/i, 'SS to /\xDF/i'); +ok("Ss" !~ /\xDF/i, 'Ss to /\xDF/i'); +ok("sS" !~ /\xDF/i, 'sS to /\xDF/i'); +ok("ss" !~ /\xDF/i, 'ss to /\xDF/i'); +ok("SS" !~ /ß/i, 'SS to //i'); +ok("Ss" !~ /ß/i, 'Ss to //i'); +ok("sS" !~ /ß/i, 'sS to //i'); +ok("ss" !~ /ß/i, 'ss to //i'); + diff --git a/t/uni/latin2.t b/t/uni/latin2.t new file mode 100644 index 0000000..08928b6 --- /dev/null +++ b/t/uni/latin2.t @@ -0,0 +1,153 @@ +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + @INC = '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + unless (PerlIO::Layer->find('perlio')){ + print "1..0 # Skip: PerlIO required\n"; + exit 0; + } + if ($ENV{PERL_CORE_MINITEST}) { + print "1..0 # Skip: no dynamic loading on miniperl, no Encode\n"; + exit 0; + } + $| = 1; + require './test.pl'; +} + +plan tests => 94; + +use encoding "latin2"; # iso 8859-2 + +# U+00C1, \xC1, \301, LATIN CAPITAL LETTER A WITH ACUTE +# U+0102, \xC3, \402, LATIN CAPITAL LETTER A WITH BREVE +# U+00E1, \xE1, \303, LATIN SMALL LETTER A WITH ACUTE +# U+0103, \xE3, \403, LATIN SMALL LETTER A WITH BREVE + +ok("\xC1" =~ /\xC1/, '\xC1 to /\xC1/'); +ok("\x{C1}" =~ /\x{C1}/, '\x{C1} to /\x{C1}/'); +ok("\xC3" =~ /\xC3/, '\xC3 to /\xC3/'); +ok("\x{102}" =~ /\xC3/, '\x{102} to /\xC3/'); +ok("\xC3" =~ /\x{C3}/, '\xC3 to /\x{C3}/'); +ok("\x{102}" =~ /\x{C3}/, '\x{102} to /\x{C3}/'); +ok("\xC3" =~ /\x{102}/, '\xC3 to /\x{102}/'); +ok("\x{102}" =~ /\x{102}/, '\x{102} to /\x{102}/'); + +ok("\xC1" =~ /\xC1/i, '\xC1 to /\xC1/i'); +ok("\xE1" =~ /\xC1/i, '\xE1 to /\xC1/i'); +ok("\xC1" =~ /\xE1/i, '\xC1 to /\xE1/i'); +ok("\xE1" =~ /\xE1/i, '\xE1 to /\xE1/i'); +ok("\x{102}" =~ /\xC3/i, '\x{102} to /\xC3/i'); +ok("\x{103}" =~ /\xC3/i, '\x{103} to /\xC3/i'); +ok("\x{102}" =~ /\xE3/i, '\x{102} to /\xE3/i'); +ok("\x{103}" =~ /\xE3/i, '\x{103} to /\xE3/i'); + +ok("\xC1" =~ /[\xC1]/, '\xC1 to /[\xC1]/'); +ok("\x{C1}" =~ /[\x{C1}]/, '\x{C1} to /[\x{C1}]/'); +ok("\xC3" =~ /[\xC3]/, '\xC3 to /[\xC3]/'); +ok("\x{102}" =~ /[\xC3]/, '\x{102} to /[\xC3]/'); +ok("\xC3" =~ /[\x{C3}]/, '\xC3 to /[\x{C3}]/'); +ok("\x{102}" =~ /[\x{C3}]/, '\x{102} to /[\x{C3}]/'); +ok("\xC3" =~ /[\x{102}]/, '\xC3 to /[\x{102}]/'); +ok("\x{102}" =~ /[\x{102}]/, '\x{102} to /[\x{102}]/'); + +ok("\xC1" =~ /[\xC1]/i, '\xC1 to /[\xC1]/i'); +ok("\xE1" =~ /[\xC1]/i, '\xE1 to /[\xC1]/i'); +ok("\xC1" =~ /[\xE1]/i, '\xC1 to /[\xE1]/i'); +ok("\xE1" =~ /[\xE1]/i, '\xE1 to /[\xE1]/i'); +ok("\x{102}" =~ /[\xC3]/i, '\x{102} to /[\xC3]/i'); +ok("\x{103}" =~ /[\xC3]/i, '\x{103} to /[\xC3]/i'); +ok("\x{102}" =~ /[\xE3]/i, '\x{102} to /[\xE3]/i'); +ok("\x{103}" =~ /[\xE3]/i, '\x{103} to /[\xE3]/i'); + +ok("\xC1" =~ '\xC1', '\xC1 to \'\xC1\''); +ok("\xC1" =~ '\x{C1}', '\xC1 to \'\x{C1}\''); +ok("\xC3" =~ '\303', '\xC3 to \'\303\''); +ok("\xC3" =~ '\x{102}', '\xC3 to \'\x{102}\''); +ok("\xC1" =~ '[\xC1]', '\xC1 to \'[\xC1]\''); +ok("\xC1" =~ '[\x{C1}]', '\xC1 to \'[\x{C1}]\''); +ok("\xC3" =~ '[\303]', '\xC3 to \'[\303]\''); +ok("\xC3" =~ '[\x{102}]', '\xC3 to \'[\x{102}]\''); + +ok("\xC1" =~ /Á/, '\xC1 to //'); +ok("\xE1" !~ /Á/, '\xE1 to //'); +ok("\xC1" =~ /Á/i, '\xC1 to //i'); +ok("\xE1" =~ /Á/i, '\xE1 to //i'); +ok("\xC1" =~ /[Á]/, '\xC1 to /[]/'); +ok("\xE1" !~ /[Á]/, '\xE1 to /[]/'); +ok("\xC1" =~ /[Á]/i, '\xC1 to /[]/i'); +ok("\xE1" =~ /[Á]/i, '\xE1 to /[]/i'); + +ok("\xC1\xC1" =~ /Á\xC1/, '\xC1\xC1 to /\xC1/'); +ok("\xC1\xC1" =~ /\xC1Á/, '\xC1\xC1 to /\xC1/'); +ok("\xC1\xC1" =~ /Á\xC1/i, '\xC1\xC1 to /\xC1/i'); +ok("\xC1\xC1" =~ /\xC1Á/i, '\xC1\xC1 to /\xC1/i'); +ok("\xC1\xE1" =~ /Á\xC1/i, '\xC1\xE1 to /\xC1/i'); +ok("\xC1\xE1" =~ /\xC1Á/i, '\xC1\xE1 to /\xC1/i'); +ok("\xE1\xE1" =~ /Á\xC1/i, '\xE1\xE1 to /\xC1/i'); +ok("\xE1\xE1" =~ /\xC1Á/i, '\xE1\xE1 to /\xC1/i'); + +# \xDF is LATIN SMALL LETTER SHARP S + +ok("\xDF" =~ /\xDF/, '\xDF to /\xDF/'); +ok("\xDF" =~ /\xDF/i, '\xDF to /\xDF/i'); +ok("\xDF" =~ /[\xDF]/, '\xDF to /[\xDF]/'); +ok("\xDF" =~ /[\xDF]/i, '\xDF to /[\xDF]/i'); +ok("\xDF" =~ /ß/, '\xDF to //'); +ok("\xDF" =~ /ß/i, '\xDF to //i'); +ok("\xDF" =~ /[ß]/, '\xDF to /[]/'); +ok("\xDF" =~ /[ß]/i, '\xDF to /[]/i'); + +ok("SS" =~ /\xDF/i, 'SS to /\xDF/i'); +ok("Ss" =~ /\xDF/i, 'Ss to /\xDF/i'); +ok("sS" =~ /\xDF/i, 'sS to /\xDF/i'); +ok("ss" =~ /\xDF/i, 'ss to /\xDF/i'); +ok("SS" =~ /ß/i, 'SS to //i'); +ok("Ss" =~ /ß/i, 'Ss to //i'); +ok("sS" =~ /ß/i, 'sS to //i'); +ok("ss" =~ /ß/i, 'ss to //i'); + +ok("\xC3" =~ /\303/, '\xC1 to /\303/'); +ok("\303" =~ /\303/, '\303 to /\303/'); +ok("\xC3" =~ /\303/i, '\xC1 to /\303/i'); +ok("\xE3" =~ /\303/i, '\xC1 to /\303/i'); +ok("\xC3" =~ /[\303]/, '\xC1 to /[\303]/'); +ok("\303" =~ /[\303]/, '\303 to /[\303]/'); +ok("\xC3" =~ /[\303]/i, '\xC1 to /[\303]/i'); +ok("\xE3" =~ /[\303]/i, '\xC1 to /[\303]/i'); + +ok("\xC3" =~ /\402/, '\xC1 to /\402/'); +ok("\402" =~ /\402/, '\402 to /\402/'); +ok("\xC3" =~ /\402/i, '\xC1 to /\402/i'); +ok("\xE3" =~ /\402/i, '\xC1 to /\402/i'); +ok("\xC3" =~ /[\402]/, '\xC1 to /[\402]/'); +ok("\402" =~ /[\402]/, '\402 to /[\402]/'); +ok("\xC3" =~ /[\402]/i, '\xC1 to /[\402]/i'); +ok("\xE3" =~ /[\402]/i, '\xC1 to /[\402]/i'); + +{ + my $re = '(?i:\xC1)'; + + ok("\xC1" =~ $re, '\xC1 to (?i:\xC1)'); + ok("\xE1" =~ $re, '\xE1 to (?i:\xC1)'); + + utf8::downgrade($re); + + ok("\xC1" =~ $re, '\xC1 to (?i:\xC1) down'); + ok("\xE1" =~ $re, '\xE1 to (?i:\xC1) down'); + + utf8::upgrade($re); + + ok("\xC1" =~ $re, '\xC1 to (?i:\xC1) up'); + ok("\xE1" =~ $re, '\xE1 to (?i:\xC1) up'); +} + diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t index 606a84a..354156a 100644 --- a/t/uni/tr_utf8.t +++ b/t/uni/tr_utf8.t @@ -31,7 +31,7 @@ BEGIN { } use strict; -use Test::More tests => 7; +use Test::More tests => 8; use encoding 'utf8'; @@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana"); $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/; is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1×±34ת6ײח9", "[perl #16843]"); } + +{ + # [perl #40641] + my $str = qq/Gebääääääääääääääääääääude/; + my $reg = qr/Gebääääääääääääääääääääude/; + ok($str =~ /$reg/, "[perl #40641]"); +} + __END__