From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Tue, 6 Nov 2001 03:05:34 +0000 (+0000)
Subject: Implement the encoding pragma for regex literals.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a72c75842468bcd2a7cf17032844c4040a5a31e2;p=p5sagit%2Fp5-mst-13.2.git

Implement the encoding pragma for regex literals.

p4raw-id: //depot/perl@12864
---

diff --git a/lib/encoding.pm b/lib/encoding.pm
index 6f5970f..94ee323 100644
--- a/lib/encoding.pm
+++ b/lib/encoding.pm
@@ -57,14 +57,33 @@ encoding pragma you can change this default.
 The pragma is a per script, not a per block lexical.  Only the last
 C<use encoding> matters, and it affects B<the whole script>.
 
+Notice that only literals (string or regular expression) having only
+legacy code points are affected: if you mix data like this
+
+	\xDF\x{100}
+
+the data is assumed to be in (Latin 1 and) Unicode, not in your native
+encoding.  In other words, this will match in "greek":
+
+	"\xDF" =~ /\x{3af}/
+
+but this will not
+
+	"\xDF\x{100}" =~ /\x{3af}\x{100}/
+
+since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
+because of the C<\x{100}> on the left.  You should not be mixing your
+legacy data and Unicode in the same string.
+
 If no encoding is specified, the environment variable L<PERL_ENCODING>
 is consulted.  If that fails, "latin1" (ISO 8859-1) is assumed.  If no
 encoding can be found, C<Unknown encoding '...'> error will be thrown.
 
 =head1 KNOWN PROBLEMS
 
-Literals in regular expressions are not affected by this pragma.
-They very probably should.
+For native multibyte encodings (either fixed or variable length)
+the current implementation of the regular expressions may introduce
+recoding errors for longer regular expression literals than 127 bytes.
 
 =head1 SEE ALSO
 
diff --git a/lib/encoding.t b/lib/encoding.t
index 0363441..8b14c88 100644
--- a/lib/encoding.t
+++ b/lib/encoding.t
@@ -1,4 +1,4 @@
-print "1..15\n";
+print "1..19\n";
 
 use encoding "latin1"; # ignored (overwritten by the next line)
 use encoding "greek";  # iso 8859-7 (no "latin" alias, surprise...)
@@ -68,3 +68,17 @@ print "ok 14\n";
 print "not " unless ord(substr($c, 2, 1)) == 0x3af;
 print "ok 15\n";
 
+# regex literals
+
+print "not " unless "\xDF"    =~ /\x{3AF}/;
+print "ok 16\n";
+
+print "not " unless "\x{3AF}" =~ /\xDF/;
+print "ok 17\n";
+
+print "not " unless "\xDF"    =~ /\xDF/;
+print "ok 18\n";
+
+print "not " unless "\x{3AF}" =~ /\x{3AF}/;
+print "ok 19\n";
+
diff --git a/regcomp.c b/regcomp.c
index efc1275..cd3857e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3155,6 +3155,21 @@ tryagain:
 	break;
     }
 
+    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) {
+	 STRLEN oldlen = STR_LEN(ret);
+	 SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+	 char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+	 STRLEN newlen = SvCUR(sv);
+	 if (!SIZE_ONLY) {
+	      DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+				    oldlen, STRING(ret), newlen, s));
+	      Copy(s, STRING(ret), newlen, char);
+	      STR_LEN(ret) += newlen - oldlen;
+	      RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+	 } else
+	      RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+    }
+
     return(ret);
 }
 
diff --git a/regexec.c b/regexec.c
index 198e99e..60d93f7 100644
--- a/regexec.c
+++ b/regexec.c
@@ -408,7 +408,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
 	PL_reg_flags |= RF_utf8;
 
     if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
-	DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
+	DEBUG_r(PerlIO_printf(Perl_debug_log,
+			      "String too short... [re_intuit_start]\n"));
 	goto fail;
     }
     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
@@ -1474,19 +1475,10 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     }
 
     minlen = prog->minlen;
-    if (do_utf8 && !(prog->reganch & ROPT_CANY_SEEN)) {
-        if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) {
-	    DEBUG_r(PerlIO_printf(Perl_debug_log,
-				  "Too short (in characters)...\n"));
-	    goto phooey;
-	}
-    }
-    else {
-        if (strend - startpos < minlen) {
-	    DEBUG_r(PerlIO_printf(Perl_debug_log,
-				  "Too short (in bytes)...\n"));
-	    goto phooey;
-	}
+    if (strend - startpos < minlen) {
+        DEBUG_r(PerlIO_printf(Perl_debug_log,
+			      "String too short [regexec_flags]...\n"));
+	goto phooey;
     }
 
     /* Check validity of program. */
@@ -2215,14 +2207,26 @@ S_regmatch(pTHX_ regnode *prog)
 		char *l = locinput;
 		char *e = s + ln;
 		STRLEN len;
+
 		if (do_utf8)
 		    while (s < e) {
+			UV uv;
+
 			if (l >= PL_regeol)
 			    sayNO;
-			if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
-			    sayNO;
-			s += len;
-			l += len;
+			uv = NATIVE_TO_UNI(*(U8*)s);
+			if (UTF8_IS_START(uv)) {
+			     len = UTF8SKIP(s);
+			     if (memNE(s, l, len))
+				  sayNO;
+			     l += len;
+			     s += len;
+			} else {
+			     if (uv != utf8_to_uvchr((U8*)l, &len))
+				  sayNO;
+			     l += len;
+			     s ++;
+			}
 		    }
 		else
 		    while (s < e) {
@@ -2230,8 +2234,8 @@ S_regmatch(pTHX_ regnode *prog)
 			    sayNO;
 			if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
 			    sayNO;
-			s+=len;
-			l++;
+			s += len;
+			l ++;
 		    }
 		locinput = l;
 		nextchr = UCHARAT(locinput);