From: Inaba Hiroto <inaba@st.rim.or.jp>
Date: Sat, 1 Feb 2003 21:58:20 +0000 (+0900)
Subject: [Patch] parsing under encoding (Re: [Encode] HEADS-UP; $Encode::VERSION++ to enhance... 
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=220e2d4e806a3c9c0a9f9b31667e8be830cbc55b;p=p5sagit%2Fp5-mst-13.2.git

[Patch] parsing under encoding (Re: [Encode] HEADS-UP; $Encode::VERSION++ to enhance filter option)([perl #16823])
Message-ID: <3E3BC46B.6C687CFD@st.rim.or.jp>

p4raw-id: //depot/perl@18660
---

diff --git a/embed.fnc b/embed.fnc
index a2009db..8951a36 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -750,6 +750,8 @@ Amd	|char*	|sv_pvn_force	|SV* sv|STRLEN* lp
 Apd	|char*	|sv_pvutf8n_force|SV* sv|STRLEN* lp
 Apd	|char*	|sv_pvbyten_force|SV* sv|STRLEN* lp
 Apd	|char*	|sv_recode_to_utf8	|SV* sv|SV *encoding
+Apd	|bool	|sv_cat_decode	|SV* dsv|SV *encoding|SV *ssv|int *offset \
+				|char* tstr|int tlen
 Apd	|char*	|sv_reftype	|SV* sv|int ob
 Apd	|void	|sv_replace	|SV* sv|SV* nsv
 Apd	|void	|sv_report_used
diff --git a/embed.h b/embed.h
index c3de3cf..887eee1 100644
--- a/embed.h
+++ b/embed.h
@@ -1021,6 +1021,7 @@
 #define sv_pvutf8n_force	Perl_sv_pvutf8n_force
 #define sv_pvbyten_force	Perl_sv_pvbyten_force
 #define sv_recode_to_utf8	Perl_sv_recode_to_utf8
+#define sv_cat_decode		Perl_sv_cat_decode
 #define sv_reftype		Perl_sv_reftype
 #define sv_replace		Perl_sv_replace
 #define sv_report_used		Perl_sv_report_used
@@ -3470,6 +3471,7 @@
 #define sv_pvutf8n_force(a,b)	Perl_sv_pvutf8n_force(aTHX_ a,b)
 #define sv_pvbyten_force(a,b)	Perl_sv_pvbyten_force(aTHX_ a,b)
 #define sv_recode_to_utf8(a,b)	Perl_sv_recode_to_utf8(aTHX_ a,b)
+#define sv_cat_decode(a,b,c,d,e,f)	Perl_sv_cat_decode(aTHX_ a,b,c,d,e,f)
 #define sv_reftype(a,b)		Perl_sv_reftype(aTHX_ a,b)
 #define sv_replace(a,b)		Perl_sv_replace(aTHX_ a,b)
 #define sv_report_used()	Perl_sv_report_used(aTHX)
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index c85cbbe..548c5ab 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -271,6 +271,19 @@ sub predefine_encodings{
 		return $octets;
 	    };
 	}
+	*cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
+	    my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
+	    my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
+	    use bytes;
+	    if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
+		$$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
+		$$rpos = $npos + length($trm);
+		return 1;
+	    }
+	    $$rdst .= substr($$rsrc, $pos);
+	    $$rpos = length($$rsrc);
+	    return '';
+	};
 	$Encode::Encoding{utf8} =
 	    bless {Name => "utf8"} => "Encode::utf8";
     }
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index 0461690..c4cb98e 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -59,7 +59,7 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
 
 static SV *
 encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
-	      int check)
+	      int check, STRLEN * offset, SV * term, int * retcode)
 {
     STRLEN slen;
     U8 *s = (U8 *) SvPV(src, slen);
@@ -72,20 +72,30 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
     SV *dst = sv_2mortal(newSV(slen+1));
     U8 *d = (U8 *)SvPVX(dst);
     STRLEN dlen = SvLEN(dst)-1;
-    int code;
+    int code = 0;
+    STRLEN trmlen = 0;
+    U8 *trm = term ? SvPV(term, trmlen) : NULL;
+
+    if (offset) {
+      s += *offset;
+      slen -= *offset;
+      tlen = slen;
+    }
 
-    if (!slen){
+    if (slen <= 0){
 	SvCUR_set(dst, 0);
 	SvPOK_only(dst);
 	goto ENCODE_END;
     }
 
-    while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) ) 
+    while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check,
+			     trm, trmlen)) ) 
     {
 	SvCUR_set(dst, dlen+ddone);
 	SvPOK_only(dst);
 	
-	if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){
+	if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
+	    code == ENCODE_FOUND_TERM) {
 	    break;
 	}
 	switch (code) {
@@ -233,8 +243,12 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
     }
 #endif
 
+    if (offset) 
+      *offset += sdone + slen;
+
  ENCODE_END:
     *SvEND(dst) = '\0';
+    if (retcode) *retcode = code;
     return dst;
 }
 
@@ -381,6 +395,33 @@ CODE:
 }
 
 void
+Method_cat_decode(obj, dst, src, off, term, check = 0)
+SV *	obj
+SV *	dst
+SV *	src
+SV *	off
+SV *	term
+int	check
+CODE:
+{
+    encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
+    STRLEN offset = (STRLEN)SvIV(off);
+    int code = 0;
+    if (SvUTF8(src)) {
+    	sv_utf8_downgrade(src, FALSE);
+    }
+    sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check,
+				&offset, term, &code));
+    SvIVX(off) = (IV)offset;
+    if (code == ENCODE_FOUND_TERM) {
+	ST(0) = &PL_sv_yes;
+    }else{
+	ST(0) = &PL_sv_no;
+    }
+    XSRETURN(1);
+}
+
+void
 Method_decode(obj,src,check = 0)
 SV *	obj
 SV *	src
@@ -391,7 +432,8 @@ CODE:
     if (SvUTF8(src)) {
     	sv_utf8_downgrade(src, FALSE);
     }
-    ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
+    ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check,
+			  NULL, Nullsv, NULL);
     SvUTF8_on(ST(0));
     XSRETURN(1);
 }
@@ -405,7 +447,8 @@ CODE:
 {
     encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
     sv_utf8_upgrade(src);
-    ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
+    ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check,
+			  NULL, Nullsv, NULL);
     XSRETURN(1);
 }
 
diff --git a/ext/Encode/Encode/encode.h b/ext/Encode/Encode/encode.h
index b860578..fc8301a 100644
--- a/ext/Encode/Encode/encode.h
+++ b/ext/Encode/Encode/encode.h
@@ -76,7 +76,8 @@ struct encode_s
 /* See comment at top of file for deviousness */
 
 extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
-                     U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
+                     U8 *dst, STRLEN dlen, STRLEN *dout, int approx,
+		     const U8 *term, STRLEN tlen);
 
 extern void Encode_DefineEncoding(encode_t *enc);
 
@@ -86,6 +87,7 @@ extern void Encode_DefineEncoding(encode_t *enc);
 #define ENCODE_PARTIAL  2
 #define ENCODE_NOREP    3
 #define ENCODE_FALLBACK 4
+#define ENCODE_FOUND_TERM 5
 
 #define FBCHAR_UTF8		"\xEF\xBF\xBD"
 
diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c
index 4c2a7cf..6a08cfd 100644
--- a/ext/Encode/encengine.c
+++ b/ext/Encode/encengine.c
@@ -93,13 +93,13 @@ we add a flag to re-add the removed byte to the source we could handle
 
 int
 do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
-	  STRLEN dlen, STRLEN * dout, int approx)
+	  STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
 {
     const U8 *s = src;
     const U8 *send = s + *slen;
     const U8 *last = s;
     U8 *d = dst;
-    U8 *dend = d + dlen;
+    U8 *dend = d + dlen, *dlast = d;
     int code = 0;
     while (s < send) {
 	encpage_t *e = enc;
@@ -133,6 +133,11 @@ do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
 		    if (approx && (e->slen & 0x80))
 			code = ENCODE_FALLBACK;
 		    last = s;
+		    if (term && d-dlast == tlen && memEQ(dlast, term, tlen)) {
+		      code = ENCODE_FOUND_TERM;
+		      break;
+		    }
+		    dlast = d;
 		}
 	    }
 	    else {
diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm
index 1876cb7..4e842b6 100644
--- a/ext/Encode/lib/Encode/Encoding.pm
+++ b/ext/Encode/lib/Encode/Encoding.pm
@@ -130,6 +130,13 @@ replacement character.
 
 =back
 
+=item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check])
+
+MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>.
+Decoding will terminate when $terminator (a string) appears in output.
+I<$offset> will be modified to the last $octets position at end of decode.
+Returns true if $terminator appears output, else returns false.
+
 =head2 Other methods defined in Encode::Encodings
 
 You do not have to override methods shown below unless you have to.
diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm
index d49ec6c..52e5e5c 100644
--- a/ext/Encode/lib/Encode/JP/JIS7.pm
+++ b/ext/Encode/lib/Encode/JP/JIS7.pm
@@ -60,9 +60,52 @@ sub encode($$;$)
     return $octet;
 }
 
+#
+# cat_decode
+#
+my $re_scan_jis_g = qr{
+   \G ( ($RE{JIS_0212}) |  $RE{JIS_0208}  |
+        ($RE{ISO_ASC})  | ($RE{JIS_KANA}) | )
+      ([^\e]*)
+}x;
+sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk)
+    my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
+    my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
+    local ${^ENCODING};
+    use bytes;
+    my $opos = pos($$rsrc);
+    pos($$rsrc) = $pos;
+    while ($$rsrc =~ /$re_scan_jis_g/gc) {
+	my ($esc, $esc_0212, $esc_asc, $esc_kana, $chunk) =
+	  ($1, $2, $3, $4, $5);
+
+	unless ($chunk) { $esc or last;  next; }
+
+	if ($esc && !$esc_asc) {
+	    $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
+	    if ($esc_kana) {
+		$chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
+	    } elsif ($esc_0212) {
+		$chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
+	    }
+	    $chunk = Encode::decode('euc-jp', $chunk, 0);
+	}
+	elsif ((my $npos = index($chunk, $trm)) >= 0) {
+	    $$rdst .= substr($chunk, 0, $npos + length($trm));
+	    $$rpos += length($esc) + $npos + length($trm);
+	    pos($$rsrc) = $opos;
+	    return 1;
+	}
+	$$rdst .= $chunk;
+	$$rpos = pos($$rsrc);
+    }
+    $$rpos = pos($$rsrc);
+    pos($$rsrc) = $opos;
+    return '';
+}
 
 # JIS<->EUC
-our $re_scan_jis = qr{
+my $re_scan_jis = qr{
    (?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*)
 }x;
 
diff --git a/global.sym b/global.sym
index 9e3ddcd..3a8b5b9 100644
--- a/global.sym
+++ b/global.sym
@@ -469,6 +469,7 @@ Perl_sv_pos_b2u
 Perl_sv_pvutf8n_force
 Perl_sv_pvbyten_force
 Perl_sv_recode_to_utf8
+Perl_sv_cat_decode
 Perl_sv_reftype
 Perl_sv_replace
 Perl_sv_report_used
diff --git a/lib/utf8.t b/lib/utf8.t
index 8072c87..6728238 100644
--- a/lib/utf8.t
+++ b/lib/utf8.t
@@ -37,7 +37,7 @@ no utf8; # Ironic, no?
 #
 #
 
-plan tests => 98;
+plan tests => 99;
 
 {
     # bug id 20001009.001
@@ -323,3 +323,8 @@ END
     is("@i", "60 62 58 50 52 48 70 72 68", "utf8 heredoc index and rindex");
 }
 
+{
+    use utf8;
+    eval qq{is(q \xc3\xbc test \xc3\xbc, qq\xc2\xb7 test \xc2\xb7,
+	       "utf8 quote delimiters [perl #16823]");};
+}
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 695a44c..59b80c3 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -4788,6 +4788,23 @@ The pointer to the PV of the dsv is returned.
 =for hackers
 Found in file utf8.c
 
+=item sv_cat_decode
+
+The encoding is assumed to be an Encode object, the PV of the ssv is
+assumed to be octets in that encoding and decoding the input starts
+from the position which (PV + *offset) pointed to.  The dsv will be
+concatenated the decoded UTF-8 string from ssv.  Decoding will terminate
+when the string tstr appears in decoding output or the input ends on
+the PV of the ssv. The value which the offset points will be modified
+to the last input position on the ssv.
+
+Returns TRUE if the terminator was found, else returns FALSE.
+
+	bool	sv_cat_decode(SV* dsv, SV *encoding, SV *ssv, int *offset, char* tstr, int tlen)
+
+=for hackers
+Found in file sv.c
+
 =item sv_recode_to_utf8
 
 The encoding is assumed to be an Encode object, on entry the PV
diff --git a/sv.c b/sv.c
index 4c148d8..89792cf 100644
--- a/sv.c
+++ b/sv.c
@@ -11168,14 +11168,14 @@ The PV of the sv is returned.
 char *
 Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding)
 {
-    if (SvPOK(sv) && !DO_UTF8(sv) && SvROK(encoding)) {
-	int vary = FALSE;
+    if (SvPOK(sv) && !SvUTF8(sv) && !IN_BYTES && SvROK(encoding)) {
 	SV *uni;
 	STRLEN len;
 	char *s;
 	dSP;
 	ENTER;
 	SAVETMPS;
+	save_re_context();
 	PUSHMARK(sp);
 	EXTEND(SP, 3);
 	XPUSHs(encoding);
@@ -11196,13 +11196,6 @@ Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding)
 	uni = POPs;
 	PUTBACK;
 	s = SvPV(uni, len);
-	{
-	    U8 *t = (U8 *)s, *e = (U8 *)s + len;
-	    while (t < e) {
-		if ((vary = !UTF8_IS_INVARIANT(*t++)))
-		    break;
-	    }
-	}
 	if (s != SvPVX(sv)) {
 	    SvGROW(sv, len + 1);
 	    Move(s, SvPVX(sv), len, char);
@@ -11211,12 +11204,54 @@ Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding)
 	}
 	FREETMPS;
 	LEAVE;
-	if (vary)
-	    SvUTF8_on(sv);
 	SvUTF8_on(sv);
     }
     return SvPVX(sv);
 }
 
+/*
+=for apidoc sv_cat_decode
+
+The encoding is assumed to be an Encode object, the PV of the ssv is
+assumed to be octets in that encoding and decoding the input starts
+from the position which (PV + *offset) pointed to.  The dsv will be
+concatenated the decoded UTF-8 string from ssv.  Decoding will terminate
+when the string tstr appears in decoding output or the input ends on
+the PV of the ssv. The value which the offset points will be modified
+to the last input position on the ssv.
 
+Returns TRUE if the terminator was found, else returns FALSE.
+
+=cut */
+
+bool
+Perl_sv_cat_decode(pTHX_ SV *dsv, SV *encoding,
+		   SV *ssv, int *offset, char *tstr, int tlen)
+{
+    if (SvPOK(ssv) && SvPOK(dsv) && SvROK(encoding) && offset) {
+        bool ret = FALSE;
+	SV *offsv;
+	dSP;
+	ENTER;
+	SAVETMPS;
+	save_re_context();
+	PUSHMARK(sp);
+	EXTEND(SP, 6);
+	XPUSHs(encoding);
+	XPUSHs(dsv);
+	XPUSHs(ssv);
+	XPUSHs(offsv = sv_2mortal(newSViv(*offset)));
+	XPUSHs(sv_2mortal(newSVpvn(tstr, tlen)));
+	PUTBACK;
+	call_method("cat_decode", G_SCALAR);
+	SPAGAIN;
+	ret = SvTRUE(TOPs);
+	*offset = SvIV(offsv);
+	PUTBACK;
+	FREETMPS;
+	LEAVE;
+	return ret;
+    }
+    Perl_croak(aTHX_ "Invalid argument to sv_cat_decode.");
+}
 
diff --git a/t/uni/tr_7jis.t b/t/uni/tr_7jis.t
index 894ff4c..6e74f1d 100644
--- a/t/uni/tr_7jis.t
+++ b/t/uni/tr_7jis.t
@@ -53,10 +53,10 @@ is($str, $katakana, "tr// # hiragana -> katakana");
 $str = $katakana; $str =~ tr/ァ-ン/ぁ-ん/;
 is($str, $hiragana, "tr// # hiragana -> katakana");
 
-$str = $hiragana; eval qq{\$str =~ tr/ぁ-ん/ァ-ン/};
-is($str, $katakana, "eval qq{tr//} # hiragana -> katakana");
-$str = $katakana; eval qq{\$str =~ tr/ァ-ン/ぁ-ん/};
-is($str, $hiragana, "eval qq{tr//} # hiragana -> katakana");
+$str = $hiragana; eval qq(\$str =~ tr/ぁ-ん/ァ-ン/);
+is($str, $katakana, "eval qq(tr//) # hiragana -> katakana");
+$str = $katakana; eval qq(\$str =~ tr/ァ-ン/ぁ-ん/);
+is($str, $hiragana, "eval qq(tr//) # hiragana -> katakana");
 
 $str = $hiragana; $str =~ s/([ぁ-ん])/$h2k{$1}/go;
 is($str, $katakana, "s/// # hiragana -> katakana");
diff --git a/toke.c b/toke.c
index e7834c4..6b27a37 100644
--- a/toke.c
+++ b/toke.c
@@ -6882,6 +6882,10 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
     register char *to;			/* current position in the sv's data */
     I32 brackets = 1;			/* bracket nesting level */
     bool has_utf8 = FALSE;		/* is there any utf8 content? */
+    I32 termcode;			/* terminating char. code */
+    U8 termstr[UTF8_MAXLEN];		/* terminating string */
+    STRLEN termlen;			/* length of terminating string */
+    char *last = NULL;			/* last position for nesting bracket */
 
     /* skip space before the delimiter */
     if (isSPACE(*s))
@@ -6892,8 +6896,16 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
     /* after skipping whitespace, the next character is the terminator */
     term = *s;
-    if (!UTF8_IS_INVARIANT((U8)term) && UTF)
-	has_utf8 = TRUE;
+    if (!UTF) {
+	termcode = termstr[0] = term;
+	termlen = 1;
+    }
+    else {
+	termcode = utf8_to_uvchr(s, &termlen);
+	Copy(s, termstr, termlen, U8);
+	if (!UTF8_IS_INVARIANT(term))
+	    has_utf8 = TRUE;
+    }
 
     /* mark where we are */
     PL_multi_start = CopLINE(PL_curcop);
@@ -6901,21 +6913,92 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
     /* find corresponding closing delimiter */
     if (term && (tmps = strchr("([{< )]}> )]}>",term)))
-	term = tmps[5];
+	termcode = termstr[0] = term = tmps[5];
+
     PL_multi_close = term;
 
     /* create a new SV to hold the contents.  87 is leak category, I'm
        assuming.  79 is the SV's initial length.  What a random number. */
     sv = NEWSV(87,79);
     sv_upgrade(sv, SVt_PVIV);
-    SvIVX(sv) = term;
+    SvIVX(sv) = termcode;
     (void)SvPOK_only(sv);		/* validate pointer */
 
     /* move past delimiter and try to read a complete string */
     if (keep_delims)
-	sv_catpvn(sv, s, 1);
-    s++;
+	sv_catpvn(sv, s, termlen);
+    s += termlen;
     for (;;) {
+	if (PL_encoding && !UTF) {
+	    bool cont = TRUE;
+
+	    while (cont) {
+		int offset = s - SvPVX(PL_linestr);
+		bool found = sv_cat_decode(sv, PL_encoding, PL_linestr,
+					   &offset, termstr, termlen);
+		char *ns = SvPVX(PL_linestr) + offset;
+		char *svlast = SvEND(sv) - 1;
+
+		for (; s < ns; s++) {
+		    if (*s == '\n' && !PL_rsfp)
+			CopLINE_inc(PL_curcop);
+		}
+		if (!found)
+		    goto read_more_line;
+		else {
+		    /* handle quoted delimiters */
+		    if (*(svlast-1) == '\\') {
+			char *t;
+			for (t = svlast-2; t >= SvPVX(sv) && *t == '\\';)
+			    t--;
+			if ((svlast-1 - t) % 2) {
+			    if (!keep_quoted) {
+				*(svlast-1) = term;
+				*svlast = '\0';
+				SvCUR_set(sv, SvCUR(sv) - 1);
+			    }
+			    continue;
+			}
+		    }
+		    if (PL_multi_open == PL_multi_close) {
+			cont = FALSE;
+		    }
+		    else {
+			char *t, *w;
+			if (!last)
+			    last = SvPVX(sv);
+			for (w = t = last; t < svlast; w++, t++) {
+			    /* At here, all closes are "was quoted" one,
+			       so we don't check PL_multi_close. */
+			    if (*t == '\\') {
+				if (!keep_quoted && *(t+1) == PL_multi_open)
+				    t++;
+				else
+				    *w++ = *t++;
+			    }
+			    else if (*t == PL_multi_open)
+				brackets++;
+
+			    *w = *t;
+			}
+			if (w < t) {
+			    *w++ = term;
+			    *w = '\0';
+			    SvCUR_set(sv, w - SvPVX(sv));
+			}
+			last = w;
+			if (--brackets <= 0)
+			    cont = FALSE;
+		    }
+		}
+	    }
+	    if (!keep_delims) {
+		SvCUR_set(sv, SvCUR(sv) - 1);
+		*SvEND(sv) = '\0';
+	    }
+	    break;
+	}
+
     	/* extend sv if need be */
 	SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1);
 	/* set 'to' to the next character in the sv's string */
@@ -6937,8 +7020,12 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 		}
 		/* terminate when run out of buffer (the for() condition), or
 		   have found the terminator */
-		else if (*s == term)
-		    break;
+		else if (*s == term) {
+		    if (termlen == 1)
+			break;
+		    if (s+termlen <= PL_bufend && memEQ(s, termstr, termlen))
+			break;
+		}
 		else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
 		    has_utf8 = TRUE;
 		*to = *s;
@@ -7000,6 +7087,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 	    to[-1] = '\n';
 #endif
 	
+     read_more_line:
 	/* if we're out of file, or a read fails, bail and reset the current
 	   line marker so we can report where the unterminated string began
 	*/
@@ -7030,15 +7118,15 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
     /* at this point, we have successfully read the delimited string */
 
-    if (keep_delims)
-	sv_catpvn(sv, s, 1);
-    if (has_utf8)
+    if (!PL_encoding || UTF) {
+	if (keep_delims)
+	    sv_catpvn(sv, s, termlen);
+	s += termlen;
+    }
+    if (has_utf8 || PL_encoding)
 	SvUTF8_on(sv);
-    else if (PL_encoding)
-	sv_recode_to_utf8(sv, PL_encoding);
 
     PL_multi_end = CopLINE(PL_curcop);
-    s++;
 
     /* if we allocated too much space, give some back */
     if (SvCUR(sv) + 5 < SvLEN(sv)) {