Apd |char* |sv_pvutf8n_force|SV* sv|STRLEN* lp
Apd |char* |sv_pvbyten_force|SV* sv|STRLEN* lp
Apd |char* |sv_recode_to_utf8 |SV* sv|SV *encoding
+Apd |bool |sv_cat_decode |SV* dsv|SV *encoding|SV *ssv|int *offset \
+ |char* tstr|int tlen
Apd |char* |sv_reftype |SV* sv|int ob
Apd |void |sv_replace |SV* sv|SV* nsv
Apd |void |sv_report_used
#define sv_pvutf8n_force Perl_sv_pvutf8n_force
#define sv_pvbyten_force Perl_sv_pvbyten_force
#define sv_recode_to_utf8 Perl_sv_recode_to_utf8
+#define sv_cat_decode Perl_sv_cat_decode
#define sv_reftype Perl_sv_reftype
#define sv_replace Perl_sv_replace
#define sv_report_used Perl_sv_report_used
#define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b)
#define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b)
#define sv_recode_to_utf8(a,b) Perl_sv_recode_to_utf8(aTHX_ a,b)
+#define sv_cat_decode(a,b,c,d,e,f) Perl_sv_cat_decode(aTHX_ a,b,c,d,e,f)
#define sv_reftype(a,b) Perl_sv_reftype(aTHX_ a,b)
#define sv_replace(a,b) Perl_sv_replace(aTHX_ a,b)
#define sv_report_used() Perl_sv_report_used(aTHX)
return $octets;
};
}
+ *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
+ my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
+ my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
+ use bytes;
+ if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
+ $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
+ $$rpos = $npos + length($trm);
+ return 1;
+ }
+ $$rdst .= substr($$rsrc, $pos);
+ $$rpos = length($$rsrc);
+ return '';
+ };
$Encode::Encoding{utf8} =
bless {Name => "utf8"} => "Encode::utf8";
}
static SV *
encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
- int check)
+ int check, STRLEN * offset, SV * term, int * retcode)
{
STRLEN slen;
U8 *s = (U8 *) SvPV(src, slen);
SV *dst = sv_2mortal(newSV(slen+1));
U8 *d = (U8 *)SvPVX(dst);
STRLEN dlen = SvLEN(dst)-1;
- int code;
+ int code = 0;
+ STRLEN trmlen = 0;
+ U8 *trm = term ? SvPV(term, trmlen) : NULL;
+
+ if (offset) {
+ s += *offset;
+ slen -= *offset;
+ tlen = slen;
+ }
- if (!slen){
+ if (slen <= 0){
SvCUR_set(dst, 0);
SvPOK_only(dst);
goto ENCODE_END;
}
- while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) )
+ while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check,
+ trm, trmlen)) )
{
SvCUR_set(dst, dlen+ddone);
SvPOK_only(dst);
- if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){
+ if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
+ code == ENCODE_FOUND_TERM) {
break;
}
switch (code) {
}
#endif
+ if (offset)
+ *offset += sdone + slen;
+
ENCODE_END:
*SvEND(dst) = '\0';
+ if (retcode) *retcode = code;
return dst;
}
}
void
+Method_cat_decode(obj, dst, src, off, term, check = 0)
+SV * obj
+SV * dst
+SV * src
+SV * off
+SV * term
+int check
+CODE:
+{
+ encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
+ STRLEN offset = (STRLEN)SvIV(off);
+ int code = 0;
+ if (SvUTF8(src)) {
+ sv_utf8_downgrade(src, FALSE);
+ }
+ sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check,
+ &offset, term, &code));
+ SvIVX(off) = (IV)offset;
+ if (code == ENCODE_FOUND_TERM) {
+ ST(0) = &PL_sv_yes;
+ }else{
+ ST(0) = &PL_sv_no;
+ }
+ XSRETURN(1);
+}
+
+void
Method_decode(obj,src,check = 0)
SV * obj
SV * src
if (SvUTF8(src)) {
sv_utf8_downgrade(src, FALSE);
}
- ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
+ ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check,
+ NULL, Nullsv, NULL);
SvUTF8_on(ST(0));
XSRETURN(1);
}
{
encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
sv_utf8_upgrade(src);
- ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
+ ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check,
+ NULL, Nullsv, NULL);
XSRETURN(1);
}
/* See comment at top of file for deviousness */
extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
- U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
+ U8 *dst, STRLEN dlen, STRLEN *dout, int approx,
+ const U8 *term, STRLEN tlen);
extern void Encode_DefineEncoding(encode_t *enc);
#define ENCODE_PARTIAL 2
#define ENCODE_NOREP 3
#define ENCODE_FALLBACK 4
+#define ENCODE_FOUND_TERM 5
#define FBCHAR_UTF8 "\xEF\xBF\xBD"
int
do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
- STRLEN dlen, STRLEN * dout, int approx)
+ STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
{
const U8 *s = src;
const U8 *send = s + *slen;
const U8 *last = s;
U8 *d = dst;
- U8 *dend = d + dlen;
+ U8 *dend = d + dlen, *dlast = d;
int code = 0;
while (s < send) {
encpage_t *e = enc;
if (approx && (e->slen & 0x80))
code = ENCODE_FALLBACK;
last = s;
+ if (term && d-dlast == tlen && memEQ(dlast, term, tlen)) {
+ code = ENCODE_FOUND_TERM;
+ break;
+ }
+ dlast = d;
}
}
else {
=back
+=item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check])
+
+MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>.
+Decoding will terminate when $terminator (a string) appears in output.
+I<$offset> will be modified to the last $octets position at end of decode.
+Returns true if $terminator appears output, else returns false.
+
=head2 Other methods defined in Encode::Encodings
You do not have to override methods shown below unless you have to.
return $octet;
}
+#
+# cat_decode
+#
+my $re_scan_jis_g = qr{
+ \G ( ($RE{JIS_0212}) | $RE{JIS_0208} |
+ ($RE{ISO_ASC}) | ($RE{JIS_KANA}) | )
+ ([^\e]*)
+}x;
+sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk)
+ my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
+ my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
+ local ${^ENCODING};
+ use bytes;
+ my $opos = pos($$rsrc);
+ pos($$rsrc) = $pos;
+ while ($$rsrc =~ /$re_scan_jis_g/gc) {
+ my ($esc, $esc_0212, $esc_asc, $esc_kana, $chunk) =
+ ($1, $2, $3, $4, $5);
+
+ unless ($chunk) { $esc or last; next; }
+
+ if ($esc && !$esc_asc) {
+ $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
+ if ($esc_kana) {
+ $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
+ } elsif ($esc_0212) {
+ $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
+ }
+ $chunk = Encode::decode('euc-jp', $chunk, 0);
+ }
+ elsif ((my $npos = index($chunk, $trm)) >= 0) {
+ $$rdst .= substr($chunk, 0, $npos + length($trm));
+ $$rpos += length($esc) + $npos + length($trm);
+ pos($$rsrc) = $opos;
+ return 1;
+ }
+ $$rdst .= $chunk;
+ $$rpos = pos($$rsrc);
+ }
+ $$rpos = pos($$rsrc);
+ pos($$rsrc) = $opos;
+ return '';
+}
# JIS<->EUC
-our $re_scan_jis = qr{
+my $re_scan_jis = qr{
(?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*)
}x;
Perl_sv_pvutf8n_force
Perl_sv_pvbyten_force
Perl_sv_recode_to_utf8
+Perl_sv_cat_decode
Perl_sv_reftype
Perl_sv_replace
Perl_sv_report_used
#
#
-plan tests => 98;
+plan tests => 99;
{
# bug id 20001009.001
is("@i", "60 62 58 50 52 48 70 72 68", "utf8 heredoc index and rindex");
}
+{
+ use utf8;
+ eval qq{is(q \xc3\xbc test \xc3\xbc, qq\xc2\xb7 test \xc2\xb7,
+ "utf8 quote delimiters [perl #16823]");};
+}
=for hackers
Found in file utf8.c
+=item sv_cat_decode
+
+The encoding is assumed to be an Encode object, the PV of the ssv is
+assumed to be octets in that encoding and decoding the input starts
+from the position which (PV + *offset) pointed to. The dsv will be
+concatenated the decoded UTF-8 string from ssv. Decoding will terminate
+when the string tstr appears in decoding output or the input ends on
+the PV of the ssv. The value which the offset points will be modified
+to the last input position on the ssv.
+
+Returns TRUE if the terminator was found, else returns FALSE.
+
+ bool sv_cat_decode(SV* dsv, SV *encoding, SV *ssv, int *offset, char* tstr, int tlen)
+
+=for hackers
+Found in file sv.c
+
=item sv_recode_to_utf8
The encoding is assumed to be an Encode object, on entry the PV
char *
Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding)
{
- if (SvPOK(sv) && !DO_UTF8(sv) && SvROK(encoding)) {
- int vary = FALSE;
+ if (SvPOK(sv) && !SvUTF8(sv) && !IN_BYTES && SvROK(encoding)) {
SV *uni;
STRLEN len;
char *s;
dSP;
ENTER;
SAVETMPS;
+ save_re_context();
PUSHMARK(sp);
EXTEND(SP, 3);
XPUSHs(encoding);
uni = POPs;
PUTBACK;
s = SvPV(uni, len);
- {
- U8 *t = (U8 *)s, *e = (U8 *)s + len;
- while (t < e) {
- if ((vary = !UTF8_IS_INVARIANT(*t++)))
- break;
- }
- }
if (s != SvPVX(sv)) {
SvGROW(sv, len + 1);
Move(s, SvPVX(sv), len, char);
}
FREETMPS;
LEAVE;
- if (vary)
- SvUTF8_on(sv);
SvUTF8_on(sv);
}
return SvPVX(sv);
}
+/*
+=for apidoc sv_cat_decode
+
+The encoding is assumed to be an Encode object, the PV of the ssv is
+assumed to be octets in that encoding and decoding the input starts
+from the position which (PV + *offset) pointed to. The dsv will be
+concatenated the decoded UTF-8 string from ssv. Decoding will terminate
+when the string tstr appears in decoding output or the input ends on
+the PV of the ssv. The value which the offset points will be modified
+to the last input position on the ssv.
+Returns TRUE if the terminator was found, else returns FALSE.
+
+=cut */
+
+bool
+Perl_sv_cat_decode(pTHX_ SV *dsv, SV *encoding,
+ SV *ssv, int *offset, char *tstr, int tlen)
+{
+ if (SvPOK(ssv) && SvPOK(dsv) && SvROK(encoding) && offset) {
+ bool ret = FALSE;
+ SV *offsv;
+ dSP;
+ ENTER;
+ SAVETMPS;
+ save_re_context();
+ PUSHMARK(sp);
+ EXTEND(SP, 6);
+ XPUSHs(encoding);
+ XPUSHs(dsv);
+ XPUSHs(ssv);
+ XPUSHs(offsv = sv_2mortal(newSViv(*offset)));
+ XPUSHs(sv_2mortal(newSVpvn(tstr, tlen)));
+ PUTBACK;
+ call_method("cat_decode", G_SCALAR);
+ SPAGAIN;
+ ret = SvTRUE(TOPs);
+ *offset = SvIV(offsv);
+ PUTBACK;
+ FREETMPS;
+ LEAVE;
+ return ret;
+ }
+ Perl_croak(aTHX_ "Invalid argument to sv_cat_decode.");
+}
$str = $katakana; $str =~ tr/\e$B%!\e(B-\e$B%s\e(B/\e$B$!\e(B-\e$B$s\e(B/;
is($str, $hiragana, "tr// # hiragana -> katakana");
-$str = $hiragana; eval qq{\$str =~ tr/\e$B$!\e(B-\e$B$s\e(B/\e$B%!\e(B-\e$B%s\e(B/};
-is($str, $katakana, "eval qq{tr//} # hiragana -> katakana");
-$str = $katakana; eval qq{\$str =~ tr/\e$B%!\e(B-\e$B%s\e(B/\e$B$!\e(B-\e$B$s\e(B/};
-is($str, $hiragana, "eval qq{tr//} # hiragana -> katakana");
+$str = $hiragana; eval qq(\$str =~ tr/\e$B$!\e(B-\e$B$s\e(B/\e$B%!\e(B-\e$B%s\e(B/);
+is($str, $katakana, "eval qq(tr//) # hiragana -> katakana");
+$str = $katakana; eval qq(\$str =~ tr/\e$B%!\e(B-\e$B%s\e(B/\e$B$!\e(B-\e$B$s\e(B/);
+is($str, $hiragana, "eval qq(tr//) # hiragana -> katakana");
$str = $hiragana; $str =~ s/([\e$B$!\e(B-\e$B$s\e(B])/$h2k{$1}/go;
is($str, $katakana, "s/// # hiragana -> katakana");
register char *to; /* current position in the sv's data */
I32 brackets = 1; /* bracket nesting level */
bool has_utf8 = FALSE; /* is there any utf8 content? */
+ I32 termcode; /* terminating char. code */
+ U8 termstr[UTF8_MAXLEN]; /* terminating string */
+ STRLEN termlen; /* length of terminating string */
+ char *last = NULL; /* last position for nesting bracket */
/* skip space before the delimiter */
if (isSPACE(*s))
/* after skipping whitespace, the next character is the terminator */
term = *s;
- if (!UTF8_IS_INVARIANT((U8)term) && UTF)
- has_utf8 = TRUE;
+ if (!UTF) {
+ termcode = termstr[0] = term;
+ termlen = 1;
+ }
+ else {
+ termcode = utf8_to_uvchr(s, &termlen);
+ Copy(s, termstr, termlen, U8);
+ if (!UTF8_IS_INVARIANT(term))
+ has_utf8 = TRUE;
+ }
/* mark where we are */
PL_multi_start = CopLINE(PL_curcop);
/* find corresponding closing delimiter */
if (term && (tmps = strchr("([{< )]}> )]}>",term)))
- term = tmps[5];
+ termcode = termstr[0] = term = tmps[5];
+
PL_multi_close = term;
/* create a new SV to hold the contents. 87 is leak category, I'm
assuming. 79 is the SV's initial length. What a random number. */
sv = NEWSV(87,79);
sv_upgrade(sv, SVt_PVIV);
- SvIVX(sv) = term;
+ SvIVX(sv) = termcode;
(void)SvPOK_only(sv); /* validate pointer */
/* move past delimiter and try to read a complete string */
if (keep_delims)
- sv_catpvn(sv, s, 1);
- s++;
+ sv_catpvn(sv, s, termlen);
+ s += termlen;
for (;;) {
+ if (PL_encoding && !UTF) {
+ bool cont = TRUE;
+
+ while (cont) {
+ int offset = s - SvPVX(PL_linestr);
+ bool found = sv_cat_decode(sv, PL_encoding, PL_linestr,
+ &offset, termstr, termlen);
+ char *ns = SvPVX(PL_linestr) + offset;
+ char *svlast = SvEND(sv) - 1;
+
+ for (; s < ns; s++) {
+ if (*s == '\n' && !PL_rsfp)
+ CopLINE_inc(PL_curcop);
+ }
+ if (!found)
+ goto read_more_line;
+ else {
+ /* handle quoted delimiters */
+ if (*(svlast-1) == '\\') {
+ char *t;
+ for (t = svlast-2; t >= SvPVX(sv) && *t == '\\';)
+ t--;
+ if ((svlast-1 - t) % 2) {
+ if (!keep_quoted) {
+ *(svlast-1) = term;
+ *svlast = '\0';
+ SvCUR_set(sv, SvCUR(sv) - 1);
+ }
+ continue;
+ }
+ }
+ if (PL_multi_open == PL_multi_close) {
+ cont = FALSE;
+ }
+ else {
+ char *t, *w;
+ if (!last)
+ last = SvPVX(sv);
+ for (w = t = last; t < svlast; w++, t++) {
+ /* At here, all closes are "was quoted" one,
+ so we don't check PL_multi_close. */
+ if (*t == '\\') {
+ if (!keep_quoted && *(t+1) == PL_multi_open)
+ t++;
+ else
+ *w++ = *t++;
+ }
+ else if (*t == PL_multi_open)
+ brackets++;
+
+ *w = *t;
+ }
+ if (w < t) {
+ *w++ = term;
+ *w = '\0';
+ SvCUR_set(sv, w - SvPVX(sv));
+ }
+ last = w;
+ if (--brackets <= 0)
+ cont = FALSE;
+ }
+ }
+ }
+ if (!keep_delims) {
+ SvCUR_set(sv, SvCUR(sv) - 1);
+ *SvEND(sv) = '\0';
+ }
+ break;
+ }
+
/* extend sv if need be */
SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1);
/* set 'to' to the next character in the sv's string */
}
/* terminate when run out of buffer (the for() condition), or
have found the terminator */
- else if (*s == term)
- break;
+ else if (*s == term) {
+ if (termlen == 1)
+ break;
+ if (s+termlen <= PL_bufend && memEQ(s, termstr, termlen))
+ break;
+ }
else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
has_utf8 = TRUE;
*to = *s;
to[-1] = '\n';
#endif
+ read_more_line:
/* if we're out of file, or a read fails, bail and reset the current
line marker so we can report where the unterminated string began
*/
/* at this point, we have successfully read the delimited string */
- if (keep_delims)
- sv_catpvn(sv, s, 1);
- if (has_utf8)
+ if (!PL_encoding || UTF) {
+ if (keep_delims)
+ sv_catpvn(sv, s, termlen);
+ s += termlen;
+ }
+ if (has_utf8 || PL_encoding)
SvUTF8_on(sv);
- else if (PL_encoding)
- sv_recode_to_utf8(sv, PL_encoding);
PL_multi_end = CopLINE(PL_curcop);
- s++;
/* if we allocated too much space, give some back */
if (SvCUR(sv) + 5 < SvLEN(sv)) {