From: Inaba Hiroto Date: Sat, 1 Feb 2003 21:58:20 +0000 (+0900) Subject: [Patch] parsing under encoding (Re: [Encode] HEADS-UP; $Encode::VERSION++ to enhance... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=220e2d4e806a3c9c0a9f9b31667e8be830cbc55b;p=p5sagit%2Fp5-mst-13.2.git [Patch] parsing under encoding (Re: [Encode] HEADS-UP; $Encode::VERSION++ to enhance filter option)([perl #16823]) Message-ID: <3E3BC46B.6C687CFD@st.rim.or.jp> p4raw-id: //depot/perl@18660 --- diff --git a/embed.fnc b/embed.fnc index a2009db..8951a36 100644 --- a/embed.fnc +++ b/embed.fnc @@ -750,6 +750,8 @@ Amd |char* |sv_pvn_force |SV* sv|STRLEN* lp Apd |char* |sv_pvutf8n_force|SV* sv|STRLEN* lp Apd |char* |sv_pvbyten_force|SV* sv|STRLEN* lp Apd |char* |sv_recode_to_utf8 |SV* sv|SV *encoding +Apd |bool |sv_cat_decode |SV* dsv|SV *encoding|SV *ssv|int *offset \ + |char* tstr|int tlen Apd |char* |sv_reftype |SV* sv|int ob Apd |void |sv_replace |SV* sv|SV* nsv Apd |void |sv_report_used diff --git a/embed.h b/embed.h index c3de3cf..887eee1 100644 --- a/embed.h +++ b/embed.h @@ -1021,6 +1021,7 @@ #define sv_pvutf8n_force Perl_sv_pvutf8n_force #define sv_pvbyten_force Perl_sv_pvbyten_force #define sv_recode_to_utf8 Perl_sv_recode_to_utf8 +#define sv_cat_decode Perl_sv_cat_decode #define sv_reftype Perl_sv_reftype #define sv_replace Perl_sv_replace #define sv_report_used Perl_sv_report_used @@ -3470,6 +3471,7 @@ #define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b) #define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b) #define sv_recode_to_utf8(a,b) Perl_sv_recode_to_utf8(aTHX_ a,b) +#define sv_cat_decode(a,b,c,d,e,f) Perl_sv_cat_decode(aTHX_ a,b,c,d,e,f) #define sv_reftype(a,b) Perl_sv_reftype(aTHX_ a,b) #define sv_replace(a,b) Perl_sv_replace(aTHX_ a,b) #define sv_report_used() Perl_sv_report_used(aTHX) diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index c85cbbe..548c5ab 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ 
-271,6 +271,19 @@ sub predefine_encodings{ return $octets; }; } + *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk) + my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk + my ($rdst, $rsrc, $rpos) = \@_[1,2,3]; + use bytes; + if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) { + $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm)); + $$rpos = $npos + length($trm); + return 1; + } + $$rdst .= substr($$rsrc, $pos); + $$rpos = length($$rsrc); + return ''; + }; $Encode::Encoding{utf8} = bless {Name => "utf8"} => "Encode::utf8"; } diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 0461690..c4cb98e 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -59,7 +59,7 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) static SV * encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, - int check) + int check, STRLEN * offset, SV * term, int * retcode) { STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); @@ -72,20 +72,30 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, SV *dst = sv_2mortal(newSV(slen+1)); U8 *d = (U8 *)SvPVX(dst); STRLEN dlen = SvLEN(dst)-1; - int code; + int code = 0; + STRLEN trmlen = 0; + U8 *trm = term ? 
SvPV(term, trmlen) : NULL; + + if (offset) { + s += *offset; + slen -= *offset; + tlen = slen; + } - if (!slen){ + if (slen <= 0){ SvCUR_set(dst, 0); SvPOK_only(dst); goto ENCODE_END; } - while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) ) + while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, + trm, trmlen)) ) { SvCUR_set(dst, dlen+ddone); SvPOK_only(dst); - if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){ + if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || + code == ENCODE_FOUND_TERM) { break; } switch (code) { @@ -233,8 +243,12 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } #endif + if (offset) + *offset += sdone + slen; + ENCODE_END: *SvEND(dst) = '\0'; + if (retcode) *retcode = code; return dst; } @@ -381,6 +395,33 @@ CODE: } void +Method_cat_decode(obj, dst, src, off, term, check = 0) +SV * obj +SV * dst +SV * src +SV * off +SV * term +int check +CODE: +{ + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + STRLEN offset = (STRLEN)SvIV(off); + int code = 0; + if (SvUTF8(src)) { + sv_utf8_downgrade(src, FALSE); + } + sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, + &offset, term, &code)); + SvIVX(off) = (IV)offset; + if (code == ENCODE_FOUND_TERM) { + ST(0) = &PL_sv_yes; + }else{ + ST(0) = &PL_sv_no; + } + XSRETURN(1); +} + +void Method_decode(obj,src,check = 0) SV * obj SV * src @@ -391,7 +432,8 @@ CODE: if (SvUTF8(src)) { sv_utf8_downgrade(src, FALSE); } - ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); + ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, + NULL, Nullsv, NULL); SvUTF8_on(ST(0)); XSRETURN(1); } @@ -405,7 +447,8 @@ CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); sv_utf8_upgrade(src); - ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); + ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, + NULL, Nullsv, NULL); XSRETURN(1); } diff --git a/ext/Encode/Encode/encode.h b/ext/Encode/Encode/encode.h index b860578..fc8301a 
100644 --- a/ext/Encode/Encode/encode.h +++ b/ext/Encode/Encode/encode.h @@ -76,7 +76,8 @@ struct encode_s /* See comment at top of file for deviousness */ extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, - U8 *dst, STRLEN dlen, STRLEN *dout, int approx); + U8 *dst, STRLEN dlen, STRLEN *dout, int approx, + const U8 *term, STRLEN tlen); extern void Encode_DefineEncoding(encode_t *enc); @@ -86,6 +87,7 @@ extern void Encode_DefineEncoding(encode_t *enc); #define ENCODE_PARTIAL 2 #define ENCODE_NOREP 3 #define ENCODE_FALLBACK 4 +#define ENCODE_FOUND_TERM 5 #define FBCHAR_UTF8 "\xEF\xBF\xBD" diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c index 4c2a7cf..6a08cfd 100644 --- a/ext/Encode/encengine.c +++ b/ext/Encode/encengine.c @@ -93,13 +93,13 @@ we add a flag to re-add the removed byte to the source we could handle int do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, - STRLEN dlen, STRLEN * dout, int approx) + STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen) { const U8 *s = src; const U8 *send = s + *slen; const U8 *last = s; U8 *d = dst; - U8 *dend = d + dlen; + U8 *dend = d + dlen, *dlast = d; int code = 0; while (s < send) { encpage_t *e = enc; @@ -133,6 +133,11 @@ do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, if (approx && (e->slen & 0x80)) code = ENCODE_FALLBACK; last = s; + if (term && d-dlast == tlen && memEQ(dlast, term, tlen)) { + code = ENCODE_FOUND_TERM; + break; + } + dlast = d; } } else { diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm index 1876cb7..4e842b6 100644 --- a/ext/Encode/lib/Encode/Encoding.pm +++ b/ext/Encode/lib/Encode/Encoding.pm @@ -130,6 +130,13 @@ replacement character. =back +=item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check]) + +MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>. +Decoding will terminate when $terminator (a string) appears in output.
+I<$offset> will be modified to the last $octets position at end of decode. +Returns true if $terminator appears in the output, else returns false. + =head2 Other methods defined in Encode::Encodings You do not have to override methods shown below unless you have to. diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index d49ec6c..52e5e5c 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -60,9 +60,52 @@ sub encode($$;$) return $octet; } +# +# cat_decode +# +my $re_scan_jis_g = qr{ + \G ( ($RE{JIS_0212}) | $RE{JIS_0208} | + ($RE{ISO_ASC}) | ($RE{JIS_KANA}) | ) + ([^\e]*) +}x; +sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk) + my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk + my ($rdst, $rsrc, $rpos) = \@_[1,2,3]; + local ${^ENCODING}; + use bytes; + my $opos = pos($$rsrc); + pos($$rsrc) = $pos; + while ($$rsrc =~ /$re_scan_jis_g/gc) { + my ($esc, $esc_0212, $esc_asc, $esc_kana, $chunk) = + ($1, $2, $3, $4, $5); + + unless ($chunk) { $esc or last; next; } + + if ($esc && !$esc_asc) { + $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; + if ($esc_kana) { + $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; + } elsif ($esc_0212) { + $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; + } + $chunk = Encode::decode('euc-jp', $chunk, 0); + } + elsif ((my $npos = index($chunk, $trm)) >= 0) { + $$rdst .= substr($chunk, 0, $npos + length($trm)); + $$rpos += length($esc) + $npos + length($trm); + pos($$rsrc) = $opos; + return 1; + } + $$rdst .= $chunk; + $$rpos = pos($$rsrc); + } + $$rpos = pos($$rsrc); + pos($$rsrc) = $opos; + return ''; +} # JIS<->EUC -our $re_scan_jis = qr{ +my $re_scan_jis = qr{ (?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*) }x; diff --git a/global.sym b/global.sym index 9e3ddcd..3a8b5b9 100644 --- a/global.sym +++ b/global.sym @@ -469,6 +469,7 @@ Perl_sv_pos_b2u Perl_sv_pvutf8n_force Perl_sv_pvbyten_force Perl_sv_recode_to_utf8 +Perl_sv_cat_decode Perl_sv_reftype
Perl_sv_replace Perl_sv_report_used diff --git a/lib/utf8.t b/lib/utf8.t index 8072c87..6728238 100644 --- a/lib/utf8.t +++ b/lib/utf8.t @@ -37,7 +37,7 @@ no utf8; # Ironic, no? # # -plan tests => 98; +plan tests => 99; { # bug id 20001009.001 @@ -323,3 +323,8 @@ END is("@i", "60 62 58 50 52 48 70 72 68", "utf8 heredoc index and rindex"); } +{ + use utf8; + eval qq{is(q \xc3\xbc test \xc3\xbc, qq\xc2\xb7 test \xc2\xb7, + "utf8 quote delimiters [perl #16823]");}; +} diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 695a44c..59b80c3 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -4788,6 +4788,23 @@ The pointer to the PV of the dsv is returned. =for hackers Found in file utf8.c +=item sv_cat_decode + +The encoding is assumed to be an Encode object, the PV of the ssv is +assumed to be octets in that encoding and decoding the input starts +from the position which (PV + *offset) pointed to. The dsv will be +concatenated the decoded UTF-8 string from ssv. Decoding will terminate +when the string tstr appears in decoding output or the input ends on +the PV of the ssv. The value which the offset points will be modified +to the last input position on the ssv. + +Returns TRUE if the terminator was found, else returns FALSE. + + bool sv_cat_decode(SV* dsv, SV *encoding, SV *ssv, int *offset, char* tstr, int tlen) + +=for hackers +Found in file sv.c + =item sv_recode_to_utf8 The encoding is assumed to be an Encode object, on entry the PV diff --git a/sv.c b/sv.c index 4c148d8..89792cf 100644 --- a/sv.c +++ b/sv.c @@ -11168,14 +11168,14 @@ The PV of the sv is returned. 
char * Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding) { - if (SvPOK(sv) && !DO_UTF8(sv) && SvROK(encoding)) { - int vary = FALSE; + if (SvPOK(sv) && !SvUTF8(sv) && !IN_BYTES && SvROK(encoding)) { SV *uni; STRLEN len; char *s; dSP; ENTER; SAVETMPS; + save_re_context(); PUSHMARK(sp); EXTEND(SP, 3); XPUSHs(encoding); @@ -11196,13 +11196,6 @@ Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding) uni = POPs; PUTBACK; s = SvPV(uni, len); - { - U8 *t = (U8 *)s, *e = (U8 *)s + len; - while (t < e) { - if ((vary = !UTF8_IS_INVARIANT(*t++))) - break; - } - } if (s != SvPVX(sv)) { SvGROW(sv, len + 1); Move(s, SvPVX(sv), len, char); @@ -11211,12 +11204,54 @@ Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding) } FREETMPS; LEAVE; - if (vary) - SvUTF8_on(sv); SvUTF8_on(sv); } return SvPVX(sv); } +/* +=for apidoc sv_cat_decode + +The encoding is assumed to be an Encode object, the PV of the ssv is +assumed to be octets in that encoding and decoding the input starts +from the position which (PV + *offset) pointed to. The dsv will be +concatenated the decoded UTF-8 string from ssv. Decoding will terminate +when the string tstr appears in decoding output or the input ends on +the PV of the ssv. The value which the offset points will be modified +to the last input position on the ssv. +Returns TRUE if the terminator was found, else returns FALSE. 
+ +=cut */ + +bool +Perl_sv_cat_decode(pTHX_ SV *dsv, SV *encoding, + SV *ssv, int *offset, char *tstr, int tlen) +{ + if (SvPOK(ssv) && SvPOK(dsv) && SvROK(encoding) && offset) { + bool ret = FALSE; + SV *offsv; + dSP; + ENTER; + SAVETMPS; + save_re_context(); + PUSHMARK(sp); + EXTEND(SP, 6); + XPUSHs(encoding); + XPUSHs(dsv); + XPUSHs(ssv); + XPUSHs(offsv = sv_2mortal(newSViv(*offset))); + XPUSHs(sv_2mortal(newSVpvn(tstr, tlen))); + PUTBACK; + call_method("cat_decode", G_SCALAR); + SPAGAIN; + ret = SvTRUE(TOPs); + *offset = SvIV(offsv); + PUTBACK; + FREETMPS; + LEAVE; + return ret; + } + Perl_croak(aTHX_ "Invalid argument to sv_cat_decode."); +} diff --git a/t/uni/tr_7jis.t b/t/uni/tr_7jis.t index 894ff4c..6e74f1d 100644 --- a/t/uni/tr_7jis.t +++ b/t/uni/tr_7jis.t @@ -53,10 +53,10 @@ is($str, $katakana, "tr// # hiragana -> katakana"); $str = $katakana; $str =~ tr/ァ-ン/ぁ-ん/; is($str, $hiragana, "tr// # hiragana -> katakana"); -$str = $hiragana; eval qq{\$str =~ tr/ぁ-ん/ァ-ン/}; -is($str, $katakana, "eval qq{tr//} # hiragana -> katakana"); -$str = $katakana; eval qq{\$str =~ tr/ァ-ン/ぁ-ん/}; -is($str, $hiragana, "eval qq{tr//} # hiragana -> katakana"); +$str = $hiragana; eval qq(\$str =~ tr/ぁ-ん/ァ-ン/); +is($str, $katakana, "eval qq(tr//) # hiragana -> katakana"); +$str = $katakana; eval qq(\$str =~ tr/ァ-ン/ぁ-ん/); +is($str, $hiragana, "eval qq(tr//) # hiragana -> katakana"); $str = $hiragana; $str =~ s/([ぁ-ん])/$h2k{$1}/go; is($str, $katakana, "s/// # hiragana -> katakana"); diff --git a/toke.c b/toke.c index e7834c4..6b27a37 100644 --- a/toke.c +++ b/toke.c @@ -6882,6 +6882,10 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) register char *to; /* current position in the sv's data */ I32 brackets = 1; /* bracket nesting level */ bool has_utf8 = FALSE; /* is there any utf8 content? */ + I32 termcode; /* terminating char. 
code */ + U8 termstr[UTF8_MAXLEN]; /* terminating string */ + STRLEN termlen; /* length of terminating string */ + char *last = NULL; /* last position for nesting bracket */ /* skip space before the delimiter */ if (isSPACE(*s)) @@ -6892,8 +6896,16 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if (!UTF8_IS_INVARIANT((U8)term) && UTF) - has_utf8 = TRUE; + if (!UTF) { + termcode = termstr[0] = term; + termlen = 1; + } + else { + termcode = utf8_to_uvchr(s, &termlen); + Copy(s, termstr, termlen, U8); + if (!UTF8_IS_INVARIANT(term)) + has_utf8 = TRUE; + } /* mark where we are */ PL_multi_start = CopLINE(PL_curcop); @@ -6901,21 +6913,92 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* find corresponding closing delimiter */ if (term && (tmps = strchr("([{< )]}> )]}>",term))) - term = tmps[5]; + termcode = termstr[0] = term = tmps[5]; + PL_multi_close = term; /* create a new SV to hold the contents. 87 is leak category, I'm assuming. 79 is the SV's initial length. What a random number. 
*/ sv = NEWSV(87,79); sv_upgrade(sv, SVt_PVIV); - SvIVX(sv) = term; + SvIVX(sv) = termcode; (void)SvPOK_only(sv); /* validate pointer */ /* move past delimiter and try to read a complete string */ if (keep_delims) - sv_catpvn(sv, s, 1); - s++; + sv_catpvn(sv, s, termlen); + s += termlen; for (;;) { + if (PL_encoding && !UTF) { + bool cont = TRUE; + + while (cont) { + int offset = s - SvPVX(PL_linestr); + bool found = sv_cat_decode(sv, PL_encoding, PL_linestr, + &offset, termstr, termlen); + char *ns = SvPVX(PL_linestr) + offset; + char *svlast = SvEND(sv) - 1; + + for (; s < ns; s++) { + if (*s == '\n' && !PL_rsfp) + CopLINE_inc(PL_curcop); + } + if (!found) + goto read_more_line; + else { + /* handle quoted delimiters */ + if (*(svlast-1) == '\\') { + char *t; + for (t = svlast-2; t >= SvPVX(sv) && *t == '\\';) + t--; + if ((svlast-1 - t) % 2) { + if (!keep_quoted) { + *(svlast-1) = term; + *svlast = '\0'; + SvCUR_set(sv, SvCUR(sv) - 1); + } + continue; + } + } + if (PL_multi_open == PL_multi_close) { + cont = FALSE; + } + else { + char *t, *w; + if (!last) + last = SvPVX(sv); + for (w = t = last; t < svlast; w++, t++) { + /* At here, all closes are "was quoted" one, + so we don't check PL_multi_close. 
*/ + if (*t == '\\') { + if (!keep_quoted && *(t+1) == PL_multi_open) + t++; + else + *w++ = *t++; + } + else if (*t == PL_multi_open) + brackets++; + + *w = *t; + } + if (w < t) { + *w++ = term; + *w = '\0'; + SvCUR_set(sv, w - SvPVX(sv)); + } + last = w; + if (--brackets <= 0) + cont = FALSE; + } + } + } + if (!keep_delims) { + SvCUR_set(sv, SvCUR(sv) - 1); + *SvEND(sv) = '\0'; + } + break; + } + /* extend sv if need be */ SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1); /* set 'to' to the next character in the sv's string */ @@ -6937,8 +7020,12 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) } /* terminate when run out of buffer (the for() condition), or have found the terminator */ - else if (*s == term) - break; + else if (*s == term) { + if (termlen == 1) + break; + if (s+termlen <= PL_bufend && memEQ(s, termstr, termlen)) + break; + } else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; @@ -7000,6 +7087,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) to[-1] = '\n'; #endif + read_more_line: /* if we're out of file, or a read fails, bail and reset the current line marker so we can report where the unterminated string began */ @@ -7030,15 +7118,15 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* at this point, we have successfully read the delimited string */ - if (keep_delims) - sv_catpvn(sv, s, 1); - if (has_utf8) + if (!PL_encoding || UTF) { + if (keep_delims) + sv_catpvn(sv, s, termlen); + s += termlen; + } + if (has_utf8 || PL_encoding) SvUTF8_on(sv); - else if (PL_encoding) - sv_recode_to_utf8(sv, PL_encoding); PL_multi_end = CopLINE(PL_curcop); - s++; /* if we allocated too much space, give some back */ if (SvCUR(sv) + 5 < SvLEN(sv)) {