Re: [PATCH] Re: format, PerlIO and utf8
Inaba Hiroto [Mon, 10 Feb 2003 00:04:51 +0000 (09:04 +0900)]
Message-ID: <3E466E12.E5349D84@st.rim.or.jp>

p4raw-id: //depot/perl@18785

pp_ctl.c
toke.c

index 1528168..e22297e 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -355,7 +355,8 @@ PP(pp_formline)
     bool gotsome = FALSE;
     STRLEN len;
     STRLEN fudge = SvCUR(tmpForm) * (IN_BYTES ? 1 : 3) + 1;
-    bool item_is_utf = FALSE;
+    bool item_is_utf8 = FALSE;
+    bool targ_is_utf8 = FALSE;
 
     if (!SvMAGICAL(tmpForm) || !SvCOMPILED(tmpForm)) {
        if (SvREADONLY(tmpForm)) {
@@ -366,8 +367,9 @@ PP(pp_formline)
        else
            doparseform(tmpForm);
     }
-
     SvPV_force(PL_formtarget, len);
+    if (DO_UTF8(PL_formtarget))
+       targ_is_utf8 = TRUE;
     t = SvGROW(PL_formtarget, len + fudge + 1);  /* XXX SvCUR bad */
     t += len;
     f = SvPV(tmpForm, len);
@@ -414,6 +416,26 @@ PP(pp_formline)
 
        case FF_LITERAL:
            arg = *fpc++;
+           if (targ_is_utf8 && !SvUTF8(tmpForm)) {
+               while (arg--) {
+                   if (!NATIVE_IS_INVARIANT(*f)) {
+                       U8 ch = NATIVE_TO_ASCII(*f++);
+                       *t++ = (U8)UTF8_EIGHT_BIT_HI(ch);
+                       *t++ = (U8)UTF8_EIGHT_BIT_LO(ch);
+                   }
+                   else 
+                       *t++ = *f++;
+               }
+               break;
+           }
+           if (!targ_is_utf8 && DO_UTF8(tmpForm)) {
+               SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget));
+               *t = '\0';
+               sv_utf8_upgrade(PL_formtarget);
+               SvGROW(PL_formtarget, SvCUR(PL_formtarget) + fudge + 1);
+               t = SvEND(PL_formtarget);
+               targ_is_utf8 = TRUE;
+           }
            while (arg--)
                *t++ = *f++;
            break;
@@ -458,13 +480,13 @@ PP(pp_formline)
                            break;
                        s++;
                    }
-                   item_is_utf = TRUE;
+                   item_is_utf8 = TRUE;
                    itemsize = s - item;
                    sv_pos_b2u(sv, &itemsize);
                    break;
                }
            }
-           item_is_utf = FALSE;
+           item_is_utf8 = FALSE;
            if (itemsize > fieldsize)
                itemsize = fieldsize;
            send = chophere = s + itemsize;
@@ -519,11 +541,11 @@ PP(pp_formline)
                        itemsize = chophere - item;
                        sv_pos_b2u(sv, &itemsize);
                    }
-                   item_is_utf = TRUE;
+                   item_is_utf8 = TRUE;
                    break;
                }
            }
-           item_is_utf = FALSE;
+           item_is_utf8 = FALSE;
            if (itemsize <= fieldsize) {
                send = chophere = s + itemsize;
                while (s < send) {
@@ -579,7 +601,15 @@ PP(pp_formline)
        case FF_ITEM:
            arg = itemsize;
            s = item;
-           if (item_is_utf) {
+           if (item_is_utf8) {
+               if (!targ_is_utf8) {
+                   SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget));
+                   *t = '\0';
+                   sv_utf8_upgrade(PL_formtarget);
+                   SvGROW(PL_formtarget, SvCUR(PL_formtarget) + fudge + 1);
+                   t = SvEND(PL_formtarget);
+                   targ_is_utf8 = TRUE;
+               }
                while (arg--) {
                    if (UTF8_IS_CONTINUED(*s)) {
                        STRLEN skip = UTF8SKIP(s);
@@ -628,22 +658,32 @@ PP(pp_formline)
        case FF_LINEGLOB:
            item = s = SvPV(sv, len);
            itemsize = len;
-           item_is_utf = FALSE;                /* XXX is this correct? */
+           if ((item_is_utf8 = DO_UTF8(sv)))
+               itemsize = sv_len_utf8(sv);         
            if (itemsize) {
+               bool chopped = FALSE;
                gotsome = TRUE;
-               send = s + itemsize;
+               send = s + len;
                while (s < send) {
                    if (*s++ == '\n') {
-                       if (s == send)
+                       if (s == send) {
                            itemsize--;
+                           chopped = TRUE;
+                       }
                        else
                            lines++;
                    }
                }
                SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget));
-               sv_catpvn(PL_formtarget, item, itemsize);
+               if (targ_is_utf8)
+                   SvUTF8_on(PL_formtarget);
+               sv_catsv(PL_formtarget, sv);
+               if (chopped)
+                   SvCUR_set(PL_formtarget, SvCUR(PL_formtarget) - 1);
                SvGROW(PL_formtarget, SvCUR(PL_formtarget) + fudge + 1);
                t = SvPVX(PL_formtarget) + SvCUR(PL_formtarget);
+               if (item_is_utf8)
+                   targ_is_utf8 = TRUE;
            }
            break;
 
@@ -739,6 +779,8 @@ PP(pp_formline)
                        if (strnEQ(linemark, linemark - arg, arg))
                            DIE(aTHX_ "Runaway format");
                    }
+                   if (targ_is_utf8)
+                       SvUTF8_on(PL_formtarget);
                    FmLINES(PL_formtarget) = lines;
                    SP = ORIGMARK;
                    RETURNOP(cLISTOP->op_first);
@@ -778,6 +820,8 @@ PP(pp_formline)
        case FF_END:
            *t = '\0';
            SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget));
+           if (targ_is_utf8)
+               SvUTF8_on(PL_formtarget);
            FmLINES(PL_formtarget) += lines;
            SP = ORIGMARK;
            RETPUSHYES;
diff --git a/toke.c b/toke.c
index 3e611e9..bc84fda 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -7617,6 +7617,12 @@ S_scan_formline(pTHX_ register char *s)
        }
        else
            PL_lex_state = LEX_FORMLINE;
+       if (!IN_BYTES) {
+           if (UTF && is_utf8_string((U8*)SvPVX(stuff), SvCUR(stuff)))
+               SvUTF8_on(stuff);
+           else if (PL_encoding)
+               sv_recode_to_utf8(stuff, PL_encoding);
+       }
        PL_nextval[PL_nexttoke].opval = (OP*)newSVOP(OP_CONST, 0, stuff);
        force_next(THING);
        PL_nextval[PL_nexttoke].ival = OP_FORMLINE;