Upgrade to Archive-Tar-1.30. Since change #27571 is not included,
[p5sagit/p5-mst-13.2.git] / toke.c
diff --git a/toke.c b/toke.c
index 5976732..b3688bb 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -499,7 +499,7 @@ S_feature_is_enabled(pTHX_ const char *name, STRLEN namelen)
     dVAR;
     HV * const hinthv = GvHV(PL_hintgv);
     char he_name[32] = "feature_";
-    (void) strncpy(&he_name[8], name, 24);
+    (void) my_strlcpy(&he_name[8], name, 24);
 
     return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
 }
@@ -1699,12 +1699,12 @@ S_sublex_done(pTHX)
   Extracts a pattern, double-quoted string, or transliteration.  This
   is terrifying code.
 
-  It looks at lex_inwhat and PL_lex_inpat to find out whether it's
+  It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
   processing a pattern (PL_lex_inpat is true), a transliteration
-  (lex_inwhat & OP_TRANS is true), or a double-quoted string.
+  (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
 
-  Returns a pointer to the character scanned up to. Iff this is
-  advanced from the start pointer supplied (ie if anything was
+  Returns a pointer to the character scanned up to. If this is
+  advanced from the start pointer supplied (i.e. if anything was
   successfully parsed), will leave an OP for the substring scanned
   in yylval. Caller must intuit reason for not parsing further
   by looking at the next characters herself.
@@ -1713,21 +1713,23 @@ S_sublex_done(pTHX)
     backslashes:
       double-quoted style: \r and \n
       regexp special ones: \D \s
-      constants: \x3
-      backrefs: \1 (deprecated in substitution replacements)
+      constants: \x31
+      backrefs: \1
       case and quoting: \U \Q \E
     stops on @ and $, but not for $ as tail anchor
 
   In transliterations:
     characters are VERY literal, except for - not at the start or end
-    of the string, which indicates a range.  scan_const expands the
-    range to the full set of intermediate characters.
+    of the string, which indicates a range. If the range is in bytes,
+    scan_const expands the range to the full set of intermediate
+    characters. If the range is in utf8, the hyphen is replaced with
+    a certain range mark which will be handled by pmtrans() in op.c.
 
   In double-quoted strings:
     backslashes:
       double-quoted style: \r and \n
-      constants: \x3
-      backrefs: \1 (deprecated)
+      constants: \x31
+      deprecated backrefs: \1 (in substitution replacements)
       case and quoting: \U \Q \E
     stops on @ and $
 
@@ -1735,31 +1737,35 @@ S_sublex_done(pTHX)
   It stops processing as soon as it finds an embedded $ or @ variable
   and leaves it to the caller to work out what's going on.
 
-  @ in pattern could be: @foo, @{foo}, @$foo, @'foo, @::foo.
+  embedded arrays (whether in pattern or not) could be:
+      @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
+
+  $ in double-quoted strings must be the symbol of an embedded scalar.
 
   $ in pattern could be $foo or could be tail anchor.  Assumption:
   it's a tail anchor if $ is the last thing in the string, or if it's
-  followed by one of ")| \n\t"
+  followed by one of "()| \r\n\t"
 
   \1 (backreferences) are turned into $1
 
   The structure of the code is
       while (there's a character to process) {
-          handle transliteration ranges
-         skip regexp comments
-         skip # initiated comments in //x patterns
-         check for embedded @foo
+         handle transliteration ranges
+         skip regexp comments /(?#comment)/ and codes /(?{code})/
+         skip #-initiated comments in //x patterns
+         check for embedded arrays
          check for embedded scalars
          if (backslash) {
-             leave intact backslashes from leave (below)
-             deprecate \1 in strings and sub replacements
+             leave intact backslashes from leaveit (below)
+             deprecate \1 in substitution replacements
              handle string-changing backslashes \l \U \Q \E, etc.
              switch (what was escaped) {
-                 handle - in a transliteration (becomes a literal -)
-                 handle \132 octal characters
-                 handle 0x15 hex characters
-                 handle \cV (control V)
-                 handle printf backslashes (\f, \r, \n, etc)
+                 handle \- in a transliteration (becomes a literal -)
+                 handle \132 (octal characters)
+                 handle \x15 and \x{1234} (hex characters)
+                 handle \N{name} (named characters)
+                 handle \cV (control characters)
+                 handle printf-style backslashes (\f, \r, \n, etc)
              } (end switch)
          } (end if backslash)
     } (end while character to read)
@@ -1781,11 +1787,12 @@ S_scan_const(pTHX_ char *start)
     UV uv;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
+    bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
 #endif
 
     const char * const leaveit = /* set of acceptably-backslashed characters */
        PL_lex_inpat
-           ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxz0123456789[{]} \t\n\r\f\v#"
+           ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
            : "";
 
     if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
@@ -1804,7 +1811,15 @@ S_scan_const(pTHX_ char *start)
                I32 min;                        /* first character in range */
                I32 max;                        /* last character in range */
 
-               if (has_utf8) {
+#ifdef EBCDIC
+               UV uvmax = 0;
+#endif
+
+               if (has_utf8
+#ifdef EBCDIC
+                   && !native_range
+#endif
+                   ) {
                    char * const c = (char*)utf8_hop((U8*)d, -1);
                    char *e = d++;
                    while (e-- > c)
@@ -1817,12 +1832,43 @@ S_scan_const(pTHX_ char *start)
                }
 
                i = d - SvPVX_const(sv);                /* remember current offset */
+#ifdef EBCDIC
+                SvGROW(sv,
+                      SvLEN(sv) + (has_utf8 ?
+                                   (512 - UTF_CONTINUATION_MARK +
+                                    UNISKIP(0x100))
+                                   : 256));
+                /* How many two-byte within 0..255: 128 in UTF-8,
+                * 96 in UTF-8-mod. */
+#else
                SvGROW(sv, SvLEN(sv) + 256);    /* never more than 256 chars in a range */
+#endif
                d = SvPVX(sv) + i;              /* refresh d after realloc */
-               d -= 2;                         /* eat the first char and the - */
-
-               min = (U8)*d;                   /* first char in range */
-               max = (U8)d[1];                 /* last char in range  */
+#ifdef EBCDIC
+                if (has_utf8) {
+                    int j;
+                    for (j = 0; j <= 1; j++) {
+                        char * const c = (char*)utf8_hop((U8*)d, -1);
+                        const UV uv    = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
+                        if (j)
+                            min = (U8)uv;
+                        else if (uv < 256)
+                            max = (U8)uv;
+                        else {
+                            max = (U8)0xff; /* only to \xff */
+                            uvmax = uv; /* \x{100} to uvmax */
+                        }
+                        d = c; /* eat endpoint chars */
+                     }
+                }
+               else {
+#endif
+                  d -= 2;              /* eat the first char and the - */
+                  min = (U8)*d;        /* first char in range */
+                  max = (U8)d[1];      /* last char in range  */
+#ifdef EBCDIC
+              }
+#endif
 
                 if (min > max) {
                    Perl_croak(aTHX_
@@ -1847,7 +1893,29 @@ S_scan_const(pTHX_ char *start)
                else
 #endif
                    for (i = min; i <= max; i++)
-                       *d++ = (char)i;
+#ifdef EBCDIC
+                        if (has_utf8) {
+                            const U8 ch = (U8)NATIVE_TO_UTF(i);
+                            if (UNI_IS_INVARIANT(ch))
+                                *d++ = (U8)i;
+                            else {
+                                *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
+                                *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
+                            }
+                        }
+                        else
+#endif
+                            *d++ = (char)i;
+#ifdef EBCDIC
+                if (uvmax) {
+                    d = (char*)uvchr_to_utf8((U8*)d, 0x100);
+                    if (uvmax > 0x101)
+                        *d++ = (char)UTF_TO_NATIVE(0xff);
+                    if (uvmax > 0x100)
+                        d = (char*)uvchr_to_utf8((U8*)d, uvmax);
+                }
+#endif
 
                /* mark the range as done, and continue */
                dorange = FALSE;
@@ -1863,7 +1931,11 @@ S_scan_const(pTHX_ char *start)
                if (didrange) {
                    Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
                }
-               if (has_utf8) {
+               if (has_utf8
+#ifdef EBCDIC
+                   && !native_range
+#endif
+                   ) {
                    *d++ = (char)UTF_TO_NATIVE(0xff);   /* use illegal utf8 byte--see pmtrans */
                    s++;
                    continue;
@@ -1875,6 +1947,7 @@ S_scan_const(pTHX_ char *start)
                didrange = FALSE;
 #ifdef EBCDIC
                literal_endpoint = 0;
+               native_range = TRUE;
 #endif
            }
        }
@@ -1921,9 +1994,14 @@ S_scan_const(pTHX_ char *start)
        /* check for embedded arrays
           (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
           */
-       else if (*s == '@' && s[1]
-                && (isALNUM_lazy_if(s+1,UTF) || strchr(":'{$+-", s[1])))
-           break;
+       else if (*s == '@' && s[1]) {
+           if (isALNUM_lazy_if(s+1,UTF))
+               break;
+           if (strchr(":'{$", s[1]))
+               break;
+           if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
+               break; /* in regexp, neither @+ nor @- are interpolated */
+       }
 
        /* check for embedded scalars.  only stop if we're sure it's a
           variable.
@@ -1979,8 +2057,8 @@ S_scan_const(pTHX_ char *start)
                    if ((isALPHA(*s) || isDIGIT(*s)) &&
                        ckWARN(WARN_MISC))
                        Perl_warner(aTHX_ packWARN(WARN_MISC),
-                              "Unrecognized escape \\%c passed through",
-                              *s);
+                                   "Unrecognized escape \\%c passed through",
+                                   *s);
                    /* default action is to copy the quoted character */
                    goto default_action;
                }
@@ -2078,6 +2156,10 @@ S_scan_const(pTHX_ char *start)
                                (PL_lex_repl ? OPpTRANS_FROM_UTF
                                             : OPpTRANS_TO_UTF);
                        }
+#ifdef EBCDIC
+                       if (uv > 255 && !dorange)
+                           native_range = FALSE;
+#endif
                     }
                    else {
                        *d++ = (char)uv;
@@ -2155,6 +2237,10 @@ S_scan_const(pTHX_ char *start)
                        SvGROW(sv, (SvLEN(sv) + len - (e - s + 4)));
                        d = SvPVX(sv) + (d - odest);
                    }
+#ifdef EBCDIC
+                   if (!dorange)
+                       native_range = FALSE; /* \N{} is guessed to be Unicode */
+#endif
                    Copy(str, d, len, char);
                    d += len;
                    SvREFCNT_dec(res);
@@ -2228,6 +2314,10 @@ S_scan_const(pTHX_ char *start)
            }
            d = (char*)uvchr_to_utf8((U8*)d, nextuv);
            has_utf8 = TRUE;
+#ifdef EBCDIC
+           if (uv > 255 && !dorange)
+               native_range = FALSE;
+#endif
        }
        else {
            *d++ = NATIVE_TO_NEED(has_utf8,*s++);
@@ -6697,7 +6787,7 @@ S_pending_ident(pTHX)
 {
     dVAR;
     register char *d;
-    register PADOFFSET tmp = 0;
+    PADOFFSET tmp = 0;
     /* pit holds the identifier we read and pending_ident is reset */
     char pit = PL_pending_ident;
     PL_pending_ident = 0;
@@ -11186,12 +11276,11 @@ S_scan_inputsymbol(pTHX_ char *start)
           filehandle
        */
        if (*d == '$') {
-           PADOFFSET tmp;
-
            /* try to find it in the pad for this block, otherwise find
               add symbol table ops
            */
-           if ((tmp = pad_findmy(d)) != NOT_IN_PAD) {
+           const PADOFFSET tmp = pad_findmy(d);
+           if (tmp != NOT_IN_PAD) {
                if (PAD_COMPNAME_FLAGS_isOUR(tmp)) {
                    HV * const stash = PAD_COMPNAME_OURSTASH(tmp);
                    HEK * const stashname = HvNAME_HEK(stash);
@@ -12408,6 +12497,15 @@ S_swallow_bom(pTHX_ U8 *s)
                  goto utf16be;
             }
        }
+#ifdef EBCDIC
+    case 0xDD:
+        if (slen > 3 && s[1] == 0x73 && s[2] == 0x66 && s[3] == 0x73) {
+            if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n");
+            s += 4;                      /* UTF-8 */
+        }
+        break;
+#endif
+
     default:
         if (slen > 3 && s[1] == 0 && s[2] != 0 && s[3] == 0) {
                  /* Leading bytes