PATCH: [perl #56444] delayed interpolation of \N{...}

diff --git a/embed.fnc b/embed.fnc

index 769481b..9ccd663 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -165,6 +165,7 @@ npR |MEM_SIZE|malloc_good_size      |size_t nbytes
 
 AnpR   |void*  |get_context
 Anp    |void   |set_context    |NN void *t
+EpRnP  |I32    |regcurly       |NN const char *s
 
 END_EXTERN_C
 
@@ -1706,7 +1707,6 @@ Es        |regnode*|regbranch     |NN struct RExC_state_t *pRExC_state \
 Es     |STRLEN |reguni         |NN const struct RExC_state_t *pRExC_state \
                                |UV uv|NN char *s
 Es     |regnode*|regclass      |NN struct RExC_state_t *pRExC_state|U32 depth
-ERsn   |I32    |regcurly       |NN const char *s
 Es     |regnode*|reg_node      |NN struct RExC_state_t *pRExC_state|U8 op
 Es     |UV     |reg_recode     |const char value|NN SV **encp
 Es     |regnode*|regpiece      |NN struct RExC_state_t *pRExC_state \
diff --git a/pod/perl5120delta.pod b/pod/perl5120delta.pod

index aebeedf..47304ff 100644 (file)
--- a/pod/perl5120delta.pod
+++ b/pod/perl5120delta.pod
@@ -237,9 +237,10 @@ for some or all operations. (Yuval Kogman)
 
 A new regex escape has been added, C<\N>. It will match any character that
 is not a newline, independently from the presence or absence of the single
-line match modifier C</s>. (If C<\N> is followed by an opening brace and
+line match modifier C</s>.  It is not usable within a character class.
+(If C<\N> is followed by an opening brace and
 by a letter, perl will still assume that a Unicode character name is
-coming, so compatibility is preserved.) (Rafael Garcia-Suarez)
+coming, so compatibility is preserved.) (Rafael Garcia-Suarez).
 
 This will break a L<custom charnames translator|charnames/CUSTOM TRANSLATORS>
 which allows numbers for character names, as C<\N{3}> will now mean to match 3
@@ -2464,24 +2465,6 @@ take a block as their first argument, like
 
 =item *
 
-The C<charnames> pragma may generate a run-time error when a regex is
-interpolated [RT #56444]:
-
-    use charnames ':full';
-    my $r1 = qr/\N{THAI CHARACTER SARA I}/;
-    "foo" =~ $r1;    # okay
-    "foo" =~ /$r1+/; # runtime error
-
-A workaround is to generate the character outside of the regex:
-
-    my $a = "\N{THAI CHARACTER SARA I}";
-    my $r1 = qr/$a/;
-
-However, C<$r1> must be used within the scope of the C<use charnames> for this
-to work.
-
-=item *
-
 Some regexes may run much more slowly when run in a child thread compared
 with the thread the pattern was compiled into [RT #55600].
 
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index aeb5d27..486a515 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1912,10 +1912,10 @@ about 250 characters for simple names, and somewhat more for compound
 names (like C<$A::B>).  You've exceeded Perl's limits.  Future versions
 of Perl are likely to eliminate these arbitrary limitations.
 
-=item Ignoring %s in character class in regex; marked by <-- HERE in m/%s/
+=item Ignoring zero length \N{} in character class"
 
-(W) Named Unicode character escapes (\N{...}) may return multi-char
-or zero length sequences. When such an escape is used in a character class
+(W) Named Unicode character escapes (\N{...}) may return a
+zero length sequence.  When such an escape is used in a character class
 its behaviour is not well defined. Check that the correct escape has
 been used, and the correct charname handler is in scope.
 
@@ -2395,6 +2395,10 @@ See also L<Encode/"Handling Malformed Data">.
 (F) Perl thought it was reading UTF-16 encoded character data but while
 doing it Perl met a malformed Unicode surrogate.
 
+=item Malformed UTF-8 returned by \N
+
+(F) The charnames handler returned malformed UTF-8.
+
 =item Malformed UTF-8 string in pack
 
 (F) You tried to pack something that didn't comply with UTF-8 encoding
@@ -2467,7 +2471,7 @@ supplied.
 (F) The argument to the indicated command line switch must follow
 immediately after the switch, without intervening spaces.
 
-=item Missing %sbrace%s on \N{}
+=item Missing braces on \N{}
 
 (F) Wrong syntax of character name literal C<\N{charname}> within
 double-quotish context.
@@ -2506,7 +2510,34 @@ can vary from one line to the next.
 
 =item Missing right brace on %s
 
-(F) Missing right brace in C<\x{...}>, C<\p{...}> or C<\P{...}>.
+(F) Missing right brace in C<\x{...}>, C<\p{...}>, C<\P{...}>, or C<\N{...}>.
+
+=item Missing right brace on \\N{} or unescaped left brace after \\N.  Assuming the latter
+
+(W syntax)
+C<\N> has traditionally been followed by a name enclosed in braces,
+meaning the character (or sequence of characters) given by that name.
+Thus C<\N{ASTERISK}> is another way of writing C<*>, valid in both
+double-quoted strings and regular expression patterns.
+In patterns, it doesn't have the meaning an unescaped C<*> does.
+
+Starting in Perl 5.12.0, C<\N> also can have an additional meaning in patterns,
+namely to match a non-newline character.  (This is like C<.> but is not
+affected by the C</s> modifier.)
+
+This can lead to some ambiguities.  When C<\N> is not followed immediately by a
+left brace, Perl assumes the "match non-newline character" meaning.  Also, if
+the braces form a valid quantifier such as C<\N{3}> or C<\N{5,}>, Perl assumes
+that this means to match the given quantity of non-newlines (in these examples,
+3, and 5 or more, respectively).  In all other case, where there is a C<\N{>
+and a matching C<}>, Perl assumes that a character name is desired.
+
+However, if there is no matching C<}>, Perl doesn't know if it was mistakenly
+omitted, or if "match non-newline" followed by "match a C<{>" was desired.
+It assumes the latter because that is actually a valid interpretation as
+written, unlike the other case.  If you meant the former, you need to add the
+matching right brace.  If you did mean the latter, you can silence this warning
+by writing instead C<\N\{>.
 
 =item Missing right curly or square bracket
 
@@ -2593,6 +2624,13 @@ that yet.
 sense to try to declare one with a package qualifier on the front.  Use
 local() if you want to localize a package variable.
 
+=item \\N in a character class must be a named character: \\N{...}
+
+The new (5.12) meaning of C<\N> to match non-newlines is not valid in a
+bracketed character class, for the same reason that C<.> in a character class
+loses its specialness: it matches almost everything, which is probably not what
+you want.
+
 =item Name "%s::%s" used only once: possible typo
 
 (W once) Typographical errors often show up as unique variable names.
@@ -2605,6 +2643,11 @@ NOTE: This warning detects symbols that have been used only once so $c, @c,
 the same; if a program uses $c only once but also uses any of the others it
 will not trigger this warning.
 
+=item Invalid hexadecimal number in \\N{U+...}
+
+(F) The character constant represented by C<...> is not a valid hexadecimal
+number.
+
 =item Negative '/' count in unpack
 
 (F) The length count obtained from a length/code unpack operation was
@@ -4943,6 +4986,20 @@ C<< @foo->[23] >> or C<< @$ref->[99] >>.  Versions of perl <= 5.6.1 used to
 allow this syntax, but shouldn't have. It is now deprecated, and will be
 removed in a future version.
 
+=item Using just the first character returned by \N{} in character class
+
+(W) A charnames handler may return a sequence of more than one character.
+Currently all but the first one are discarded when used in a regular
+expression pattern bracketed character class.
+
+=item Using just the first characters returned by \N{}
+
+(W) A charnames handler may return a sequence of characters.  There is a finite
+limit as to the number of characters that can be used, which this sequence
+exceeded.  In the message, the characters in the sequence are separated by
+dots, and each is shown by its ordinal in hex.  Anything to the left of the
+C<HERE> was retained; anything to the right was discarded.
+
 =item UTF-16 surrogate %s
 
 (W utf8) You tried to generate half of a UTF-16 surrogate by
diff --git a/regcomp.c b/regcomp.c

index 6669d58..ce4104a 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -132,7 +132,6 @@ typedef struct RExC_state_t {
     I32                orig_utf8;      /* whether the pattern was originally in utf8 */
                                /* XXX use this for future optimisation of case
                                 * where pattern must be upgraded to utf8. */
-    HV         *charnames;             /* cache of named sequences */
     HV         *paren_names;           /* Paren names */
     
     regnode    **recurse;              /* Recurse regops */
@@ -177,7 +176,6 @@ typedef struct RExC_state_t {
 #define RExC_seen_evals        (pRExC_state->seen_evals)
 #define RExC_utf8      (pRExC_state->utf8)
 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
-#define RExC_charnames  (pRExC_state->charnames)
 #define RExC_open_parens       (pRExC_state->open_parens)
 #define RExC_close_parens      (pRExC_state->close_parens)
 #define RExC_opend     (pRExC_state->opend)
@@ -4268,7 +4266,6 @@ redo_first_pass:
     RExC_size = 0L;
     RExC_emit = &PL_regdummy;
     RExC_whilem_seen = 0;
-    RExC_charnames = NULL;
     RExC_open_parens = NULL;
     RExC_close_parens = NULL;
     RExC_opend = NULL;
@@ -6589,56 +6586,69 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
    recognized '\N' and needs to handle the rest. RExC_parse is
    expected to point at the first char following the N at the time
    of the call.
+
+   The \N may be inside (indicated by valuep not being NULL) or outside a
+   character class.
+
+   \N may begin either a named sequence, or if outside a character class, mean
+   to match a non-newline.  For non single-quoted regexes, the tokenizer has
+   attempted to decide which, and in the case of a named sequence converted it
+   into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
+   where c1... are the characters in the sequence.  For single-quoted regexes,
+   the tokenizer passes the \N sequence through unchanged; this code will not
+   attempt to determine this nor expand those.  The net effect is that if the
+   beginning of the passed-in pattern isn't '{U+' or there is no '}', it
+   signals that this \N occurrence means to match a non-newline.
+   
+   Only the \N{U+...} form should occur in a character class, for the same
+   reason that '.' inside a character class means to just match a period: it
+   just doesn't make sense.
    
    If valuep is non-null then it is assumed that we are parsing inside 
    of a charclass definition and the first codepoint in the resolved
    string is returned via *valuep and the routine will return NULL. 
    In this mode if a multichar string is returned from the charnames 
-   handler a warning will be issued, and only the first char in the 
+   handler, a warning will be issued, and only the first char in the 
    sequence will be examined. If the string returned is zero length
    then the value of *valuep is undefined and NON-NULL will 
    be returned to indicate failure. (This will NOT be a valid pointer 
    to a regnode.)
    
-   If valuep is null then it is assumed that we are parsing normal text
-   and inserts a new EXACT node into the program containing the resolved
-   string and returns a pointer to the new node. If the string is 
-   zerolength a NOTHING node is emitted.
+   If valuep is null then it is assumed that we are parsing normal text and a
+   new EXACT node is inserted into the program containing the resolved string,
+   and a pointer to the new node is returned.  But if the string is zero length
+   a NOTHING node is emitted instead.
 
    On success RExC_parse is set to the char following the endbrace.
-   Parsing failures will generate a fatal errorvia vFAIL(...)
-   
-   NOTE: We cache all results from the charnames handler locally in 
-   the RExC_charnames hash (created on first use) to prevent a charnames 
-   handler from playing silly-buggers and returning a short string and 
-   then a long string for a given pattern. Since the regexp program 
-   size is calculated during an initial parse this would result
-   in a buffer overrun so we cache to prevent the charname result from
-   changing during the course of the parse.
-   
+   Parsing failures will generate a fatal error via vFAIL(...)
  */
 STATIC regnode *
 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 {
-    char * name;        /* start of the content of the name */
     char * endbrace;    /* endbrace following the name */
-    SV *sv_str = NULL;  
-    SV *sv_name = NULL;
-    STRLEN len; /* this has various purposes throughout the code */
-    bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
     regnode *ret = NULL;
+#ifdef DEBUGGING
+    char* parse_start = RExC_parse - 2;            /* points to the '\N' */
+#endif
+
+    GET_RE_DEBUG_FLAGS_DECL;
  
     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+
+    GET_RE_DEBUG_FLAGS;
    
-    if (*RExC_parse != '{' ||
-           (*RExC_parse == '{' && RExC_parse[1]
-            && strchr("0123456789", RExC_parse[1])))
+    /* Disambiguate between \N meaning a named character versus \N meaning
+     * don't match a newline. */
+    if (*RExC_parse != '{'
+       || (! (endbrace = strchr(RExC_parse, '}'))) /* no trailing brace */
+       || ! (endbrace == RExC_parse + 1        /* nothing between the {} */
+             || (endbrace - RExC_parse > 3     /* U+ and at least one hex */
+                 && strnEQ(RExC_parse + 1, "U+", 2))))
     {
-       GET_RE_DEBUG_FLAGS_DECL;
-       if (valuep)
+       if (valuep) {
            /* no bare \N in a charclass */
-           vFAIL("Missing braces on \\N{}");
-       GET_RE_DEBUG_FLAGS;
+           vFAIL("\\N in a character class must be a named character: \\N{...}");
+       }
        nextchar(pRExC_state);
        ret = reg_node(pRExC_state, REG_ANY);
        *flagp |= HASWIDTH|SIMPLE;
@@ -6647,235 +6657,168 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
         Set_Node_Length(ret, 1); /* MJD */
        return ret;
     }
-    name = RExC_parse+1;
-    endbrace = strchr(RExC_parse, '}');
-    if ( ! endbrace ) {
-        RExC_parse++;
-        vFAIL("Missing right brace on \\N{}");
-    } 
-    RExC_parse = endbrace + 1;  
-    
-    
-    /* RExC_parse points at the beginning brace, 
-       endbrace points at the last */
-    if ( name[0]=='U' && name[1]=='+' ) {
-        /* its a "Unicode hex" notation {U+89AB} */
-        I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
-            | PERL_SCAN_DISALLOW_PREFIX
-            | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-        UV cp;
-        len = (STRLEN)(endbrace - name - 2);
-        cp = grok_hex(name + 2, &len, &fl, NULL);
-        if ( len != (STRLEN)(endbrace - name - 2) ) {
-            cp = 0xFFFD;
-        }    
-        if ( valuep ) {
-           if (cp > 0xff) RExC_utf8 = 1;
-            *valuep = cp;
-            return NULL;
-        }
 
-       /* Need to convert to utf8 if either: won't fit into a byte, or the re
-        * is going to be in utf8 and the representation changes under utf8. */
-       if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
-           U8 string[UTF8_MAXBYTES+1];
-           U8 *tmps;
-           RExC_utf8 = 1;
-           tmps = uvuni_to_utf8(string, cp);
-           sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
-       } else {    /* Otherwise, no need for utf8, can skip that step */
-           char string;
-           string = (char)cp;
-           sv_str= newSVpvn(&string, 1);
+    /* Here, we have decided it is a named sequence */
+    RExC_parse++;      /* Skip past the '{' */
+    if (endbrace == RExC_parse) {   /* empty: \N{} */
+       if (! valuep) {
+           RExC_parse = endbrace + 1;  
+           return reg_node(pRExC_state,NOTHING);
        }
-    } else {
-        /* fetch the charnames handler for this scope */
-        HV * const table = GvHV(PL_hintgv);
-        SV **cvp= table ? 
-            hv_fetchs(table, "charnames", FALSE) :
-            NULL;
-        SV *cv= cvp ? *cvp : NULL;
-        HE *he_str;
-        int count;
-        /* create an SV with the name as argument */
-        sv_name = newSVpvn(name, endbrace - name);
-        
-        if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
-            vFAIL2("Constant(\\N{%" SVf "}) unknown: "
-                  "(possibly a missing \"use charnames ...\")",
-                  SVfARG(sv_name));
-        }
-        if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
-            vFAIL2("Constant(\\N{%" SVf "}): "
-                  "$^H{charnames} is not defined", SVfARG(sv_name));
-        }
-        
-        
-        
-        if (!RExC_charnames) {
-            /* make sure our cache is allocated */
-            RExC_charnames = newHV();
-            sv_2mortal(MUTABLE_SV(RExC_charnames));
-        } 
-            /* see if we have looked this one up before */
-        he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
-        if ( he_str ) {
-            sv_str = HeVAL(he_str);
-            cached = 1;
-       } else if (PL_parser && PL_parser->error_count > 0) {
-           /* Don't attempt to load charnames if we're already in error */
-           vFAIL("Too many errors, cannot continue parsing");
-        } else {
-            dSP ;
 
-            ENTER ;
-            SAVETMPS ;
-            PUSHMARK(SP) ;
-            
-            XPUSHs(sv_name);
-            
-            PUTBACK ;
-            
-            count= call_sv(cv, G_SCALAR);
-            SPAGAIN ;
-            
-            if (count == 1) { /* XXXX is this right? dmq */
-                sv_str = POPs;
-                SvREFCNT_inc_simple_void(sv_str);
-            } 
-            
-            PUTBACK ;
-            FREETMPS ;
-            LEAVE ;
-            
-            if ( !sv_str || !SvOK(sv_str) ) {
-                vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
-                      "did not return a defined value", SVfARG(sv_name));
-            }
-            if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
-                cached = 1;
-        }
+       if (SIZE_ONLY) {
+           ckWARNreg(RExC_parse,
+                   "Ignoring zero length \\N{} in character class"
+           );
+           RExC_parse = endbrace + 1;  
+       }
+       *valuep = 0;
+       return (regnode *) &RExC_parse; /* Invalid regnode pointer */
     }
-    if (valuep) {
-        char *p = SvPV(sv_str, len);
-        if (len) {
-            STRLEN numlen = 1;
-            if ( SvUTF8(sv_str) ) {
-                *valuep = utf8_to_uvchr((U8*)p, &numlen);
-                if (*valuep > 0x7F)
-                    RExC_utf8 = 1; 
-                /* XXXX
-                  We have to turn on utf8 for high bit chars otherwise
-                  we get failures with
-                  
-                   "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
-                   "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
-                
-                  This is different from what \x{} would do with the same
-                  codepoint, where the condition is > 0xFF.
-                  - dmq
-                */
-                
-                
-            } else {
-                *valuep = (UV)*p;
-                /* warn if we havent used the whole string? */
-            }
-            if (numlen<len && SIZE_ONLY) {
-                ckWARN2reg(RExC_parse,
-                          "Ignoring excess chars from \\N{%" SVf "} in character class",
-                          SVfARG(sv_name)
-                );
-            }        
-        } else if (SIZE_ONLY) {
-            ckWARN2reg(RExC_parse,
-                      "Ignoring zero length \\N{%" SVf "} in character class",
-                      SVfARG(sv_name)
-                );
-        }
-        SvREFCNT_dec(sv_name);
-        if (!cached)
-            SvREFCNT_dec(sv_str);    
-        return len ? NULL : (regnode *)&len;
-    } else if(SvCUR(sv_str)) {     
-        
-        char *s; 
-        char *p, *pend;        
-        STRLEN charlen = 1;
-#ifdef DEBUGGING
-        char * parse_start = name-3; /* needed for the offsets */
-#endif
-        GET_RE_DEBUG_FLAGS_DECL;     /* needed for the offsets */
-        
-        ret = reg_node(pRExC_state,
-            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
-        s= STRING(ret);
-        
-        if ( RExC_utf8 && !SvUTF8(sv_str) ) {
-            sv_utf8_upgrade(sv_str);
-        } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
-            RExC_utf8= 1;
-        }
-        
-        p = SvPV(sv_str, len);
-        pend = p + len;
-        /* len is the length written, charlen is the size the char read */
-        for ( len = 0; p < pend; p += charlen ) {
-            if (UTF) {
-                UV uvc = utf8_to_uvchr((U8*)p, &charlen);
-                if (FOLD) {
-                    STRLEN foldlen,numlen;
-                    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
-                    uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
-                    /* Emit all the Unicode characters. */
-                    
-                    for (foldbuf = tmpbuf;
-                        foldlen;
-                        foldlen -= numlen) 
-                    {
-                        uvc = utf8_to_uvchr(foldbuf, &numlen);
-                        if (numlen > 0) {
-                            const STRLEN unilen = reguni(pRExC_state, uvc, s);
-                            s       += unilen;
-                            len     += unilen;
-                            /* In EBCDIC the numlen
-                            * and unilen can differ. */
-                            foldbuf += numlen;
-                            if (numlen >= foldlen)
-                                break;
-                        }
-                        else
-                            break; /* "Can't happen." */
-                    }                          
-                } else {
-                    const STRLEN unilen = reguni(pRExC_state, uvc, s);
-                   if (unilen > 0) {
-                      s   += unilen;
-                      len += unilen;
-                   }
-               }
-           } else {
-                len++;
-                REGC(*p, s++);
-            }
-        }
-        if (SIZE_ONLY) {
-            RExC_size += STR_SZ(len);
-        } else {
-            STR_LEN(ret) = len;
-            RExC_emit += STR_SZ(len);
-        }
-        Set_Node_Cur_Length(ret); /* MJD */
-        RExC_parse--; 
-        nextchar(pRExC_state);
-    } else {   /* zero length */
-        ret = reg_node(pRExC_state,NOTHING);
+
+    RExC_utf8 = 1;     /* named sequences imply Unicode semantics */
+    RExC_parse += 2;   /* Skip past the 'U+' */
+
+    if (valuep) {   /* In a bracketed char class */
+       /* We only pay attention to the first char of 
+       multichar strings being returned. I kinda wonder
+       if this makes sense as it does change the behaviour
+       from earlier versions, OTOH that behaviour was broken
+       as well. XXX Solution is to recharacterize as
+       [rest-of-class]|multi1|multi2... */
+
+       STRLEN length_of_hex;
+       I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+           | PERL_SCAN_DISALLOW_PREFIX
+           | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+    
+       char * endchar = strchr(RExC_parse, '.');
+       if (endchar) {
+           ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+       }
+       else endchar = endbrace;
+
+       length_of_hex = (STRLEN)(endchar - RExC_parse);
+       *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+
+       /* The tokenizer should have guaranteed validity, but it's possible to
+        * bypass it by using single quoting, so check */
+       if ( length_of_hex != (STRLEN)(endchar - RExC_parse) ) {
+           *valuep = UNICODE_REPLACEMENT;
+       }    
+
+       RExC_parse = endbrace + 1;
+       if (endchar == endbrace) return NULL;
+
+        ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
     }
-    SvREFCNT_dec(sv_name);
-    if (!cached)
-        SvREFCNT_dec(sv_str);
-    return ret;
+    else {     /* Not a char class */
+       char *s;            /* String to put in generated EXACT node */
+       STRLEN len = 0;     /* Its current length */
+       char *endchar;      /* Points to '.' or '}' ending cur char in the input
+                              stream */
+
+       ret = reg_node(pRExC_state,
+                       (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+       s= STRING(ret);
+
+       /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
+        * the input which is of the form now 'c1.c2.c3...}' until find the
+        * ending brace or exeed length 255.  The characters that exceed this
+        * limit are dropped.  The limit could be relaxed should it become
+        * desirable by reparsing this as (?:\N{NAME}), so could generate
+        * multiple EXACT nodes, as is done for just regular input.  But this
+        * is primarily a named character, and not intended to be a huge long
+        * string, so 255 bytes should be good enough */
+       while (1) {
+           STRLEN this_char_length;
+           I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
+                           | PERL_SCAN_DISALLOW_PREFIX
+                           | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+           UV cp;  /* Ord of current character */
+
+           /* Code points are separated by dots.  If none, there is only one
+            * code point, and is terminated by the brace */
+           endchar = strchr(RExC_parse, '.');
+           if (! endchar) endchar = endbrace;
+
+           /* The values are Unicode even on EBCDIC machines */
+           this_char_length = (STRLEN)(endchar - RExC_parse);
+           cp = grok_hex(RExC_parse, &this_char_length, &grok_flags, NULL);
+           if ( this_char_length == 0 
+               || this_char_length != (STRLEN)(endchar - RExC_parse) )
+           {
+               cp = UNICODE_REPLACEMENT;   /* Substitute a valid character */
+           }    
+
+           if (! FOLD) {       /* Not folding, just append to the string */
+               STRLEN unilen;
+
+               /* Quit before adding this character if would exceed limit */
+               if (len + UNISKIP(cp) > U8_MAX) break;
 
+               unilen = reguni(pRExC_state, cp, s);
+               if (unilen > 0) {
+                   s   += unilen;
+                   len += unilen;
+               }
+           } else {    /* Folding, output the folded equivalent */
+               STRLEN foldlen,numlen;
+               U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+               cp = toFOLD_uni(cp, tmpbuf, &foldlen);
+
+               /* Quit before exceeding size limit */
+               if (len + foldlen > U8_MAX) break;
+               
+               for (foldbuf = tmpbuf;
+                   foldlen;
+                   foldlen -= numlen) 
+               {
+                   cp = utf8_to_uvchr(foldbuf, &numlen);
+                   if (numlen > 0) {
+                       const STRLEN unilen = reguni(pRExC_state, cp, s);
+                       s       += unilen;
+                       len     += unilen;
+                       /* In EBCDIC the numlen and unilen can differ. */
+                       foldbuf += numlen;
+                       if (numlen >= foldlen)
+                           break;
+                   }
+                   else
+                       break; /* "Can't happen." */
+               }                          
+           }
+
+           /* Point to the beginning of the next character in the sequence. */
+           RExC_parse = endchar + 1;
+
+           /* Quit if no more characters */
+           if (RExC_parse >= endbrace) break;
+       }
+
+
+       if (SIZE_ONLY) {
+           if (RExC_parse < endbrace) {
+               ckWARNreg(RExC_parse - 1,
+                         "Using just the first characters returned by \\N{}");
+           }
+
+           RExC_size += STR_SZ(len);
+       } else {
+           STR_LEN(ret) = len;
+           RExC_emit += STR_SZ(len);
+       }
+
+       RExC_parse = endbrace + 1;
+
+       *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
+                              with malformed in t/re/pat_advanced.t */
+       RExC_parse --;
+       Set_Node_Cur_Length(ret); /* MJD */
+       nextchar(pRExC_state);
+    }
+
+    return ret;
 }
 
 
@@ -8908,8 +8851,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
 /*
  - regcurly - a little FSA that accepts {\d+,?\d*}
  */
-STATIC I32
-S_regcurly(register const char *s)
+I32
+Perl_regcurly(register const char *s)
 {
     PERL_ARGS_ASSERT_REGCURLY;
 
diff --git a/t/lib/Cname.pm b/t/lib/Cname.pm

index d4b8a9e..562f59a 100644 (file)
--- a/t/lib/Cname.pm
+++ b/t/lib/Cname.pm
@@ -4,6 +4,7 @@ our $Evil='A';
 sub translator {
     my $str = shift;
     if ( $str eq 'EVIL' ) {
+        # Returns A first time, AB second, ABC third ... A-ZA the 27th time.
         (my $c=substr("A".$Evil,-1))++;
         my $r=$Evil;
         $Evil.=$c;
@@ -12,6 +13,25 @@ sub translator {
     if ( $str eq 'EMPTY-STR') {
        return "";
     }
+    if ( $str eq 'NULL') {
+        return "\0";
+    }
+    if ( $str eq 'LONG-STR') {
+        return 'A' x 255;
+    }
+    # Should exceed limit for regex \N bytes in a sequence.  Anyway it will if
+    # UCHAR_MAX is 255.
+    if ( $str eq 'TOO-LONG-STR') {
+       return 'A' x 256;
+    }
+    if ($str eq 'MALFORMED') {
+        $str = "\xDF\xDFabc";
+        utf8::upgrade($str);
+         
+        # Create a malformed in first and second characters.
+        $str =~ s/^\C/A/;
+        $str =~ s/^(\C\C)\C/$1A/;
+    }
     return $str;
 }
 
diff --git a/t/re/pat.t b/t/re/pat.t

index 314e52b..40ae52e 100644 (file)
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -2,7 +2,9 @@
 #
 # This is a home for regular expression tests that don't fit into
 # the format supported by re/regexp.t.  If you want to add a test
-# that does fit that format, add it to re/re_tests, not here.
+# that does fit that format, add it to re/re_tests, not here.  Tests for \N
+# should be added here because they are treated as single quoted strings
+# there, which means they avoid the lexer which otherwise would look at them.
 
 use strict;
 use warnings;
@@ -21,7 +23,7 @@ BEGIN {
 }
 
 
-plan tests => 293;  # Update this when adding/deleting tests.
+plan tests => 297;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -969,6 +971,23 @@ sub run_tests {
         iseq "@space2", "spc tab";
     }
 
+    {
+        use charnames ":full";
+        local $Message = 'Delayed interpolation of \N';
+        my $r1 = qr/\N{THAI CHARACTER SARA I}/;
+        my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}";
+
+        # Bug #56444
+        ok $s1 =~ /$r1+/, 'my $r1 = qr/\N{THAI CHARACTER SARA I}/; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ /$r1+/';
+
+        # Bug #62056
+        ok "${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/, '"${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/';
+
+        ok "abbbbc" =~ m/\N{1}/ && $& eq "a", '"abbbbc" =~ m/\N{1}/ && $& eq "a"';
+        ok "abbbbc" =~ m/\N{3,4}/ && $& eq "abbb", '"abbbbc" =~ m/\N{3,4}/ && $& eq "abbb"';
+    }
+
+
 } # End of sub run_tests
 
 1;
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t

index 3a66a0c..86735ec 100644 (file)
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -21,7 +21,7 @@ BEGIN {
 }
 
 
-plan tests => 1143;  # Update this when adding/deleting tests.
+plan tests => 1155;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1024,21 +1024,20 @@ sub run_tests {
         use Cname;
 
         ok 'fooB'  =~ /\N{foo}[\N{B}\N{b}]/, "Passthrough charname";
-        my $test   = 1233;
         #
         # Why doesn't must_warn work here?
         #
         my $w;
         local $SIG {__WARN__} = sub {$w .= "@_"};
         eval 'q(xxWxx) =~ /[\N{WARN}]/';
-        ok $w && $w =~ /^Ignoring excess chars from/,
-                 "Ignoring excess chars warning";
+        ok $w && $w =~ /Using just the first character returned by \\N{} in character class/,
+                 "single character in [\\N{}] warning";
 
         undef $w;
         eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/,
                    "Zerolength charname in charclass doesn't match \\0"];
-        ok $w && $w =~ /^Ignoring zero length/,
-                 'Ignoring zero length \N{%} in character class warning';
+        ok $w && $w =~ /Ignoring zero length/,
+                 'Ignoring zero length \N{} in character class warning';
 
         ok 'AB'  =~ /(\N{EVIL})/ && $1 eq 'A', 'Charname caching $1';
         ok 'ABC' =~ /(\N{EVIL})/,              'Charname caching $1';
@@ -1046,6 +1045,26 @@ sub run_tests {
                     'Empty string charname produces NOTHING node';
         ok ''    =~ /\N{EMPTY-STR}/,
                     'Empty string charname produces NOTHING node';
+        ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works';
+        ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works';
+
+        # If remove the limitation in regcomp code these should work
+        # differently
+        undef $w;
+        eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string fails gracefully'];
+        ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
+        undef $w;
+        eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string fails gracefully'];
+        ok $w && $w =~ /Using just the first characters returned/, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
+        undef $w;
+        eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string doesnt work'];
+        ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
+        undef $w;
+        eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string doesnt work'];
+        ok $w && $w =~ /Using just the first characters returned/i, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
+        undef $w;
+        eval 'q(syntax error) =~ /\N{MALFORMED}/';
+        ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error';
 
     }
 
@@ -1064,6 +1083,7 @@ sub run_tests {
         ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~
            /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/,
             'Intermixed named and unicode escapes';
+        ok "\0" =~ /^\N{NULL}$/, 'Verify that \N{NULL} works; is not confused with an error';
     }
 
 
diff --git a/t/re/re_tests b/t/re/re_tests

index dc03084..6304fe6 100644 (file)
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1388,6 +1388,13 @@ foo(\h)bar       foo\tbar        y       $1      \t
 # [perl #60344] Regex lookbehind failure after an (if)then|else in perl 5.10
 /\A(?(?=db2)db2|\D+)(?<!processed)\.csv\z/xms  sql_processed.csv       n       -       -
 /\N{U+0100}/   \x{100} y       $&      \x{100} # Bug #59328
+/[a\N{U+0100}]/        \x{100} y       $&      \x{100}
+/[a\N{U+0100}]/        a       y       $&      a
+
+# Verify that \N{U+...} forces Unicode semantics
+/\N{U+41}\x{c1}/i      a\x{e1} y       $&      a\x{e1}
+/[\N{U+41}\x{c1}]/i    \x{e1}  y       $&      \x{e1}
+
 [\s][\S]       \x{a0}\x{a0}    nT      -       -       # Unicode complements should not match same character
 
 # was generating malformed utf8
@@ -1395,4 +1402,22 @@ foo(\h)bar       foo\tbar        y       $1      \t
 
 ((??{ "(?:|)" }))\s    C\x20   y       -       -
 
+# Verify that \ escapes the { after \N, and causes \N to match non-newline
+abc\N\{U+BEEF} abc\n{UBEEF}    n               
+abc\N\{U+BEEF} abc.{UBEEF}     y       $&      abc.{UBEEF}
+[abc\N\{U+BEEF}]       -       c       -       \\N in a character class must be a named character
+
+# Verify that \N can be trailing and causes \N to match non-newline
+abc\N  abcd    y       $&      abcd
+abc\N  abc\n   n               
+
+# Verify get errors.  For these, we need // or else puts it in single quotes,
+# and doesn't expand.
+/\N{U+}/       -       c       -       Invalid hexadecimal number
+/abc\N{def/    -       c       -       Missing right brace
+
+# Verifies catches hex errors, and doesn't expose our . notation to the outside
+/\N{U+0xBEEF}/ -       c       -       Illegal hexadecimal digit
+/\N{U+BEEF.BEAD}/      -       c       -       Illegal hexadecimal digit
+
 # vim: set noexpandtab
diff --git a/t/re/regexp.t b/t/re/regexp.t

index 2344610..558c6f0 100644 (file)
--- a/t/re/regexp.t
+++ b/t/re/regexp.t
@@ -36,6 +36,10 @@
 # If you want to add a regular expression test that can't be expressed
 # in this format, don't add it here: put it in re/pat.t instead.
 #
+# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
+# This means this file cannot be used for testing anything that the lexer
+# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
+#
 # Note that columns 2,3 and 5 are all enclosed in double quotes and then
 # evalled; so something like a\"\x{100}$1 has length 3+length($1).
 
diff --git a/toke.c b/toke.c

index 9df0ff2..fcfdd71 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -2471,10 +2471,7 @@ S_sublex_done(pTHX)
 
   In patterns:
     backslashes:
-      double-quoted style: \r and \n
-      regexp special ones: \D \s
-      constants: \x31
-      backrefs: \1
+      constants: \N{NAME} only
       case and quoting: \U \Q \E
     stops on @ and $, but not for $ as tail anchor
 
@@ -2488,7 +2485,7 @@ S_sublex_done(pTHX)
   In double-quoted strings:
     backslashes:
       double-quoted style: \r and \n
-      constants: \x31
+      constants: \x31, etc.
       deprecated backrefs: \1 (in substitution replacements)
       case and quoting: \U \Q \E
     stops on @ and $
@@ -2516,14 +2513,14 @@ S_sublex_done(pTHX)
          check for embedded arrays
          check for embedded scalars
          if (backslash) {
-             leave intact backslashes from leaveit (below)
              deprecate \1 in substitution replacements
              handle string-changing backslashes \l \U \Q \E, etc.
              switch (what was escaped) {
                  handle \- in a transliteration (becomes a literal -)
+                 if a pattern and not \N{, go treat as regular character
                  handle \132 (octal characters)
                  handle \x15 and \x{1234} (hex characters)
-                 handle \N{name} (named characters)
+                 handle \N{name} (named characters, also \N{3,5} in a pattern)
                  handle \cV (control characters)
                  handle printf-style backslashes (\f, \r, \n, etc)
              } (end switch)
@@ -2581,6 +2578,7 @@ S_scan_const(pTHX_ char *start)
 
 
     while (s < send || dorange) {
+
         /* get transliterations out of the way (they're most literal) */
        if (PL_lex_inwhat == OP_TRANS) {
            /* expand a range A-Z to the full set of characters.  AIE! */
@@ -2800,6 +2798,8 @@ S_scan_const(pTHX_ char *start)
 
        /* backslashes */
        if (*s == '\\' && s+1 < send) {
+           char* e;    /* Can be used for ending '}', etc. */
+
            s++;
 
            /* deprecate \1 in strings and substitution replacements */
@@ -2816,13 +2816,28 @@ S_scan_const(pTHX_ char *start)
                --s;
                break;
            }
-           /* skip any other backslash escapes in a pattern */
-           else if (PL_lex_inpat) {
+           /* In a pattern, process \N, but skip any other backslash escapes.
+            * This is because we don't want to translate an escape sequence
+            * into a meta symbol and have the regex compiler use the meta
+            * symbol meaning, e.g. \x{2E} would be confused with a dot.  But
+            * in spite of this, we do have to process \N here while the proper
+            * charnames handler is in scope.  See bugs #56444 and #62056.
+            * There is a complication because \N in a pattern may also stand
+            * for 'match a non-nl', and not mean a charname, in which case its
+            * processing should be deferred to the regex compiler.  To be a
+            * charname it must be followed immediately by a '{', and not look
+            * like \N followed by a curly quantifier, i.e., not something like
+            * \N{3,}.  regcurly returns a boolean indicating if it is a legal
+            * quantifier */
+           else if (PL_lex_inpat
+                   && (*s != 'N'
+                       || s[1] != '{'
+                       || regcurly(s + 1)))
+           {
                *d++ = NATIVE_TO_NEED(has_utf8,'\\');
                goto default_action;
            }
 
-           /* if we get here, it's either a quoted -, or a digit */
            switch (*s) {
 
            /* quoted - in transliterations */
@@ -2881,15 +2896,13 @@ S_scan_const(pTHX_ char *start)
                }
 
              NUM_ESCAPE_INSERT:
-               /* Insert oct, hex, or \N{U+...} escaped character.  There will
-                * always be enough room in sv since such escapes will be
-                * longer than any UTF-8 sequence they can end up as, except if
-                * they force us to recode the rest of the string into utf8 */
+               /* Insert oct or hex escaped character.  There will always be
+                * enough room in sv since such escapes will be longer than any
+                * UTF-8 sequence they can end up as, except if they force us
+                * to recode the rest of the string into utf8 */
                
                /* Here uv is the ordinal of the next character being added in
-                * unicode (converted from native).  (It has to be done before
-                * here because \N is interpreted as unicode, and oct and hex
-                * as native.) */
+                * unicode (converted from native). */
                if (!UNI_IS_INVARIANT(uv)) {
                    if (!has_utf8 && uv > 255) {
                        /* Might need to recode whatever we have accumulated so
@@ -2929,92 +2942,289 @@ S_scan_const(pTHX_ char *start)
                }
                continue;
 
-           /* \N{LATIN SMALL LETTER A} is a named character, and so is
-            * \N{U+0041} */
            case 'N':
-               ++s;
-               if (*s == '{') {
-                   char* e = strchr(s, '}');
-                   SV *res;
-                   STRLEN len;
-                   const char *str;
-
-                   if (!e) {
+               /* In a non-pattern \N must be a named character, like \N{LATIN
+                * SMALL LETTER A} or \N{U+0041}.  For patterns, it also can
+                * mean to match a non-newline.  For non-patterns, named
+                * characters are converted to their string equivalents. In
+                * patterns, named characters are not converted to their
+                * ultimate forms for the same reasons that other escapes
+                * aren't.  Instead, they are converted to the \N{U+...} form
+                * to get the value from the charnames that is in effect right
+                * now, while preserving the fact that it was a named character
+                * so that the regex compiler knows this */
+
+               /* This section of code doesn't generally use the
+                * NATIVE_TO_NEED() macro to transform the input.  I (khw) did
+                * a close examination of this macro and determined it is a
+                * no-op except on utfebcdic variant characters.  Every
+                * character generated by this that would normally need to be
+                * enclosed by this macro is invariant, so the macro is not
+                * needed, and would complicate use of copy(). There are other
+                * parts of this file where the macro is used inconsistently,
+                * but are saved by it being a no-op */
+
+               /* The structure of this section of code (besides checking for
+                * errors and upgrading to utf8) is:
+                *  Further disambiguate between the two meanings of \N, and if
+                *      not a charname, go process it elsewhere
+                *  If of form \N{U+...}, pass it through if a pattern; otherwise
+                *      convert to utf8
+                *  Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a pattern;
+                *      otherwise convert to utf8 */
+
+               /* Here, s points to the 'N'; the test below is guaranteed to
+                * succeed if we are being called on a pattern as we already
+                * know from a test above that the next character is a '{'.
+                * On a non-pattern \N must mean 'named sequence, which
+                * requires braces */
+               s++;
+               if (*s != '{') {
+                   yyerror("Missing braces on \\N{}"); 
+                   continue;
+               }
+               s++;
+
+               /* If there is no matching '}', it is an error outside of a
+                * pattern, or ambiguous inside. */
+               if (! (e = strchr(s, '}'))) {
+                   if (! PL_lex_inpat) {
                        yyerror("Missing right brace on \\N{}");
-                       e = s - 1;
-                       goto cont_scan;
+                       continue;
                    }
-                   if (e > s + 2 && s[1] == 'U' && s[2] == '+') {
-                       /* \N{U+...} The ... is a unicode value even on EBCDIC
-                        * machines */
-                       I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
-                         PERL_SCAN_DISALLOW_PREFIX;
-                       s += 3;
-                       len = e - s;
-                       uv = grok_hex(s, &len, &flags, NULL);
-                       if ( e > s && len != (STRLEN)(e - s) ) {
-                           uv = 0xFFFD;
+                   else {
+
+                       /* A missing brace means it can't be a legal character
+                        * name, and it could be a legal "match non-newline".
+                        * But it's kind of weird without an unescaped left
+                        * brace, so warn. */
+                       if (ckWARN(WARN_SYNTAX)) {
+                           Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
+                                   "Missing right brace on \\N{} or unescaped left brace after \\N.  Assuming the latter");
                        }
-                       s = e + 1;
-                       goto NUM_ESCAPE_INSERT;
+                       s -= 3; /* Backup over cur char, {, N, to the '\' */
+                       *d++ = NATIVE_TO_NEED(has_utf8,'\\');
+                       goto default_action;
                    }
-                   res = newSVpvn(s + 1, e - s - 1);
-                   res = new_constant( NULL, 0, "charnames",
-                                       res, NULL, s - 2, e - s + 3 );
-                   if (has_utf8)
-                       sv_utf8_upgrade(res);
-                   str = SvPV_const(res,len);
-#ifdef EBCDIC_NEVER_MIND
-                   /* charnames uses pack U and that has been
-                    * recently changed to do the below uni->native
-                    * mapping, so this would be redundant (and wrong,
-                    * the code point would be doubly converted).
-                    * But leave this in just in case the pack U change
-                    * gets revoked, but the semantics is still
-                    * desireable for charnames. --jhi */
-                   {
-                        UV uv = utf8_to_uvchr((const U8*)str, 0);
+               }
 
-                        if (uv < 0x100) {
-                             U8 tmpbuf[UTF8_MAXBYTES+1], *d;
+               /* Here it looks like a named character */
 
-                             d = uvchr_to_utf8(tmpbuf, UNI_TO_NATIVE(uv));
-                             sv_setpvn(res, (char *)tmpbuf, d - tmpbuf);
-                             str = SvPV_const(res, len);
-                        }
-                   }
-#endif
-                   /* If destination is not in utf8 but this new character is,
-                    * recode the dest to utf8 */
-                   if (!has_utf8 && SvUTF8(res)) {
+               if (PL_lex_inpat) {
+
+                   /* XXX This block is temporary code.  \N{} implies that the
+                    * pattern is to have Unicode semantics, and therefore
+                    * currently has to be encoded in utf8.  By putting it in
+                    * utf8 now, we save a whole pass in the regular expression
+                    * compiler.  Once that code is changed so Unicode
+                    * semantics doesn't necessarily have to be in utf8, this
+                    * block should be removed */
+                   if (!has_utf8) {
                        SvCUR_set(sv, d - SvPVX_const(sv));
                        SvPOK_on(sv);
                        *d = '\0';
                        /* See Note on sizing above.  */
                        sv_utf8_upgrade_flags_grow(sv,
-                                           SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                           len + (STRLEN)(send - s) + 1);
+                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                       /* 5 = '\N{' + cur char + NUL */
+                                       (STRLEN)(send - s) + 5);
                        d = SvPVX(sv) + SvCUR(sv);
                        has_utf8 = TRUE;
-                   } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
+                   }
+               }
 
-                       /* See Note on sizing above.  (NOTE: SvCUR() is not set
-                        * correctly here). */
-                       const STRLEN off = d - SvPVX_const(sv);
-                       d = SvGROW(sv, off + len + (STRLEN)(send - s) + 1) + off;
+               if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
+                   I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+                               | PERL_SCAN_DISALLOW_PREFIX;
+                   STRLEN len;
+
+                   /* For \N{U+...}, the '...' is a unicode value even on
+                    * EBCDIC machines */
+                   s += 2;         /* Skip to next char after the 'U+' */
+                   len = e - s;
+                   uv = grok_hex(s, &len, &flags, NULL);
+                   if (len == 0 || len != (STRLEN)(e - s)) {
+                       yyerror("Invalid hexadecimal number in \\N{U+...}");
+                       s = e + 1;
+                       continue;
+                   }
+
+                   if (PL_lex_inpat) {
+
+                       /* Pass through to the regex compiler unchanged.  The
+                        * reason we evaluated the number above is to make sure
+                        * there wasn't a syntax error.  It also makes sure
+                        * that the syntax created below, \N{Uc1.c2...}, is
+                        * internal-only */
+                       s -= 5;     /* Include the '\N{U+' */
+                       Copy(s, d, e - s + 1, char);    /* 1 = include the } */
+                       d += e - s + 1;
+                   }
+                   else {  /* Not a pattern: convert the hex to string */
+
+                        /* If destination is not in utf8, unconditionally
+                         * recode it to be so.  This is because \N{} implies
+                         * Unicode semantics, and scalars have to be in utf8
+                         * to guarantee those semantics */
+                       if (! has_utf8) {
+                           SvCUR_set(sv, d - SvPVX_const(sv));
+                           SvPOK_on(sv);
+                           *d = '\0';
+                           /* See Note on sizing above.  */
+                           sv_utf8_upgrade_flags_grow(
+                                       sv,
+                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                       UNISKIP(uv) + (STRLEN)(send - e) + 1);
+                           d = SvPVX(sv) + SvCUR(sv);
+                           has_utf8 = TRUE;
+                       }
+
+                       /* Add the string to the output */
+                       if (UNI_IS_INVARIANT(uv)) {
+                           *d++ = (char) uv;
+                       }
+                       else d = (char*)uvuni_to_utf8((U8*)d, uv);
+                   }
+               }
+               else { /* Here is \N{NAME} but not \N{U+...}. */
+
+                   SV *res;            /* result from charnames */
+                   const char *str;    /* the string in 'res' */
+                   STRLEN len;         /* its length */
+
+                   /* Get the value for NAME */
+                   res = newSVpvn(s, e - s);
+                   res = new_constant( NULL, 0, "charnames",
+                                       /* includes all of: \N{...} */
+                                       res, NULL, s - 3, e - s + 4 );
+
+                   /* Most likely res will be in utf8 already since the
+                    * standard charnames uses pack U, but a custom translator
+                    * can leave it otherwise, so make sure.  XXX This can be
+                    * revisited to not have charnames use utf8 for characters
+                    * that don't need it when regexes don't have to be in utf8
+                    * for Unicode semantics.  If doing so, remember EBCDIC */
+                   sv_utf8_upgrade(res);
+                   str = SvPV_const(res, len);
+
+                   /* Don't accept malformed input */
+                   if (! is_utf8_string((U8 *) str, len)) {
+                       yyerror("Malformed UTF-8 returned by \\N");
+                   }
+                   else if (PL_lex_inpat) {
+
+                       if (! len) { /* The name resolved to an empty string */
+                           Copy("\\N{}", d, 4, char);
+                           d += 4;
+                       }
+                       else {
+                           /* In order to not lose information for the regex
+                           * compiler, pass the result in the specially made
+                           * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
+                           * the code points in hex of each character
+                           * returned by charnames */
+
+                           const char *str_end = str + len;
+                           STRLEN char_length;     /* cur char's byte length */
+                           STRLEN output_length;   /* and the number of bytes
+                                                      after this is translated
+                                                      into hex digits */
+                           const STRLEN off = d - SvPVX_const(sv);
+
+                           /* 2 hex per byte; 2 chars for '\N'; 2 chars for
+                            * max('U+', '.'); and 1 for NUL */
+                           char hex_string[2 * UTF8_MAXBYTES + 5];
+
+                           /* Get the first character of the result. */
+                           U32 uv = utf8n_to_uvuni((U8 *) str,
+                                                   len,
+                                                   &char_length,
+                                                   UTF8_ALLOW_ANYUV);
+
+                           /* The call to is_utf8_string() above hopefully
+                            * guarantees that there won't be an error.  But
+                            * it's easy here to make sure.  The function just
+                            * above warns and returns 0 if invalid utf8, but
+                            * it can also return 0 if the input is validly a
+                            * NUL. Disambiguate */
+                           if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
+                               uv = UNICODE_REPLACEMENT;
+                           }
+
+                           /* Convert first code point to hex, including the
+                            * boiler plate before it */
+                           sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
+                           output_length = strlen(hex_string);
+
+                           /* Make sure there is enough space to hold it */
+                           d = off + SvGROW(sv, off
+                                                + output_length
+                                                + (STRLEN)(send - e)
+                                                + 2);  /* '}' + NUL */
+                           /* And output it */
+                           Copy(hex_string, d, output_length, char);
+                           d += output_length;
+
+                           /* For each subsequent character, append dot and
+                            * its ordinal in hex */
+                           while ((str += char_length) < str_end) {
+                               const STRLEN off = d - SvPVX_const(sv);
+                               U32 uv = utf8n_to_uvuni((U8 *) str,
+                                                       str_end - str,
+                                                       &char_length,
+                                                       UTF8_ALLOW_ANYUV);
+                               if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
+                                   uv = UNICODE_REPLACEMENT;
+                               }
+
+                               sprintf(hex_string, ".%X", (unsigned int) uv);
+                               output_length = strlen(hex_string);
+
+                               d = off + SvGROW(sv, off
+                                                    + output_length
+                                                    + (STRLEN)(send - e)
+                                                    + 2);      /* '}' +  NUL */
+                               Copy(hex_string, d, output_length, char);
+                               d += output_length;
+                           }
+
+                           *d++ = '}'; /* Done.  Add the trailing brace */
+                       }
+                   }
+                   else { /* Here, not in a pattern.  Convert the name to a
+                           * string. */
+
+                        /* If destination is not in utf8, unconditionally
+                         * recode it to be so.  This is because \N{} implies
+                         * Unicode semantics, and scalars have to be in utf8
+                         * to guarantee those semantics */
+                       if (! has_utf8) {
+                           SvCUR_set(sv, d - SvPVX_const(sv));
+                           SvPOK_on(sv);
+                           *d = '\0';
+                           /* See Note on sizing above.  */
+                           sv_utf8_upgrade_flags_grow(sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               len + (STRLEN)(send - s) + 1);
+                           d = SvPVX(sv) + SvCUR(sv);
+                           has_utf8 = TRUE;
+                       } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
+
+                           /* See Note on sizing above.  (NOTE: SvCUR() is not
+                            * set correctly here). */
+                           const STRLEN off = d - SvPVX_const(sv);
+                           d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
+                       }
+                       Copy(str, d, len, char);
+                       d += len;
                    }
-#ifdef EBCDIC
-                   if (!dorange)
-                       native_range = FALSE; /* \N{} is guessed to be Unicode */
-#endif
-                   Copy(str, d, len, char);
-                   d += len;
                    SvREFCNT_dec(res);
-                 cont_scan:
-                   s = e + 1;
                }
-               else
-                   yyerror("Missing braces on \\N{}");
+#ifdef EBCDIC
+               if (!dorange) 
+                   native_range = FALSE; /* \N{} is defined to be Unicode */
+#endif
+               s = e + 1;  /* Point to just after the '}' */
                continue;
 
            /* \c is a control character */
@@ -11308,6 +11518,10 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
        SvREFCNT_dec(msg);
        return sv;
     }
+
+    /* charnames doesn't work well if there have been errors found */
+    if (PL_error_count > 0 && strEQ(key,"charnames")) return res;
+
     cvp = hv_fetch(table, key, keylen, FALSE);
     if (!cvp || !SvOK(*cvp)) {
        why1 = "$^H{";
embed.fnc		patch \| blob \| blame \| history
pod/perl5120delta.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
t/lib/Cname.pm		patch \| blob \| blame \| history
t/re/pat.t		patch \| blob \| blame \| history
t/re/pat_advanced.t		patch \| blob \| blame \| history
t/re/re_tests		patch \| blob \| blame \| history
t/re/regexp.t		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history