AnpR |void* |get_context
Anp |void |set_context |NN void *t
+EpRnP |I32 |regcurly |NN const char *s
END_EXTERN_C
Es |STRLEN |reguni |NN const struct RExC_state_t *pRExC_state \
|UV uv|NN char *s
Es |regnode*|regclass |NN struct RExC_state_t *pRExC_state|U32 depth
-ERsn |I32 |regcurly |NN const char *s
Es |regnode*|reg_node |NN struct RExC_state_t *pRExC_state|U8 op
Es |UV |reg_recode |const char value|NN SV **encp
Es |regnode*|regpiece |NN struct RExC_state_t *pRExC_state \
A new regex escape has been added, C<\N>. It will match any character that
is not a newline, independently from the presence or absence of the single
-line match modifier C</s>. (If C<\N> is followed by an opening brace and
+line match modifier C</s>. It is not usable within a character class.
+(If C<\N> is followed by an opening brace and
by a letter, perl will still assume that a Unicode character name is
-coming, so compatibility is preserved.) (Rafael Garcia-Suarez)
+coming, so compatibility is preserved.) (Rafael Garcia-Suarez).
This will break a L<custom charnames translator|charnames/CUSTOM TRANSLATORS>
which allows numbers for character names, as C<\N{3}> will now mean to match 3
=item *
-The C<charnames> pragma may generate a run-time error when a regex is
-interpolated [RT #56444]:
-
- use charnames ':full';
- my $r1 = qr/\N{THAI CHARACTER SARA I}/;
- "foo" =~ $r1; # okay
- "foo" =~ /$r1+/; # runtime error
-
-A workaround is to generate the character outside of the regex:
-
- my $a = "\N{THAI CHARACTER SARA I}";
- my $r1 = qr/$a/;
-
-However, C<$r1> must be used within the scope of the C<use charnames> for this
-to work.
-
-=item *
-
Some regexes may run much more slowly when run in a child thread compared
with the thread the pattern was compiled into [RT #55600].
names (like C<$A::B>). You've exceeded Perl's limits. Future versions
of Perl are likely to eliminate these arbitrary limitations.
-=item Ignoring %s in character class in regex; marked by <-- HERE in m/%s/
+=item Ignoring zero length \N{} in character class"
-(W) Named Unicode character escapes (\N{...}) may return multi-char
-or zero length sequences. When such an escape is used in a character class
+(W) Named Unicode character escapes (\N{...}) may return a
+zero length sequence. When such an escape is used in a character class
its behaviour is not well defined. Check that the correct escape has
been used, and the correct charname handler is in scope.
(F) Perl thought it was reading UTF-16 encoded character data but while
doing it Perl met a malformed Unicode surrogate.
+=item Malformed UTF-8 returned by \N
+
+(F) The charnames handler returned malformed UTF-8.
+
=item Malformed UTF-8 string in pack
(F) You tried to pack something that didn't comply with UTF-8 encoding
(F) The argument to the indicated command line switch must follow
immediately after the switch, without intervening spaces.
-=item Missing %sbrace%s on \N{}
+=item Missing braces on \N{}
(F) Wrong syntax of character name literal C<\N{charname}> within
double-quotish context.
=item Missing right brace on %s
-(F) Missing right brace in C<\x{...}>, C<\p{...}> or C<\P{...}>.
+(F) Missing right brace in C<\x{...}>, C<\p{...}>, C<\P{...}>, or C<\N{...}>.
+
+=item Missing right brace on \\N{} or unescaped left brace after \\N. Assuming the latter
+
+(W syntax)
+C<\N> has traditionally been followed by a name enclosed in braces,
+meaning the character (or sequence of characters) given by that name.
+Thus C<\N{ASTERISK}> is another way of writing C<*>, valid in both
+double-quoted strings and regular expression patterns.
+In patterns, it doesn't have the meaning an unescaped C<*> does.
+
+Starting in Perl 5.12.0, C<\N> also can have an additional meaning in patterns,
+namely to match a non-newline character. (This is like C<.> but is not
+affected by the C</s> modifier.)
+
+This can lead to some ambiguities. When C<\N> is not followed immediately by a
+left brace, Perl assumes the "match non-newline character" meaning. Also, if
+the braces form a valid quantifier such as C<\N{3}> or C<\N{5,}>, Perl assumes
+that this means to match the given quantity of non-newlines (in these examples,
+3, and 5 or more, respectively). In all other case, where there is a C<\N{>
+and a matching C<}>, Perl assumes that a character name is desired.
+
+However, if there is no matching C<}>, Perl doesn't know if it was mistakenly
+omitted, or if "match non-newline" followed by "match a C<{>" was desired.
+It assumes the latter because that is actually a valid interpretation as
+written, unlike the other case. If you meant the former, you need to add the
+matching right brace. If you did mean the latter, you can silence this warning
+by writing instead C<\N\{>.
=item Missing right curly or square bracket
sense to try to declare one with a package qualifier on the front. Use
local() if you want to localize a package variable.
+=item \\N in a character class must be a named character: \\N{...}
+
+The new (5.12) meaning of C<\N> to match non-newlines is not valid in a
+bracketed character class, for the same reason that C<.> in a character class
+loses its specialness: it matches almost everything, which is probably not what
+you want.
+
=item Name "%s::%s" used only once: possible typo
(W once) Typographical errors often show up as unique variable names.
the same; if a program uses $c only once but also uses any of the others it
will not trigger this warning.
+=item Invalid hexadecimal number in \\N{U+...}
+
+(F) The character constant represented by C<...> is not a valid hexadecimal
+number.
+
=item Negative '/' count in unpack
(F) The length count obtained from a length/code unpack operation was
allow this syntax, but shouldn't have. It is now deprecated, and will be
removed in a future version.
+=item Using just the first character returned by \N{} in character class
+
+(W) A charnames handler may return a sequence of more than one character.
+Currently all but the first one are discarded when used in a regular
+expression pattern bracketed character class.
+
+=item Using just the first characters returned by \N{}
+
+(W) A charnames handler may return a sequence of characters. There is a finite
+limit as to the number of characters that can be used, which this sequence
+exceeded. In the message, the characters in the sequence are separated by
+dots, and each is shown by its ordinal in hex. Anything to the left of the
+C<HERE> was retained; anything to the right was discarded.
+
=item UTF-16 surrogate %s
(W utf8) You tried to generate half of a UTF-16 surrogate by
I32 orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
- HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
regnode **recurse; /* Recurse regops */
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_orig_utf8 (pRExC_state->orig_utf8)
-#define RExC_charnames (pRExC_state->charnames)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
#define RExC_opend (pRExC_state->opend)
RExC_size = 0L;
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
- RExC_charnames = NULL;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
RExC_opend = NULL;
recognized '\N' and needs to handle the rest. RExC_parse is
expected to point at the first char following the N at the time
of the call.
+
+ The \N may be inside (indicated by valuep not being NULL) or outside a
+ character class.
+
+ \N may begin either a named sequence, or if outside a character class, mean
+ to match a non-newline. For non single-quoted regexes, the tokenizer has
+ attempted to decide which, and in the case of a named sequence converted it
+ into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
+ where c1... are the characters in the sequence. For single-quoted regexes,
+ the tokenizer passes the \N sequence through unchanged; this code will not
+ attempt to determine this nor expand those. The net effect is that if the
+ beginning of the passed-in pattern isn't '{U+' or there is no '}', it
+ signals that this \N occurrence means to match a non-newline.
+
+ Only the \N{U+...} form should occur in a character class, for the same
+ reason that '.' inside a character class means to just match a period: it
+ just doesn't make sense.
If valuep is non-null then it is assumed that we are parsing inside
of a charclass definition and the first codepoint in the resolved
string is returned via *valuep and the routine will return NULL.
In this mode if a multichar string is returned from the charnames
- handler a warning will be issued, and only the first char in the
+ handler, a warning will be issued, and only the first char in the
sequence will be examined. If the string returned is zero length
then the value of *valuep is undefined and NON-NULL will
be returned to indicate failure. (This will NOT be a valid pointer
to a regnode.)
- If valuep is null then it is assumed that we are parsing normal text
- and inserts a new EXACT node into the program containing the resolved
- string and returns a pointer to the new node. If the string is
- zerolength a NOTHING node is emitted.
+ If valuep is null then it is assumed that we are parsing normal text and a
+ new EXACT node is inserted into the program containing the resolved string,
+ and a pointer to the new node is returned. But if the string is zero length
+ a NOTHING node is emitted instead.
On success RExC_parse is set to the char following the endbrace.
- Parsing failures will generate a fatal errorvia vFAIL(...)
-
- NOTE: We cache all results from the charnames handler locally in
- the RExC_charnames hash (created on first use) to prevent a charnames
- handler from playing silly-buggers and returning a short string and
- then a long string for a given pattern. Since the regexp program
- size is calculated during an initial parse this would result
- in a buffer overrun so we cache to prevent the charname result from
- changing during the course of the parse.
-
+ Parsing failures will generate a fatal error via vFAIL(...)
*/
STATIC regnode *
S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
{
- char * name; /* start of the content of the name */
char * endbrace; /* endbrace following the name */
- SV *sv_str = NULL;
- SV *sv_name = NULL;
- STRLEN len; /* this has various purposes throughout the code */
- bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
regnode *ret = NULL;
+#ifdef DEBUGGING
+ char* parse_start = RExC_parse - 2; /* points to the '\N' */
+#endif
+
+ GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+
+ GET_RE_DEBUG_FLAGS;
- if (*RExC_parse != '{' ||
- (*RExC_parse == '{' && RExC_parse[1]
- && strchr("0123456789", RExC_parse[1])))
+ /* Disambiguate between \N meaning a named character versus \N meaning
+ * don't match a newline. */
+ if (*RExC_parse != '{'
+ || (! (endbrace = strchr(RExC_parse, '}'))) /* no trailing brace */
+ || ! (endbrace == RExC_parse + 1 /* nothing between the {} */
+ || (endbrace - RExC_parse > 3 /* U+ and at least one hex */
+ && strnEQ(RExC_parse + 1, "U+", 2))))
{
- GET_RE_DEBUG_FLAGS_DECL;
- if (valuep)
+ if (valuep) {
/* no bare \N in a charclass */
- vFAIL("Missing braces on \\N{}");
- GET_RE_DEBUG_FLAGS;
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
nextchar(pRExC_state);
ret = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
Set_Node_Length(ret, 1); /* MJD */
return ret;
}
- name = RExC_parse+1;
- endbrace = strchr(RExC_parse, '}');
- if ( ! endbrace ) {
- RExC_parse++;
- vFAIL("Missing right brace on \\N{}");
- }
- RExC_parse = endbrace + 1;
-
-
- /* RExC_parse points at the beginning brace,
- endbrace points at the last */
- if ( name[0]=='U' && name[1]=='+' ) {
- /* its a "Unicode hex" notation {U+89AB} */
- I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
- | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
- UV cp;
- len = (STRLEN)(endbrace - name - 2);
- cp = grok_hex(name + 2, &len, &fl, NULL);
- if ( len != (STRLEN)(endbrace - name - 2) ) {
- cp = 0xFFFD;
- }
- if ( valuep ) {
- if (cp > 0xff) RExC_utf8 = 1;
- *valuep = cp;
- return NULL;
- }
- /* Need to convert to utf8 if either: won't fit into a byte, or the re
- * is going to be in utf8 and the representation changes under utf8. */
- if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
- U8 string[UTF8_MAXBYTES+1];
- U8 *tmps;
- RExC_utf8 = 1;
- tmps = uvuni_to_utf8(string, cp);
- sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
- } else { /* Otherwise, no need for utf8, can skip that step */
- char string;
- string = (char)cp;
- sv_str= newSVpvn(&string, 1);
+ /* Here, we have decided it is a named sequence */
+ RExC_parse++; /* Skip past the '{' */
+ if (endbrace == RExC_parse) { /* empty: \N{} */
+ if (! valuep) {
+ RExC_parse = endbrace + 1;
+ return reg_node(pRExC_state,NOTHING);
}
- } else {
- /* fetch the charnames handler for this scope */
- HV * const table = GvHV(PL_hintgv);
- SV **cvp= table ?
- hv_fetchs(table, "charnames", FALSE) :
- NULL;
- SV *cv= cvp ? *cvp : NULL;
- HE *he_str;
- int count;
- /* create an SV with the name as argument */
- sv_name = newSVpvn(name, endbrace - name);
-
- if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
- vFAIL2("Constant(\\N{%" SVf "}) unknown: "
- "(possibly a missing \"use charnames ...\")",
- SVfARG(sv_name));
- }
- if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
- vFAIL2("Constant(\\N{%" SVf "}): "
- "$^H{charnames} is not defined", SVfARG(sv_name));
- }
-
-
-
- if (!RExC_charnames) {
- /* make sure our cache is allocated */
- RExC_charnames = newHV();
- sv_2mortal(MUTABLE_SV(RExC_charnames));
- }
- /* see if we have looked this one up before */
- he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
- if ( he_str ) {
- sv_str = HeVAL(he_str);
- cached = 1;
- } else if (PL_parser && PL_parser->error_count > 0) {
- /* Don't attempt to load charnames if we're already in error */
- vFAIL("Too many errors, cannot continue parsing");
- } else {
- dSP ;
- ENTER ;
- SAVETMPS ;
- PUSHMARK(SP) ;
-
- XPUSHs(sv_name);
-
- PUTBACK ;
-
- count= call_sv(cv, G_SCALAR);
- SPAGAIN ;
-
- if (count == 1) { /* XXXX is this right? dmq */
- sv_str = POPs;
- SvREFCNT_inc_simple_void(sv_str);
- }
-
- PUTBACK ;
- FREETMPS ;
- LEAVE ;
-
- if ( !sv_str || !SvOK(sv_str) ) {
- vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
- "did not return a defined value", SVfARG(sv_name));
- }
- if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
- cached = 1;
- }
+ if (SIZE_ONLY) {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class"
+ );
+ RExC_parse = endbrace + 1;
+ }
+ *valuep = 0;
+ return (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
- if (valuep) {
- char *p = SvPV(sv_str, len);
- if (len) {
- STRLEN numlen = 1;
- if ( SvUTF8(sv_str) ) {
- *valuep = utf8_to_uvchr((U8*)p, &numlen);
- if (*valuep > 0x7F)
- RExC_utf8 = 1;
- /* XXXX
- We have to turn on utf8 for high bit chars otherwise
- we get failures with
-
- "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
- "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
-
- This is different from what \x{} would do with the same
- codepoint, where the condition is > 0xFF.
- - dmq
- */
-
-
- } else {
- *valuep = (UV)*p;
- /* warn if we havent used the whole string? */
- }
- if (numlen<len && SIZE_ONLY) {
- ckWARN2reg(RExC_parse,
- "Ignoring excess chars from \\N{%" SVf "} in character class",
- SVfARG(sv_name)
- );
- }
- } else if (SIZE_ONLY) {
- ckWARN2reg(RExC_parse,
- "Ignoring zero length \\N{%" SVf "} in character class",
- SVfARG(sv_name)
- );
- }
- SvREFCNT_dec(sv_name);
- if (!cached)
- SvREFCNT_dec(sv_str);
- return len ? NULL : (regnode *)&len;
- } else if(SvCUR(sv_str)) {
-
- char *s;
- char *p, *pend;
- STRLEN charlen = 1;
-#ifdef DEBUGGING
- char * parse_start = name-3; /* needed for the offsets */
-#endif
- GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
-
- ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
- s= STRING(ret);
-
- if ( RExC_utf8 && !SvUTF8(sv_str) ) {
- sv_utf8_upgrade(sv_str);
- } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
- RExC_utf8= 1;
- }
-
- p = SvPV(sv_str, len);
- pend = p + len;
- /* len is the length written, charlen is the size the char read */
- for ( len = 0; p < pend; p += charlen ) {
- if (UTF) {
- UV uvc = utf8_to_uvchr((U8*)p, &charlen);
- if (FOLD) {
- STRLEN foldlen,numlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
- /* Emit all the Unicode characters. */
-
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen)
- {
- uvc = utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, uvc, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- } else {
- const STRLEN unilen = reguni(pRExC_state, uvc, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- } else {
- len++;
- REGC(*p, s++);
- }
- }
- if (SIZE_ONLY) {
- RExC_size += STR_SZ(len);
- } else {
- STR_LEN(ret) = len;
- RExC_emit += STR_SZ(len);
- }
- Set_Node_Cur_Length(ret); /* MJD */
- RExC_parse--;
- nextchar(pRExC_state);
- } else { /* zero length */
- ret = reg_node(pRExC_state,NOTHING);
+
+ RExC_utf8 = 1; /* named sequences imply Unicode semantics */
+ RExC_parse += 2; /* Skip past the 'U+' */
+
+ if (valuep) { /* In a bracketed char class */
+ /* We only pay attention to the first char of
+ multichar strings being returned. I kinda wonder
+ if this makes sense as it does change the behaviour
+ from earlier versions, OTOH that behaviour was broken
+ as well. XXX Solution is to recharacterize as
+ [rest-of-class]|multi1|multi2... */
+
+ STRLEN length_of_hex;
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+
+ char * endchar = strchr(RExC_parse, '.');
+ if (endchar) {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
+ else endchar = endbrace;
+
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+
+ /* The tokenizer should have guaranteed validity, but it's possible to
+ * bypass it by using single quoting, so check */
+ if ( length_of_hex != (STRLEN)(endchar - RExC_parse) ) {
+ *valuep = UNICODE_REPLACEMENT;
+ }
+
+ RExC_parse = endbrace + 1;
+ if (endchar == endbrace) return NULL;
+
+ ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
- SvREFCNT_dec(sv_name);
- if (!cached)
- SvREFCNT_dec(sv_str);
- return ret;
+ else { /* Not a char class */
+ char *s; /* String to put in generated EXACT node */
+ STRLEN len = 0; /* Its current length */
+ char *endchar; /* Points to '.' or '}' ending cur char in the input
+ stream */
+
+ ret = reg_node(pRExC_state,
+ (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ s= STRING(ret);
+
+ /* Exact nodes can hold only a U8 length's of text = 255. Loop through
+ * the input which is of the form now 'c1.c2.c3...}' until find the
+ * ending brace or exeed length 255. The characters that exceed this
+ * limit are dropped. The limit could be relaxed should it become
+ * desirable by reparsing this as (?:\N{NAME}), so could generate
+ * multiple EXACT nodes, as is done for just regular input. But this
+ * is primarily a named character, and not intended to be a huge long
+ * string, so 255 bytes should be good enough */
+ while (1) {
+ STRLEN this_char_length;
+ I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+ UV cp; /* Ord of current character */
+
+ /* Code points are separated by dots. If none, there is only one
+ * code point, and is terminated by the brace */
+ endchar = strchr(RExC_parse, '.');
+ if (! endchar) endchar = endbrace;
+
+ /* The values are Unicode even on EBCDIC machines */
+ this_char_length = (STRLEN)(endchar - RExC_parse);
+ cp = grok_hex(RExC_parse, &this_char_length, &grok_flags, NULL);
+ if ( this_char_length == 0
+ || this_char_length != (STRLEN)(endchar - RExC_parse) )
+ {
+ cp = UNICODE_REPLACEMENT; /* Substitute a valid character */
+ }
+
+ if (! FOLD) { /* Not folding, just append to the string */
+ STRLEN unilen;
+
+ /* Quit before adding this character if would exceed limit */
+ if (len + UNISKIP(cp) > U8_MAX) break;
+ unilen = reguni(pRExC_state, cp, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ } else { /* Folding, output the folded equivalent */
+ STRLEN foldlen,numlen;
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+ cp = toFOLD_uni(cp, tmpbuf, &foldlen);
+
+ /* Quit before exceeding size limit */
+ if (len + foldlen > U8_MAX) break;
+
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen)
+ {
+ cp = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ const STRLEN unilen = reguni(pRExC_state, cp, s);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
+ }
+ }
+
+ /* Point to the beginning of the next character in the sequence. */
+ RExC_parse = endchar + 1;
+
+ /* Quit if no more characters */
+ if (RExC_parse >= endbrace) break;
+ }
+
+
+ if (SIZE_ONLY) {
+ if (RExC_parse < endbrace) {
+ ckWARNreg(RExC_parse - 1,
+ "Using just the first characters returned by \\N{}");
+ }
+
+ RExC_size += STR_SZ(len);
+ } else {
+ STR_LEN(ret) = len;
+ RExC_emit += STR_SZ(len);
+ }
+
+ RExC_parse = endbrace + 1;
+
+ *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
+ with malformed in t/re/pat_advanced.t */
+ RExC_parse --;
+ Set_Node_Cur_Length(ret); /* MJD */
+ nextchar(pRExC_state);
+ }
+
+ return ret;
}
/*
- regcurly - a little FSA that accepts {\d+,?\d*}
*/
-STATIC I32
-S_regcurly(register const char *s)
+I32
+Perl_regcurly(register const char *s)
{
PERL_ARGS_ASSERT_REGCURLY;
sub translator {
my $str = shift;
if ( $str eq 'EVIL' ) {
+ # Returns A first time, AB second, ABC third ... A-ZA the 27th time.
(my $c=substr("A".$Evil,-1))++;
my $r=$Evil;
$Evil.=$c;
if ( $str eq 'EMPTY-STR') {
return "";
}
+ if ( $str eq 'NULL') {
+ return "\0";
+ }
+ if ( $str eq 'LONG-STR') {
+ return 'A' x 255;
+ }
+ # Should exceed limit for regex \N bytes in a sequence. Anyway it will if
+ # UCHAR_MAX is 255.
+ if ( $str eq 'TOO-LONG-STR') {
+ return 'A' x 256;
+ }
+ if ($str eq 'MALFORMED') {
+ $str = "\xDF\xDFabc";
+ utf8::upgrade($str);
+
+ # Create a malformed in first and second characters.
+ $str =~ s/^\C/A/;
+ $str =~ s/^(\C\C)\C/$1A/;
+ }
return $str;
}
#
# This is a home for regular expression tests that don't fit into
# the format supported by re/regexp.t. If you want to add a test
-# that does fit that format, add it to re/re_tests, not here.
+# that does fit that format, add it to re/re_tests, not here. Tests for \N
+# should be added here because they are treated as single quoted strings
+# there, which means they avoid the lexer which otherwise would look at them.
use strict;
use warnings;
}
-plan tests => 293; # Update this when adding/deleting tests.
+plan tests => 297; # Update this when adding/deleting tests.
run_tests() unless caller;
iseq "@space2", "spc tab";
}
+ {
+ use charnames ":full";
+ local $Message = 'Delayed interpolation of \N';
+ my $r1 = qr/\N{THAI CHARACTER SARA I}/;
+ my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}";
+
+ # Bug #56444
+ ok $s1 =~ /$r1+/, 'my $r1 = qr/\N{THAI CHARACTER SARA I}/; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ /$r1+/';
+
+ # Bug #62056
+ ok "${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/, '"${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/';
+
+ ok "abbbbc" =~ m/\N{1}/ && $& eq "a", '"abbbbc" =~ m/\N{1}/ && $& eq "a"';
+ ok "abbbbc" =~ m/\N{3,4}/ && $& eq "abbb", '"abbbbc" =~ m/\N{3,4}/ && $& eq "abbb"';
+ }
+
+
} # End of sub run_tests
1;
}
-plan tests => 1143; # Update this when adding/deleting tests.
+plan tests => 1155; # Update this when adding/deleting tests.
run_tests() unless caller;
use Cname;
ok 'fooB' =~ /\N{foo}[\N{B}\N{b}]/, "Passthrough charname";
- my $test = 1233;
#
# Why doesn't must_warn work here?
#
my $w;
local $SIG {__WARN__} = sub {$w .= "@_"};
eval 'q(xxWxx) =~ /[\N{WARN}]/';
- ok $w && $w =~ /^Ignoring excess chars from/,
- "Ignoring excess chars warning";
+ ok $w && $w =~ /Using just the first character returned by \\N{} in character class/,
+ "single character in [\\N{}] warning";
undef $w;
eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/,
"Zerolength charname in charclass doesn't match \\0"];
- ok $w && $w =~ /^Ignoring zero length/,
- 'Ignoring zero length \N{%} in character class warning';
+ ok $w && $w =~ /Ignoring zero length/,
+ 'Ignoring zero length \N{} in character class warning';
ok 'AB' =~ /(\N{EVIL})/ && $1 eq 'A', 'Charname caching $1';
ok 'ABC' =~ /(\N{EVIL})/, 'Charname caching $1';
'Empty string charname produces NOTHING node';
ok '' =~ /\N{EMPTY-STR}/,
'Empty string charname produces NOTHING node';
+ ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works';
+ ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works';
+
+ # If remove the limitation in regcomp code these should work
+ # differently
+ undef $w;
+ eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string fails gracefully'];
+ ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
+ undef $w;
+ eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string fails gracefully'];
+ ok $w && $w =~ /Using just the first characters returned/, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
+ undef $w;
+ eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string doesnt work'];
+ ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
+ undef $w;
+ eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string doesnt work'];
+ ok $w && $w =~ /Using just the first characters returned/i, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
+ undef $w;
+ eval 'q(syntax error) =~ /\N{MALFORMED}/';
+ ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error';
}
ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~
/[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/,
'Intermixed named and unicode escapes';
+ ok "\0" =~ /^\N{NULL}$/, 'Verify that \N{NULL} works; is not confused with an error';
}
# [perl #60344] Regex lookbehind failure after an (if)then|else in perl 5.10
/\A(?(?=db2)db2|\D+)(?<!processed)\.csv\z/xms sql_processed.csv n - -
/\N{U+0100}/ \x{100} y $& \x{100} # Bug #59328
+/[a\N{U+0100}]/ \x{100} y $& \x{100}
+/[a\N{U+0100}]/ a y $& a
+
+# Verify that \N{U+...} forces Unicode semantics
+/\N{U+41}\x{c1}/i a\x{e1} y $& a\x{e1}
+/[\N{U+41}\x{c1}]/i \x{e1} y $& \x{e1}
+
[\s][\S] \x{a0}\x{a0} nT - - # Unicode complements should not match same character
# was generating malformed utf8
((??{ "(?:|)" }))\s C\x20 y - -
+# Verify that \ escapes the { after \N, and causes \N to match non-newline
+abc\N\{U+BEEF} abc\n{UBEEF} n
+abc\N\{U+BEEF} abc.{UBEEF} y $& abc.{UBEEF}
+[abc\N\{U+BEEF}] - c - \\N in a character class must be a named character
+
+# Verify that \N can be trailing and causes \N to match non-newline
+abc\N abcd y $& abcd
+abc\N abc\n n
+
+# Verify get errors. For these, we need // or else puts it in single quotes,
+# and doesn't expand.
+/\N{U+}/ - c - Invalid hexadecimal number
+/abc\N{def/ - c - Missing right brace
+
+# Verifies catches hex errors, and doesn't expose our . notation to the outside
+/\N{U+0xBEEF}/ - c - Illegal hexadecimal digit
+/\N{U+BEEF.BEAD}/ - c - Illegal hexadecimal digit
+
# vim: set noexpandtab
# If you want to add a regular expression test that can't be expressed
# in this format, don't add it here: put it in re/pat.t instead.
#
+# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
+# This means this file cannot be used for testing anything that the lexer
+# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
+#
# Note that columns 2,3 and 5 are all enclosed in double quotes and then
# evalled; so something like a\"\x{100}$1 has length 3+length($1).
In patterns:
backslashes:
- double-quoted style: \r and \n
- regexp special ones: \D \s
- constants: \x31
- backrefs: \1
+ constants: \N{NAME} only
case and quoting: \U \Q \E
stops on @ and $, but not for $ as tail anchor
In double-quoted strings:
backslashes:
double-quoted style: \r and \n
- constants: \x31
+ constants: \x31, etc.
deprecated backrefs: \1 (in substitution replacements)
case and quoting: \U \Q \E
stops on @ and $
check for embedded arrays
check for embedded scalars
if (backslash) {
- leave intact backslashes from leaveit (below)
deprecate \1 in substitution replacements
handle string-changing backslashes \l \U \Q \E, etc.
switch (what was escaped) {
handle \- in a transliteration (becomes a literal -)
+ if a pattern and not \N{, go treat as regular character
handle \132 (octal characters)
handle \x15 and \x{1234} (hex characters)
- handle \N{name} (named characters)
+ handle \N{name} (named characters, also \N{3,5} in a pattern)
handle \cV (control characters)
handle printf-style backslashes (\f, \r, \n, etc)
} (end switch)
while (s < send || dorange) {
+
/* get transliterations out of the way (they're most literal) */
if (PL_lex_inwhat == OP_TRANS) {
/* expand a range A-Z to the full set of characters. AIE! */
/* backslashes */
if (*s == '\\' && s+1 < send) {
+ char* e; /* Can be used for ending '}', etc. */
+
s++;
/* deprecate \1 in strings and substitution replacements */
--s;
break;
}
- /* skip any other backslash escapes in a pattern */
- else if (PL_lex_inpat) {
+ /* In a pattern, process \N, but skip any other backslash escapes.
+ * This is because we don't want to translate an escape sequence
+ * into a meta symbol and have the regex compiler use the meta
+ * symbol meaning, e.g. \x{2E} would be confused with a dot. But
+ * in spite of this, we do have to process \N here while the proper
+ * charnames handler is in scope. See bugs #56444 and #62056.
+ * There is a complication because \N in a pattern may also stand
+ * for 'match a non-nl', and not mean a charname, in which case its
+ * processing should be deferred to the regex compiler. To be a
+ * charname it must be followed immediately by a '{', and not look
+ * like \N followed by a curly quantifier, i.e., not something like
+ * \N{3,}. regcurly returns a boolean indicating if it is a legal
+ * quantifier */
+ else if (PL_lex_inpat
+ && (*s != 'N'
+ || s[1] != '{'
+ || regcurly(s + 1)))
+ {
*d++ = NATIVE_TO_NEED(has_utf8,'\\');
goto default_action;
}
- /* if we get here, it's either a quoted -, or a digit */
switch (*s) {
/* quoted - in transliterations */
}
NUM_ESCAPE_INSERT:
- /* Insert oct, hex, or \N{U+...} escaped character. There will
- * always be enough room in sv since such escapes will be
- * longer than any UTF-8 sequence they can end up as, except if
- * they force us to recode the rest of the string into utf8 */
+ /* Insert oct or hex escaped character. There will always be
+ * enough room in sv since such escapes will be longer than any
+ * UTF-8 sequence they can end up as, except if they force us
+ * to recode the rest of the string into utf8 */
/* Here uv is the ordinal of the next character being added in
- * unicode (converted from native). (It has to be done before
- * here because \N is interpreted as unicode, and oct and hex
- * as native.) */
+ * unicode (converted from native). */
if (!UNI_IS_INVARIANT(uv)) {
if (!has_utf8 && uv > 255) {
/* Might need to recode whatever we have accumulated so
}
continue;
- /* \N{LATIN SMALL LETTER A} is a named character, and so is
- * \N{U+0041} */
case 'N':
- ++s;
- if (*s == '{') {
- char* e = strchr(s, '}');
- SV *res;
- STRLEN len;
- const char *str;
-
- if (!e) {
+ /* In a non-pattern \N must be a named character, like \N{LATIN
+ * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
+ * mean to match a non-newline. For non-patterns, named
+ * characters are converted to their string equivalents. In
+ * patterns, named characters are not converted to their
+ * ultimate forms for the same reasons that other escapes
+ * aren't. Instead, they are converted to the \N{U+...} form
+ * to get the value from the charnames that is in effect right
+ * now, while preserving the fact that it was a named character
+ * so that the regex compiler knows this */
+
+ /* This section of code doesn't generally use the
+ * NATIVE_TO_NEED() macro to transform the input. I (khw) did
+ * a close examination of this macro and determined it is a
+ * no-op except on utfebcdic variant characters. Every
+ * character generated by this that would normally need to be
+ * enclosed by this macro is invariant, so the macro is not
+ * needed, and would complicate use of copy(). There are other
+ * parts of this file where the macro is used inconsistently,
+ * but are saved by it being a no-op */
+
+ /* The structure of this section of code (besides checking for
+ * errors and upgrading to utf8) is:
+ * Further disambiguate between the two meanings of \N, and if
+ * not a charname, go process it elsewhere
+ * If of form \N{U+...}, pass it through if a pattern; otherwise
+ * convert to utf8
+ * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a pattern;
+ * otherwise convert to utf8 */
+
+ /* Here, s points to the 'N'; the test below is guaranteed to
+ * succeed if we are being called on a pattern as we already
+ * know from a test above that the next character is a '{'.
+ * On a non-pattern \N must mean 'named sequence, which
+ * requires braces */
+ s++;
+ if (*s != '{') {
+ yyerror("Missing braces on \\N{}");
+ continue;
+ }
+ s++;
+
+ /* If there is no matching '}', it is an error outside of a
+ * pattern, or ambiguous inside. */
+ if (! (e = strchr(s, '}'))) {
+ if (! PL_lex_inpat) {
yyerror("Missing right brace on \\N{}");
- e = s - 1;
- goto cont_scan;
+ continue;
}
- if (e > s + 2 && s[1] == 'U' && s[2] == '+') {
- /* \N{U+...} The ... is a unicode value even on EBCDIC
- * machines */
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
- PERL_SCAN_DISALLOW_PREFIX;
- s += 3;
- len = e - s;
- uv = grok_hex(s, &len, &flags, NULL);
- if ( e > s && len != (STRLEN)(e - s) ) {
- uv = 0xFFFD;
+ else {
+
+ /* A missing brace means it can't be a legal character
+ * name, and it could be a legal "match non-newline".
+ * But it's kind of weird without an unescaped left
+ * brace, so warn. */
+ if (ckWARN(WARN_SYNTAX)) {
+ Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
+ "Missing right brace on \\N{} or unescaped left brace after \\N. Assuming the latter");
}
- s = e + 1;
- goto NUM_ESCAPE_INSERT;
+ s -= 3; /* Backup over cur char, {, N, to the '\' */
+ *d++ = NATIVE_TO_NEED(has_utf8,'\\');
+ goto default_action;
}
- res = newSVpvn(s + 1, e - s - 1);
- res = new_constant( NULL, 0, "charnames",
- res, NULL, s - 2, e - s + 3 );
- if (has_utf8)
- sv_utf8_upgrade(res);
- str = SvPV_const(res,len);
-#ifdef EBCDIC_NEVER_MIND
- /* charnames uses pack U and that has been
- * recently changed to do the below uni->native
- * mapping, so this would be redundant (and wrong,
- * the code point would be doubly converted).
- * But leave this in just in case the pack U change
- * gets revoked, but the semantics is still
- * desireable for charnames. --jhi */
- {
- UV uv = utf8_to_uvchr((const U8*)str, 0);
+ }
- if (uv < 0x100) {
- U8 tmpbuf[UTF8_MAXBYTES+1], *d;
+ /* Here it looks like a named character */
- d = uvchr_to_utf8(tmpbuf, UNI_TO_NATIVE(uv));
- sv_setpvn(res, (char *)tmpbuf, d - tmpbuf);
- str = SvPV_const(res, len);
- }
- }
-#endif
- /* If destination is not in utf8 but this new character is,
- * recode the dest to utf8 */
- if (!has_utf8 && SvUTF8(res)) {
+ if (PL_lex_inpat) {
+
+ /* XXX This block is temporary code. \N{} implies that the
+ * pattern is to have Unicode semantics, and therefore
+ * currently has to be encoded in utf8. By putting it in
+ * utf8 now, we save a whole pass in the regular expression
+ * compiler. Once that code is changed so Unicode
+ * semantics doesn't necessarily have to be in utf8, this
+ * block should be removed */
+ if (!has_utf8) {
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
/* See Note on sizing above. */
sv_utf8_upgrade_flags_grow(sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- len + (STRLEN)(send - s) + 1);
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ /* 5 = '\N{' + cur char + NUL */
+ (STRLEN)(send - s) + 5);
d = SvPVX(sv) + SvCUR(sv);
has_utf8 = TRUE;
- } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
+ }
+ }
- /* See Note on sizing above. (NOTE: SvCUR() is not set
- * correctly here). */
- const STRLEN off = d - SvPVX_const(sv);
- d = SvGROW(sv, off + len + (STRLEN)(send - s) + 1) + off;
+ if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ STRLEN len;
+
+ /* For \N{U+...}, the '...' is a unicode value even on
+ * EBCDIC machines */
+ s += 2; /* Skip to next char after the 'U+' */
+ len = e - s;
+ uv = grok_hex(s, &len, &flags, NULL);
+ if (len == 0 || len != (STRLEN)(e - s)) {
+ yyerror("Invalid hexadecimal number in \\N{U+...}");
+ s = e + 1;
+ continue;
+ }
+
+ if (PL_lex_inpat) {
+
+ /* Pass through to the regex compiler unchanged. The
+ * reason we evaluated the number above is to make sure
+ * there wasn't a syntax error. It also makes sure
+ * that the syntax created below, \N{Uc1.c2...}, is
+ * internal-only */
+ s -= 5; /* Include the '\N{U+' */
+ Copy(s, d, e - s + 1, char); /* 1 = include the } */
+ d += e - s + 1;
+ }
+ else { /* Not a pattern: convert the hex to string */
+
+ /* If destination is not in utf8, unconditionally
+ * recode it to be so. This is because \N{} implies
+ * Unicode semantics, and scalars have to be in utf8
+ * to guarantee those semantics */
+ if (! has_utf8) {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ /* See Note on sizing above. */
+ sv_utf8_upgrade_flags_grow(
+ sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ UNISKIP(uv) + (STRLEN)(send - e) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
+ }
+
+ /* Add the string to the output */
+ if (UNI_IS_INVARIANT(uv)) {
+ *d++ = (char) uv;
+ }
+ else d = (char*)uvuni_to_utf8((U8*)d, uv);
+ }
+ }
+ else { /* Here is \N{NAME} but not \N{U+...}. */
+
+ SV *res; /* result from charnames */
+ const char *str; /* the string in 'res' */
+ STRLEN len; /* its length */
+
+ /* Get the value for NAME */
+ res = newSVpvn(s, e - s);
+ res = new_constant( NULL, 0, "charnames",
+ /* includes all of: \N{...} */
+ res, NULL, s - 3, e - s + 4 );
+
+ /* Most likely res will be in utf8 already since the
+ * standard charnames uses pack U, but a custom translator
+ * can leave it otherwise, so make sure. XXX This can be
+ * revisited to not have charnames use utf8 for characters
+ * that don't need it when regexes don't have to be in utf8
+ * for Unicode semantics. If doing so, remember EBCDIC */
+ sv_utf8_upgrade(res);
+ str = SvPV_const(res, len);
+
+ /* Don't accept malformed input */
+ if (! is_utf8_string((U8 *) str, len)) {
+ yyerror("Malformed UTF-8 returned by \\N");
+ }
+ else if (PL_lex_inpat) {
+
+ if (! len) { /* The name resolved to an empty string */
+ Copy("\\N{}", d, 4, char);
+ d += 4;
+ }
+ else {
+ /* In order to not lose information for the regex
+ * compiler, pass the result in the specially made
+ * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
+ * the code points in hex of each character
+ * returned by charnames */
+
+ const char *str_end = str + len;
+ STRLEN char_length; /* cur char's byte length */
+ STRLEN output_length; /* and the number of bytes
+ after this is translated
+ into hex digits */
+ const STRLEN off = d - SvPVX_const(sv);
+
+ /* 2 hex per byte; 2 chars for '\N'; 2 chars for
+ * max('U+', '.'); and 1 for NUL */
+ char hex_string[2 * UTF8_MAXBYTES + 5];
+
+ /* Get the first character of the result. */
+ U32 uv = utf8n_to_uvuni((U8 *) str,
+ len,
+ &char_length,
+ UTF8_ALLOW_ANYUV);
+
+ /* The call to is_utf8_string() above hopefully
+ * guarantees that there won't be an error. But
+ * it's easy here to make sure. The function just
+ * above warns and returns 0 if invalid utf8, but
+ * it can also return 0 if the input is validly a
+ * NUL. Disambiguate */
+ if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ /* Convert first code point to hex, including the
+ * boiler plate before it */
+ sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
+ output_length = strlen(hex_string);
+
+ /* Make sure there is enough space to hold it */
+ d = off + SvGROW(sv, off
+ + output_length
+ + (STRLEN)(send - e)
+ + 2); /* '}' + NUL */
+ /* And output it */
+ Copy(hex_string, d, output_length, char);
+ d += output_length;
+
+ /* For each subsequent character, append dot and
+ * its ordinal in hex */
+ while ((str += char_length) < str_end) {
+ const STRLEN off = d - SvPVX_const(sv);
+ U32 uv = utf8n_to_uvuni((U8 *) str,
+ str_end - str,
+ &char_length,
+ UTF8_ALLOW_ANYUV);
+ if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ sprintf(hex_string, ".%X", (unsigned int) uv);
+ output_length = strlen(hex_string);
+
+ d = off + SvGROW(sv, off
+ + output_length
+ + (STRLEN)(send - e)
+ + 2); /* '}' + NUL */
+ Copy(hex_string, d, output_length, char);
+ d += output_length;
+ }
+
+ *d++ = '}'; /* Done. Add the trailing brace */
+ }
+ }
+ else { /* Here, not in a pattern. Convert the name to a
+ * string. */
+
+ /* If destination is not in utf8, unconditionally
+ * recode it to be so. This is because \N{} implies
+ * Unicode semantics, and scalars have to be in utf8
+ * to guarantee those semantics */
+ if (! has_utf8) {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ /* See Note on sizing above. */
+ sv_utf8_upgrade_flags_grow(sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ len + (STRLEN)(send - s) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
+ } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
+
+ /* See Note on sizing above. (NOTE: SvCUR() is not
+ * set correctly here). */
+ const STRLEN off = d - SvPVX_const(sv);
+ d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
+ }
+ Copy(str, d, len, char);
+ d += len;
}
-#ifdef EBCDIC
- if (!dorange)
- native_range = FALSE; /* \N{} is guessed to be Unicode */
-#endif
- Copy(str, d, len, char);
- d += len;
SvREFCNT_dec(res);
- cont_scan:
- s = e + 1;
}
- else
- yyerror("Missing braces on \\N{}");
+#ifdef EBCDIC
+ if (!dorange)
+ native_range = FALSE; /* \N{} is defined to be Unicode */
+#endif
+ s = e + 1; /* Point to just after the '}' */
continue;
/* \c is a control character */
SvREFCNT_dec(msg);
return sv;
}
+
+ /* charnames doesn't work well if there have been errors found */
+ if (PL_error_count > 0 && strEQ(key,"charnames")) return res;
+
cvp = hv_fetch(table, key, keylen, FALSE);
if (!cvp || !SvOK(*cvp)) {
why1 = "$^H{";