--- /dev/null
+#
+# This script is written intentionally in UTF-8
+
+BEGIN {
+ if (ord("A") == 193) {
+ print "1..0 # Skip: EBCDIC\n";
+ exit 0;
+ }
+ $| = 1;
+}
+
+use strict;
+
+use Test::More tests => 10;
+use charnames ':full';
+
+use utf8;
+
+my $A_with_ogonek = "Ą";
+my $micro_sign = "µ";
+my $hex_first = "a\x{A2}Ą";
+my $hex_last = "aĄ\x{A2}";
+my $name_first = "b\N{MICRO SIGN}Ɓ";
+my $name_last = "bƁ\N{MICRO SIGN}";
+my $uname_first = "b\N{U+00B5}Ɓ";
+my $uname_last = "bƁ\N{U+00B5}";
+my $octal_first = "c\377Ć";
+my $octal_last = "cĆ\377";
+
+do {
+ use bytes;
+ is((join "", unpack("C*", $A_with_ogonek)), "196" . "132", 'single char above 0x100');
+ is((join "", unpack("C*", $micro_sign)), "194" . "181", 'single char in 0x80 .. 0xFF');
+ is((join "", unpack("C*", $hex_first)), "97" . "194" . "162" . "196" . "132", 'a . \x{A2} . char above 0x100');
+ is((join "", unpack("C*", $hex_last)), "97" . "196" . "132" . "194" . "162", 'a . char above 0x100 . \x{A2}');
+ is((join "", unpack("C*", $name_first)), "98" . "194" . "181" . "198" . "129", 'b . \N{MICRO SIGN} . char above 0x100');
+ is((join "", unpack("C*", $name_last)), "98" . "198" . "129" . "194" . "181", 'b . char above 0x100 . \N{MICRO SIGN}');
+ is((join "", unpack("C*", $uname_first)), "98" . "194" . "181" . "198" . "129", 'b . \N{U+00B5} . char above 0x100');
+ is((join "", unpack("C*", $uname_last)), "98" . "198" . "129" . "194" . "181", 'b . char above 0x100 . \N{U+00B5}');
+ is((join "", unpack("C*", $octal_first)), "99" . "195" . "191" . "196" . "134", 'c . \377 . char above 0x100');
+ is((join "", unpack("C*", $octal_last)), "99" . "196" . "134" . "195" . "191", 'c . char above 0x100 . \377');
+}
+__END__
+
handle \cV (control characters)
handle printf-style backslashes (\f, \r, \n, etc)
} (end switch)
+ continue
} (end if backslash)
+ handle regular character
} (end while character to read)
*/
{
dVAR;
register char *send = PL_bufend; /* end of the constant */
- SV *sv = newSV(send - start); /* sv for the constant */
+ SV *sv = newSV(send - start); /* sv for the constant. See
+ note below on sizing. */
register char *s = start; /* start of the constant */
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
I32 has_utf8 = FALSE; /* Output constant is UTF8 */
- I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */
+ I32 this_utf8 = UTF; /* Is the source string assumed
+ to be UTF8? But, this can
+ show as true when the source
+ isn't utf8, as for example
+ when it is entirely composed
+ of hex constants */
+
+ /* Note on sizing: The scanned constant is placed into sv, which is
+ * initialized by newSV() assuming one byte of output for every byte of
+ * input. This routine expects newSV() to allocate an extra byte for a
+ * trailing NUL, which this routine will append if it gets to the end of
+ * the input. There may be more bytes of input than output (eg., \N{LATIN
+ * CAPITAL LETTER A}), or more output than input if the constant ends up
+ * recoded to utf8, but each time a construct is found that might increase
+ * the needed size, SvGROW() is called. Its size parameter each time is
+ * based on the best guess estimate at the time, namely the length used so
+ * far, plus the length the current construct will occupy, plus room for
+ * the trailing NUL, plus one byte for every input byte still unscanned */
+
UV uv;
#ifdef EBCDIC
UV literal_endpoint = 0;
goto default_action;
}
- /* \132 indicates an octal constant */
+ /* eg. \132 indicates the octal constant 0x132 */
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
{
I32 flags = 0;
STRLEN len = 3;
- uv = grok_oct(s, &len, &flags, NULL);
+ uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
s += len;
}
goto NUM_ESCAPE_INSERT;
- /* \x24 indicates a hex constant */
+ /* eg. \x24 indicates the hex constant 0x24 */
case 'x':
++s;
if (*s == '{') {
continue;
}
len = e - s;
- uv = grok_hex(s, &len, &flags, NULL);
+ uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
s = e + 1;
}
else {
{
STRLEN len = 2;
I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- uv = grok_hex(s, &len, &flags, NULL);
+ uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
s += len;
}
}
NUM_ESCAPE_INSERT:
- /* Insert oct or hex escaped character.
- * There will always enough room in sv since such
- * escapes will be longer than any UTF-8 sequence
- * they can end up as. */
+ /* Insert oct, hex, or \N{U+...} escaped character. There will
+ * always be enough room in sv since such escapes will be
+ * longer than any UTF-8 sequence they can end up as, except if
+ * they force us to recode the rest of the string into utf8 */
- /* We need to map to chars to ASCII before doing the tests
- to cover EBCDIC
- */
- if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) {
+ /* Here uv is the ordinal of the next character being added in
+ * unicode (converted from native). (It has to be done before
+ * here because \N is interpreted as unicode, and oct and hex
+ * as native.) */
+ if (!UNI_IS_INVARIANT(uv)) {
if (!has_utf8 && uv > 255) {
- /* Might need to recode whatever we have
- * accumulated so far if it contains any
- * hibit chars.
- *
- * (Can't we keep track of that and avoid
- * this rescan? --jhi)
- */
- int hicount = 0;
- U8 *c;
- for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) {
- if (!NATIVE_IS_INVARIANT(*c)) {
- hicount++;
- }
- }
- if (hicount) {
- const STRLEN offset = d - SvPVX_const(sv);
- U8 *src, *dst;
- d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset;
- src = (U8 *)d - 1;
- dst = src+hicount;
- d += hicount;
- while (src >= (const U8 *)SvPVX_const(sv)) {
- if (!NATIVE_IS_INVARIANT(*src)) {
- const U8 ch = NATIVE_TO_ASCII(*src);
- *dst-- = (U8)UTF8_EIGHT_BIT_LO(ch);
- *dst-- = (U8)UTF8_EIGHT_BIT_HI(ch);
- }
- else {
- *dst-- = *src;
- }
- src--;
- }
- }
+ /* Might need to recode whatever we have accumulated so
+ * far if it contains any chars variant in utf8 or
+ * utf-ebcdic. */
+
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ sv_utf8_upgrade(sv);
+ /* See Note on sizing above. */
+ SvGROW(sv, SvCUR(sv) + UNISKIP(uv) + (STRLEN)(send - s) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
}
- if (has_utf8 || uv > 255) {
- d = (char*)uvchr_to_utf8((U8*)d, uv);
- has_utf8 = TRUE;
+ if (has_utf8) {
+ d = (char*)uvuni_to_utf8((U8*)d, uv);
if (PL_lex_inwhat == OP_TRANS &&
PL_sublex_info.sub_op) {
PL_sublex_info.sub_op->op_private |=
}
continue;
- /* \N{LATIN SMALL LETTER A} is a named character */
+ /* \N{LATIN SMALL LETTER A} is a named character, and so is
+ * \N{U+0041} */
case 'N':
++s;
if (*s == '{') {
goto cont_scan;
}
if (e > s + 2 && s[1] == 'U' && s[2] == '+') {
- /* \N{U+...} */
+ /* \N{U+...} The ... is a unicode value even on EBCDIC
+ * machines */
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
PERL_SCAN_DISALLOW_PREFIX;
s += 3;
}
}
#endif
+ /* If destination is not in utf8 but this new character is,
+ * recode the dest to utf8 */
if (!has_utf8 && SvUTF8(res)) {
- const char * const ostart = SvPVX_const(sv);
- SvCUR_set(sv, d - ostart);
+ SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
sv_utf8_upgrade(sv);
- /* this just broke our allocation above... */
- SvGROW(sv, (STRLEN)(send - start));
+ /* See Note on sizing above. */
+ SvGROW(sv, SvCUR(sv) + len + (STRLEN)(send - s) + 1);
d = SvPVX(sv) + SvCUR(sv);
has_utf8 = TRUE;
- }
- if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
- const char * const odest = SvPVX_const(sv);
+ } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
- SvGROW(sv, (SvLEN(sv) + len - (e - s + 4)));
- d = SvPVX(sv) + (d - odest);
+ /* See Note on sizing above. (NOTE: SvCUR() is not set
+ * correctly here). */
+ const STRLEN off = d - SvPVX_const(sv);
+ d = SvGROW(sv, off + len + (STRLEN)(send - s) + 1) + off;
}
#ifdef EBCDIC
if (!dorange)
#endif
default_action:
- /* If we started with encoded form, or already know we want it
- and then encode the next character */
- if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) {
+ /* If we started with encoded form, or already know we want it,
+ then encode the next character */
+ if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
STRLEN len = 1;
+
+
+ /* One might think that it is wasted effort in the case of the
+ * source being utf8 (this_utf8 == TRUE) to take the next character
+ * in the source, convert it to an unsigned value, and then convert
+ * it back again. But the source has not been validated here. The
+ * routine that does the conversion checks for errors like
+ * malformed utf8 */
+
const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
- s += len;
- if (need > len) {
- /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */
+ if (!has_utf8) {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ sv_utf8_upgrade(sv);
+
+ /* See Note on sizing above. */
+ SvGROW(sv, SvCUR(sv) + need + (STRLEN)(send - s) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
+ } else if (need > len) {
+ /* encoded value larger than old, may need extra space (NOTE:
+ * SvCUR() is not set correctly here). See Note on sizing
+ * above. */
const STRLEN off = d - SvPVX_const(sv);
- d = SvGROW(sv, SvLEN(sv) + (need-len)) + off;
+ d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
}
+ s += len;
+
d = (char*)uvchr_to_utf8((U8*)d, nextuv);
- has_utf8 = TRUE;
#ifdef EBCDIC
if (uv > 255 && !dorange)
native_range = FALSE;