/* utf8.c
*
- * Copyright (C) 2000, 2001, 2002, 2003, by Larry Wall and others
+ * Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
*
* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
/*
=head1 Unicode Support
+This file contains various utility functions for manipulating UTF-8 encoded
+strings. For the uninitiated, UTF-8 is a method of representing arbitrary
+Unicode characters as a variable number of bytes, in such a way that
+characters in the ASCII range are unmodified, and a zero byte never appears
+within non-zero characters.
+
=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
U8* send;
STRLEN c;
- if (!len)
+ if (!len && s)
len = strlen((char *)s);
send = s + len;
U8* send;
STRLEN c;
- if (!len)
+ if (!len && s)
len = strlen((char *)s);
send = s + len;
if (!(uv > ouv)) {
/* These cannot be allowed. */
if (uv == ouv) {
- if (!(flags & UTF8_ALLOW_LONG)) {
+ if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
warning = UTF8_WARN_LONG;
goto malformed;
}
U8* pend;
U8* dstart = d;
+ if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
+ d[0] = 0;
+ *newlen = 1;
+ return d;
+ }
+
if (bytelen & 1)
- Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
+ Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %d", bytelen);
pend = p + bytelen;
if (!*swashp) /* load on-demand */
*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
- if (special) {
+ /* 0xDF is the only Unicode code point below 0x100 that has special casing. */
+ if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
/* It might be "special" (sometimes, but not always,
* a multicharacter mapping) */
HV *hv;
- SV *keysv;
- HE *he;
- SV *val;
-
- if ((hv = get_hv(special, FALSE)) &&
- (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
- (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
- (val = HeVAL(he))) {
- char *s;
+ SV **svp;
+
+ if ((hv = get_hv(special, FALSE)) &&
+ (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
+ (*svp)) {
+ char *s;
- s = SvPV(val, len);
+ s = SvPV(*svp, len);
if (len == 1)
len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
else {
U8 *t = (U8*)s, *tend = t + len, *d;
d = tmpbuf;
- if (SvUTF8(val)) {
+ if (SvUTF8(*svp)) {
STRLEN tlen = 0;
while (t < tend) {
SV* retval;
SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
dSP;
- HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
+ size_t pkg_len = strlen(pkg);
+ size_t name_len = strlen(name);
+ HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
SV* errsv_save;
if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ENTER;
errsv_save = newSVsv(ERRSV);
- Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
+ Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
+ Nullsv);
if (!SvTRUE(ERRSV))
sv_setsv(ERRSV, errsv_save);
SvREFCNT_dec(errsv_save);
PUSHSTACKi(PERLSI_MAGIC);
PUSHMARK(SP);
EXTEND(SP,5);
- PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
- PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
+ PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
+ PUSHs(sv_2mortal(newSVpvn(name, name_len)));
PUSHs(listsv);
PUSHs(sv_2mortal(newSViv(minbits)));
PUSHs(sv_2mortal(newSViv(none)));
in there (they will point at the beginning of the I<next> character).
If the pointers behind pe1 or pe2 are non-NULL, they are the end
pointers beyond which scanning will not continue under any
-circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
+circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
s2+l2 will be used as goal end pointers that will also stop the scan,
and which qualify towards defining a successful match: all the scans
that define an explicit length must reach their goal pointers for