From: Jarkko Hietaniemi Date: Thu, 28 Feb 2002 05:43:45 +0000 (+0000) Subject: Make shared hash keys to be \0-terminated: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e05949c7fbf3ae0363947bc70c1c662248b91b93;p=p5sagit%2Fp5-mst-13.2.git Make shared hash keys to be \0-terminated: one possible resolution for "UTF-8, weird \w behaviour after HASH-KEY-ification" http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2002-01/msg01327.html The hash keys were shared (SvLEN(sv) == 0 was the giveaway) and were not \0-terminated. This meant that the EOL ($) check in regmatch() read its nextchr from one byte beyond the last character of the key. That byte was the key's UTF-8 flag (HEK_UTF8 was stored immediately after the key), so for UTF-8 keys the nextchr was \1, not the usual string-terminating \0. Wham, no match. I think another possible resolution would be to stop the nextchr computation in regmatch() from peeking beyond the last character of the string: nextchr = locinput < PL_regeol ? UCHARAT(locinput) : 0; p4raw-id: //depot/perl@14908 --- diff --git a/hv.c b/hv.c index e4cc6c9..7efa086 100644 --- a/hv.c +++ b/hv.c @@ -85,9 +85,10 @@ S_save_hek(pTHX_ const char *str, I32 len, U32 hash) is_utf8 = TRUE; } - New(54, k, HEK_BASESIZE + len + 1, char); + New(54, k, HEK_BASESIZE + len + 2, char); hek = (HEK*)k; Copy(str, HEK_KEY(hek), len, char); + HEK_KEY(hek)[len] = 0; HEK_LEN(hek) = len; HEK_HASH(hek) = hash; HEK_UTF8(hek) = (char)is_utf8; diff --git a/hv.h b/hv.h index 688663a..369bf3c 100644 --- a/hv.h +++ b/hv.h @@ -23,6 +23,8 @@ struct hek { U32 hek_hash; /* hash of key */ I32 hek_len; /* length of hash key */ char hek_key[1]; /* variable-length hash key */ + /* the hash-key is \0-terminated */ + /* after the \0 there is a byte telling whether the key is UTF8 */ }; /* hash structure: */ @@ -211,7 +213,7 @@ C. 
#define HEK_HASH(hek) (hek)->hek_hash #define HEK_LEN(hek) (hek)->hek_len #define HEK_KEY(hek) (hek)->hek_key -#define HEK_UTF8(hek) (*(HEK_KEY(hek)+HEK_LEN(hek))) +#define HEK_UTF8(hek) (*(HEK_KEY(hek)+HEK_LEN(hek)+1)) /* calculate HV array allocation */ #if defined(STRANGE_MALLOC) || defined(MYMALLOC)