From: Jarkko Hietaniemi Date: Wed, 2 Jan 2002 15:12:57 +0000 (+0000) Subject: One more iteration of the ibcmp_utf8() interface, X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d07ddd77a31b1e57c2f358652e4f3f85d2e29ad4;p=p5sagit%2Fp5-mst-13.2.git One more iteration of the ibcmp_utf8() interface, hopefully this is a convergent iteration... p4raw-id: //depot/perl@14014 --- diff --git a/embed.pl b/embed.pl index 929b014..3a72d20 100755 --- a/embed.pl +++ b/embed.pl @@ -1333,7 +1333,7 @@ Apd |HE* |hv_store_ent |HV* tb|SV* key|SV* val|U32 hash Apd |void |hv_undef |HV* tb Ap |I32 |ibcmp |const char* a|const char* b|I32 len Ap |I32 |ibcmp_locale |const char* a|const char* b|I32 len -Apd |I32 |ibcmp_utf8 |const char* a|I32 len1|bool u1|char **ae|const char* b|I32 len2|bool u2|char **be +Apd |I32 |ibcmp_utf8 |const char* a|char **pe1|UV l1|bool u1|const char* b|char **pe2|UV l2|bool u2 p |bool |ingroup |Gid_t testgid|Uid_t effective p |void |init_argv_symbols|int|char ** p |void |init_debugger diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 44669f5..915e40c 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -1133,18 +1133,25 @@ Found in file hv.c Return true if the strings s1 and s2 differ case-insensitively, false if not (if they are equal case-insensitively). If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, -the string s2 is assumed to be in UTF-8-encoded Unicode. +the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2 +are false, the respective string is assumed to be in native 8-bit +encoding. + +If the pe1 and pe2 are non-NULL, the scanning pointers will be copied +in there (they will point at the beginning of the I character). +If the pointers behind pe1 or pe2 are non-NULL, they are the end +pointers beyond which scanning will not continue under any +circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and +s2+l2 will be used as goal end pointers that will also stop the scan, +and which qualify towards defining a successful match: all the scans +that define an explicit length must reach their goal pointers for +a match to succeed). For case-insensitiveness, the "casefolding" of Unicode is used instead of upper/lowercasing both the characters, see http://www.unicode.org/unicode/reports/tr21/ (Case Mappings). -If either length is (STRLEN)-1 the scan will continue until a match is -found. If both lengths are (STRLEN)-1, true is returned (as a sign of -non-match). In the case of a match, the f1 and f2 are updated to record -how far the comparison proceeded. - - I32 ibcmp_utf8(const char* a, I32 len1, bool u1, char **ae, const char* b, I32 len2, bool u2, char **be) + I32 ibcmp_utf8(const char* a, char **pe1, UV l1, bool u1, const char* b, char **pe2, UV l2, bool u2) =for hackers Found in file utf8.c diff --git a/proto.h b/proto.h index f95f047..9196ddd 100644 --- a/proto.h +++ b/proto.h @@ -315,7 +315,7 @@ PERL_CALLCONV HE* Perl_hv_store_ent(pTHX_ HV* tb, SV* key, SV* val, U32 hash); PERL_CALLCONV void Perl_hv_undef(pTHX_ HV* tb); PERL_CALLCONV I32 Perl_ibcmp(pTHX_ const char* a, const char* b, I32 len); PERL_CALLCONV I32 Perl_ibcmp_locale(pTHX_ const char* a, const char* b, I32 len); -PERL_CALLCONV I32 Perl_ibcmp_utf8(pTHX_ const char* a, I32 len1, bool u1, char **ae, const char* b, I32 len2, bool u2, char **be); +PERL_CALLCONV I32 Perl_ibcmp_utf8(pTHX_ const char* a, char **pe1, UV l1, bool u1, const char* b, char **pe2, UV l2, bool u2); PERL_CALLCONV bool Perl_ingroup(pTHX_ Gid_t testgid, Uid_t effective); PERL_CALLCONV void Perl_init_argv_symbols(pTHX_ int, char **); PERL_CALLCONV void Perl_init_debugger(pTHX); diff --git a/regexec.c b/regexec.c index c189b14..51b55f6 100644 --- a/regexec.c +++ b/regexec.c @@ -980,14 +980,16 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta U8 tmpbuf [UTF8_MAXLEN+1]; U8 foldbuf[UTF8_MAXLEN_FOLD+1]; STRLEN len, foldlen; + char* se; if (c1 == c2) { while (s <= e) { c = utf8_to_uvchr((U8*)s, &len); if ( c == c1 && (ln == len || - !ibcmp_utf8(s, (STRLEN)-1, do_utf8, 0, - m, ln, UTF, 0)) + ((se = e + 1) && + !ibcmp_utf8(s, &se, 0, do_utf8, + m, 0 , ln, UTF))) && (norun || regtry(prog, s)) ) goto got_it; else { @@ -997,8 +999,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta && (f == c1 || f == c2) && (ln == foldlen || !ibcmp_utf8((char *)foldbuf, - (STRLEN)-1, do_utf8, 0, - m, ln, UTF, 0)) + 0, foldlen, do_utf8, + m, + 0, ln, UTF)) && (norun || regtry(prog, s)) ) goto got_it; } @@ -1022,8 +1025,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if ( (c == c1 || c == c2) && (ln == len || - !ibcmp_utf8(s, (STRLEN)-1, do_utf8, 0, - m, ln, UTF, 0)) + ((se = e + 1) && + !ibcmp_utf8(s, &se, 0, do_utf8, + m, 0, ln, UTF))) && (norun || regtry(prog, s)) ) goto got_it; else { @@ -1033,8 +1037,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta && (f == c1 || f == c2) && (ln == foldlen || !ibcmp_utf8((char *)foldbuf, - (STRLEN)-1, do_utf8, 0, - m, ln, UTF, 0)) + 0, foldlen, do_utf8, + m, + 0, ln, UTF)) && (norun || regtry(prog, s)) ) goto got_it; } @@ -2336,20 +2341,17 @@ S_regmatch(pTHX_ regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - { + if (do_utf8 || UTF) { + /* Either target or the pattern are utf8. */ char *l = locinput; - char *e = s + ln; - - if (do_utf8 || UTF) { - /* Either target or the pattern are utf8. */ + char *e = PL_regeol; - if (ibcmp_utf8(s, e - s, TRUE, 0, - l, (STRLEN)-1, TRUE, &l)) - sayNO; - locinput = l; - nextchr = UCHARAT(locinput); - break; - } + if (ibcmp_utf8(s, 0, ln, do_utf8, + l, &e, 0, UTF)) + sayNO; + locinput = e; + nextchr = UCHARAT(locinput); + break; } /* Neither the target and the pattern are utf8. */ diff --git a/t/op/pat.t b/t/op/pat.t index b797bdf..5cdb2e5 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..828\n"; +print "1..834\n"; BEGIN { chdir 't' if -d 't'; @@ -2380,11 +2380,7 @@ print "# some Unicode properties\n"; print "# GREEK CAPITAL LETTER SIGMA vs COMBINING GREEK PERISPOMENI\n"; my $SIGMA = "\N{GREEK CAPITAL LETTER SIGMA}"; - - my $hSIGMA = sprintf "%04x", ord $SIGMA; - - my $char = "\N{COMBINING GREEK PERISPOMENI}"; - my $code = sprintf "%04x", ord($char); + my $char = "\N{COMBINING GREEK PERISPOMENI}"; # Before #13843 this was failing by matching falsely. print "_:$char:_" =~ m/_:$SIGMA:_/i ? "not ok 786\n" : "ok 786\n"; @@ -2558,3 +2554,27 @@ print "# some Unicode properties\n"; } } } + +{ + print "# more SIGMAs\n"; + + my $SIGMA = "\x{03A3}"; # CAPITAL + my $Sigma = "\x{03C2}"; # SMALL FINAL + my $sigma = "\x{03C3}"; # SMALL + + my $S3 = "$SIGMA$Sigma$sigma"; + + print ":$S3:" =~ /:(($SIGMA)+):/i && $1 eq $S3 && $2 eq $sigma ? + "ok 829\n" : "not ok 829\n"; + print ":$S3:" =~ /:(($Sigma)+):/i && $1 eq $S3 && $2 eq $sigma ? + "ok 830\n" : "not ok 830\n"; + print ":$S3:" =~ /:(($sigma)+):/i && $1 eq $S3 && $2 eq $sigma ? + "ok 831\n" : "not ok 831\n"; + + print ":$S3:" =~ /:(([$SIGMA])+):/i && $1 eq $S3 && $2 eq $sigma ? + "ok 832\n" : "not ok 832\n"; + print ":$S3:" =~ /:(([$Sigma])+):/i && $1 eq $S3 && $2 eq $sigma ? + "ok 833\n" : "not ok 833\n"; + print ":$S3:" =~ /:(([$sigma])+):/i && $1 eq $S3 && $2 eq $sigma ? + "ok 834\n" : "not ok 834\n"; +} diff --git a/utf8.c b/utf8.c index a59b1ed..2b5ae42 100644 --- a/utf8.c +++ b/utf8.c @@ -1707,48 +1707,61 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) } /* -=for apidoc A|I32|ibcmp_utf8|const char *s1|register I32 len1|bool u1|char **f1|const char *s2|register I32 len2|bool u2|char **f2 +=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2 Return true if the strings s1 and s2 differ case-insensitively, false if not (if they are equal case-insensitively). If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, -the string s2 is assumed to be in UTF-8-encoded Unicode. +the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2 +are false, the respective string is assumed to be in native 8-bit +encoding. + +If the pe1 and pe2 are non-NULL, the scanning pointers will be copied +in there (they will point at the beginning of the I character). +If the pointers behind pe1 or pe2 are non-NULL, they are the end +pointers beyond which scanning will not continue under any +circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and +s2+l2 will be used as goal end pointers that will also stop the scan, +and which qualify towards defining a successful match: all the scans +that define an explicit length must reach their goal pointers for +a match to succeed). For case-insensitiveness, the "casefolding" of Unicode is used instead of upper/lowercasing both the characters, see http://www.unicode.org/unicode/reports/tr21/ (Case Mappings). -If either length is (STRLEN)-1 the scan will continue until a match is -found. If both lengths are (STRLEN)-1, true is returned (as a sign of -non-match). In the case of a match, the f1 and f2 are updated to record -how far the comparison proceeded. - =cut */ I32 -Perl_ibcmp_utf8(pTHX_ const char *s1, register I32 len1, bool u1, char **f1, const char *s2, register I32 len2, bool u2, char **f2) +Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2) { register U8 *p1 = (U8*)s1; register U8 *p2 = (U8*)s2; - register U8 *e1, *q1 = 0; - register U8 *e2, *q2 = 0; - STRLEN l1 = 0, l2 = 0; + register U8 *e1 = 0, *f1 = 0, *q1 = 0; + register U8 *e2 = 0, *f2 = 0, *q2 = 0; + STRLEN n1 = 0, n2 = 0; U8 foldbuf1[UTF8_MAXLEN_FOLD+1]; U8 foldbuf2[UTF8_MAXLEN_FOLD+1]; U8 natbuf[1+1]; STRLEN foldlen1, foldlen2; - bool inf1, inf2, match; + bool match; - inf1 = len1 == (STRLEN)-1; - inf2 = len2 == (STRLEN)-1; - if (inf1 && inf2) - return 1; /* mismatch */ - if (!inf1) - e1 = p1 + len1; - if (!inf2) - e2 = p2 + len2; - - while ((p1 < e1 || inf1) && (p2 < e2 || inf2)) { - if (l1 == 0) { + if (pe1) + e1 = *(U8**)pe1; + if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1)) + f1 = (U8*)s1 + l1; + if (pe2) + e2 = *(U8**)pe2; + if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2)) + f2 = (U8*)s2 + l2; + + if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)) + return 1; /* mismatch; possible infinite loop or false positive */ + + while ((e1 == 0 || p1 < e1) && + (f1 == 0 || p1 < f1) && + (e2 == 0 || p2 < e2) && + (f2 == 0 || p2 < f2)) { + if (n1 == 0) { if (u1) to_utf8_fold(p1, foldbuf1, &foldlen1); else { @@ -1756,41 +1769,44 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, register I32 len1, bool u1, char **f1, con to_utf8_fold(natbuf, foldbuf1, &foldlen1); } q1 = foldbuf1; - l1 = foldlen1; + n1 = foldlen1; } - if (l2 == 0) { + if (n2 == 0) { if (u2) to_utf8_fold(p2, foldbuf2, &foldlen2); else { - natbuf[0] = NATIVE_TO_UNI(*p1); + natbuf[0] = NATIVE_TO_UNI(*p2); to_utf8_fold(natbuf, foldbuf2, &foldlen2); } q2 = foldbuf2; - l2 = foldlen2; + n2 = foldlen2; } - while (l1 && l2) { - if (UTF8SKIP(q1) != UTF8SKIP(q2) || - memNE((char*)q1, (char*)q2, UTF8SKIP(q1))) + while (n1 && n2) { + if ( UTF8SKIP(q1) != UTF8SKIP(q2) || + (UTF8SKIP(q1) == 1 && *q1 != *q2) || + memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) ) return 1; /* mismatch */ - l1 -= UTF8SKIP(q1); + n1 -= UTF8SKIP(q1); q1 += UTF8SKIP(q1); - l2 -= UTF8SKIP(q2); + n2 -= UTF8SKIP(q2); q2 += UTF8SKIP(q2); } - if (l1 == 0) + if (n1 == 0) p1 += u1 ? UTF8SKIP(p1) : 1; - if (l2 == 0) + if (n2 == 0) p2 += u2 ? UTF8SKIP(p2) : 1; } - match = (inf1 ? 1 : p1 == e1) && (inf2 ? 1 : p2 == e2); + /* A match is defined by all the scans that specified + * an explicit length reaching their final goals. */ + match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2); if (match) { - if (f1) - *f1 = (char *)p1; - if (f2) - *f2 = (char *)p2; + if (pe1) + *pe1 = (char*)p1; + if (pe2) + *pe2 = (char*)p2; } return match ? 0 : 1; /* 0 match, 1 mismatch */