Fix for "a\x{100}" =~ /A/i.
Jarkko Hietaniemi [Wed, 28 Nov 2001 03:23:04 +0000 (03:23 +0000)]
p4raw-id: //depot/perl@13332

embed.h
embed.pl
global.sym
proto.h
regexec.c
t/op/pat.t
utf8.c

diff --git a/embed.h b/embed.h
index 269a117..0674b75 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define hv_undef               Perl_hv_undef
 #define ibcmp                  Perl_ibcmp
 #define ibcmp_locale           Perl_ibcmp_locale
+#define ibcmp_utf8             Perl_ibcmp_utf8
 #define ingroup                        Perl_ingroup
 #define init_argv_symbols      Perl_init_argv_symbols
 #define init_debugger          Perl_init_debugger
 #define hv_undef(a)            Perl_hv_undef(aTHX_ a)
 #define ibcmp(a,b,c)           Perl_ibcmp(aTHX_ a,b,c)
 #define ibcmp_locale(a,b,c)    Perl_ibcmp_locale(aTHX_ a,b,c)
+#define ibcmp_utf8(a,b,c,d,e)  Perl_ibcmp_utf8(aTHX_ a,b,c,d,e)
 #define ingroup(a,b)           Perl_ingroup(aTHX_ a,b)
 #define init_argv_symbols(a,b) Perl_init_argv_symbols(aTHX_ a,b)
 #define init_debugger()                Perl_init_debugger(aTHX)
index f2603ef..02327d9 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1333,6 +1333,7 @@ Apd       |HE*    |hv_store_ent   |HV* tb|SV* key|SV* val|U32 hash
 Apd    |void   |hv_undef       |HV* tb
 Ap     |I32    |ibcmp          |const char* a|const char* b|I32 len
 Ap     |I32    |ibcmp_locale   |const char* a|const char* b|I32 len
+Ap     |I32    |ibcmp_utf8     |const char* a|bool ua|const char* b|bool ub|I32 len
 p      |bool   |ingroup        |Gid_t testgid|Uid_t effective
 p      |void   |init_argv_symbols|int|char **
 p      |void   |init_debugger
index 86e1491..802dd25 100644 (file)
@@ -153,6 +153,7 @@ Perl_hv_store_ent
 Perl_hv_undef
 Perl_ibcmp
 Perl_ibcmp_locale
+Perl_ibcmp_utf8
 Perl_init_stacks
 Perl_init_tm
 Perl_instr
diff --git a/proto.h b/proto.h
index a042c5a..1073831 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -315,6 +315,7 @@ PERL_CALLCONV HE*   Perl_hv_store_ent(pTHX_ HV* tb, SV* key, SV* val, U32 hash);
 PERL_CALLCONV void     Perl_hv_undef(pTHX_ HV* tb);
 PERL_CALLCONV I32      Perl_ibcmp(pTHX_ const char* a, const char* b, I32 len);
 PERL_CALLCONV I32      Perl_ibcmp_locale(pTHX_ const char* a, const char* b, I32 len);
+PERL_CALLCONV I32      Perl_ibcmp_utf8(pTHX_ const char* a, bool ua, const char* b, bool ub, I32 len);
 PERL_CALLCONV bool     Perl_ingroup(pTHX_ Gid_t testgid, Uid_t effective);
 PERL_CALLCONV void     Perl_init_argv_symbols(pTHX_ int, char **);
 PERL_CALLCONV void     Perl_init_debugger(pTHX);
index efdd8df..a8acb06 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -962,14 +962,17 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                if (c1 == c2)
                    while (s <= e) {
                        if ( utf8_to_uvchr((U8*)s, &len) == c1
-                            && regtry(prog, s) )
+                            && (ln == 1 ||
+                                ibcmp_utf8(s, do_utf8, m, UTF, ln)) )
                            goto got_it;
                        s += len;
                    }
                else
                    while (s <= e) {
                        UV c = utf8_to_uvchr((U8*)s, &len);
-                       if ( (c == c1 || c == c2) && regtry(prog, s) )
+                       if ( (c == c1 || c == c2)
+                            && (ln == 1 ||
+                                ibcmp_utf8(s, do_utf8, m, UTF, ln)) )
                            goto got_it;
                        s += len;
                    }
index a94fcaf..6b4b061 100755 (executable)
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..753\n";
+print "1..757\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2248,6 +2248,8 @@ print "# some Unicode properties\n";
 }
 
 {
+    # Script=, Block=, Category=
+
     print "not " unless "\x{0100}" =~ /\p{Script=Latin}/;
     print "ok 748\n";
 
@@ -2259,6 +2261,8 @@ print "# some Unicode properties\n";
 }
 
 {
+    print "# the basic character classes and Unicode \n";
+
     # 0100;LATIN CAPITAL LETTER A WITH MACRON;Lu;0;L;0041 0304;;;;N;LATIN CAPITAL LETTER A MACRON;;;0101;
     print "not " unless "\x{0100}" =~ /\w/;
     print "ok 751\n";
@@ -2271,3 +2275,19 @@ print "# some Unicode properties\n";
     print "not " unless "\x{1680}" =~ /\s/;
     print "ok 753\n";
 }
+
+{
+    print "# folding matches and Unicode\n";
+
+    print "not " unless "a\x{100}" =~ /A/i;
+    print "ok 754\n";
+
+    print "not " unless "A\x{100}" =~ /A/i;
+    print "ok 755\n";
+
+    print "not " unless "a\x{100}" =~ /a/i;
+    print "ok 756\n";
+
+    print "not " unless "A\x{100}" =~ /A/i;
+    print "ok 757\n";
+}
diff --git a/utf8.c b/utf8.c
index f900724..d7b0784 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1543,6 +1543,51 @@ Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
 char *
 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
 {
-  return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
-                            pvlim, flags);
+     return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
+                               pvlim, flags);
+}
+
+I32
+Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len)
+{
+     register U8 *a = (U8*)s1;
+     register U8 *b = (U8*)s2;
+     STRLEN la, lb;
+     UV ca, cb;
+     STRLEN ulen1, ulen2;
+     U8 tmpbuf1[UTF8_MAXLEN*3+1];
+     U8 tmpbuf2[UTF8_MAXLEN*3+1];
+
+     while (len) {
+         if (u1)
+              ca = utf8_to_uvchr((U8*)a, &la);
+         else {
+              ca = *a;
+              la = 1;
+         }
+         if (u2)
+              cb = utf8_to_uvchr((U8*)b, &lb);
+         else {
+              cb = *b;
+              lb = 1;
+         }
+         if (ca != cb) {
+              if (u1)
+                   to_uni_lower(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
+              else
+                   ulen1 = 1;
+              if (u2)
+                   to_uni_lower(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
+              else
+                   ulen2 = 1;
+              if (ulen1 != ulen2
+                  || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
+                  || memNE(tmpbuf1, tmpbuf2, ulen1))
+                   return 1;
+         }
+         a += la;
+         b += lb;
+    }
+    return 0;
 }
+