Finish up (ha!) the Unicode case folding;
Jarkko Hietaniemi [Sat, 5 Jan 2002 22:09:20 +0000 (22:09 +0000)]
enhance regex dumping code.

p4raw-id: //depot/perl@14096

12 files changed:
MANIFEST
embed.fnc
embed.h
lib/unifold.t [new file with mode: 0644]
pp_hot.c
proto.h
regcomp.c
regcomp.h
regexec.c
t/op/pat.t
utf8.c
utf8.h

index 401b91a..deaa26a 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -1698,6 +1698,7 @@ lib/unicore/UCD.html              Unicode character database
 lib/unicore/Unicode.html       Unicode character database
 lib/unicore/Unicode.txt                Unicode character database
 lib/unicore/version            The version of the Unicode
+lib/unifold.t                  See if Unicode folding works
 lib/UNIVERSAL.pm               Base class for ALL classes
 lib/User/grent.pm              By-name interface to Perl's builtin getgr*
 lib/User/grent.t               See if User::grwent works
index da7e2ce..e534f52 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -584,7 +584,7 @@ Ap  |void   |push_scope
 p      |OP*    |ref            |OP* o|I32 type
 p      |OP*    |refkids        |OP* o|I32 type
 Ap     |void   |regdump        |regexp* r
-Ap     |SV*    |regclass_swash |struct regnode *n|bool doinit|SV **initsvp
+Ap     |SV*    |regclass_swash |struct regnode *n|bool doinit|SV **listsvp|SV **altsvp
 Ap     |I32    |pregexec       |regexp* prog|char* stringarg \
                                |char* strend|char* strbeg|I32 minend \
                                |SV* screamer|U32 nosave
@@ -1134,6 +1134,7 @@ s |I32    |regrepeat      |regnode *p|I32 max
 s      |I32    |regrepeat_hard |regnode *p|I32 max|I32 *lp
 s      |I32    |regtry         |regexp *prog|char *startpos
 s      |bool   |reginclass     |regnode *n|U8 *p|bool do_utf8sv_is_utf8
+s      |bool   |reginclasslen  |regnode *n|U8 *p|STRLEN *lenp|bool do_utf8sv_is_utf8
 s      |CHECKPOINT|regcppush   |I32 parenfloor
 s      |char*|regcppop
 s      |char*|regcp_set_to     |I32 ss
diff --git a/embed.h b/embed.h
index 8a5cc4e..6203634 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define regrepeat_hard         S_regrepeat_hard
 #define regtry                 S_regtry
 #define reginclass             S_reginclass
+#define reginclasslen          S_reginclasslen
 #define regcppush              S_regcppush
 #define regcppop               S_regcppop
 #define regcp_set_to           S_regcp_set_to
 #define ref(a,b)               Perl_ref(aTHX_ a,b)
 #define refkids(a,b)           Perl_refkids(aTHX_ a,b)
 #define regdump(a)             Perl_regdump(aTHX_ a)
-#define regclass_swash(a,b,c)  Perl_regclass_swash(aTHX_ a,b,c)
+#define regclass_swash(a,b,c,d)        Perl_regclass_swash(aTHX_ a,b,c,d)
 #define pregexec(a,b,c,d,e,f,g)        Perl_pregexec(aTHX_ a,b,c,d,e,f,g)
 #define pregfree(a)            Perl_pregfree(aTHX_ a)
 #define pregcomp(a,b,c)                Perl_pregcomp(aTHX_ a,b,c)
 #define regrepeat_hard(a,b,c)  S_regrepeat_hard(aTHX_ a,b,c)
 #define regtry(a,b)            S_regtry(aTHX_ a,b)
 #define reginclass(a,b,c)      S_reginclass(aTHX_ a,b,c)
+#define reginclasslen(a,b,c,d) S_reginclasslen(aTHX_ a,b,c,d)
 #define regcppush(a)           S_regcppush(aTHX_ a)
 #define regcppop()             S_regcppop(aTHX)
 #define regcp_set_to(a)                S_regcp_set_to(aTHX_ a)
diff --git a/lib/unifold.t b/lib/unifold.t
new file mode 100644 (file)
index 0000000..d4e819e
--- /dev/null
@@ -0,0 +1,45 @@
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+}
+
+use File::Spec;
+
+my $CF = File::Spec->catfile(File::Spec->catdir(File::Spec->updir,
+                                              "lib", "unicore"),
+                           "CaseFold.txt");
+
+if (open(CF, $CF)) {
+    my @CF;
+
+    while (<CF>) {
+        if (/^([0-9A-F]+); ([CFSI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/) {
+            next if $2 eq 'S'; # we are going for 'F'ull case folding
+           push @CF, [$1, $2, $3, $4];
+       }
+    }
+
+    die qq[$0: failed to find casefoldings from "$CF"\n] unless @CF;
+
+    print "1..", scalar @CF, "\n";
+
+    my $i = 0;
+    for my $cf (@CF) {
+       my ($code, $status, $mapping, $name) = @$cf;
+       $i++;
+       my $a = pack("U0U*", hex $code);
+       my $b = pack("U0U*", map { hex } split " ", $mapping);
+       my $t0 = ":$a:" =~ /:$a:/   ?  1 : 0;
+       my $t1 = ":$a:" =~ /:$a:/i  ?  1 : 0;
+       my $t2 = ":$a:" =~ /:[$a]:/i ? 1 : 0;
+       my $t3 = ":$a:" =~ /:$b:/i   ? 1 : 0;
+       my $t4 = ":$a:" =~ /:[$b]:/i ? 1 : 0;
+       my $t5 = ":$b:" =~ /:$a:/i   ? 1 : 0;
+       my $t6 = ":$b:" =~ /:[$a]:/i ? 1 : 0;
+       print $t0 && $t1 && $t2 && $t3 && $t4 && $t5 && $t6 ?
+           "ok $i \# - $code - $name - $mapping - - $status\n" :
+           "not ok $i \# - $code - $name - $mapping - $t0 $t1 $t2 $t3 $t4 $t5 $t6 - $status\n";
+    }
+} else {
+    die qq[$0: failed to open "$CF": $!\n];
+}
index 29ec96b..df52bb1 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1235,7 +1235,10 @@ PP(pp_match)
        pm = PL_curpm;
        rx = PM_GETRE(pm);
     }
-    if (rx->minlen > len) goto failure;
+    if (rx->minlen > len &&
+       !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */
+       )
+      goto failure;
 
     truebase = t = s;
 
diff --git a/proto.h b/proto.h
index 52d634e..ea837ec 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -619,7 +619,7 @@ PERL_CALLCONV void  Perl_push_scope(pTHX);
 PERL_CALLCONV OP*      Perl_ref(pTHX_ OP* o, I32 type);
 PERL_CALLCONV OP*      Perl_refkids(pTHX_ OP* o, I32 type);
 PERL_CALLCONV void     Perl_regdump(pTHX_ regexp* r);
-PERL_CALLCONV SV*      Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **initsvp);
+PERL_CALLCONV SV*      Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **listsvp, SV **altsvp);
 PERL_CALLCONV I32      Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave);
 PERL_CALLCONV void     Perl_pregfree(pTHX_ struct regexp* r);
 PERL_CALLCONV regexp*  Perl_pregcomp(pTHX_ char* exp, char* xend, PMOP* pm);
@@ -1164,6 +1164,7 @@ STATIC I32        S_regrepeat(pTHX_ regnode *p, I32 max);
 STATIC I32     S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp);
 STATIC I32     S_regtry(pTHX_ regexp *prog, char *startpos);
 STATIC bool    S_reginclass(pTHX_ regnode *n, U8 *p, bool do_utf8sv_is_utf8);
+STATIC bool    S_reginclasslen(pTHX_ regnode *n, U8 *p, STRLEN *lenp, bool do_utf8sv_is_utf8);
 STATIC CHECKPOINT      S_regcppush(pTHX_ I32 parenfloor);
 STATIC char*   S_regcppop(pTHX);
 STATIC char*   S_regcp_set_to(pTHX_ I32 ss);
index aacae22..d7ae068 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3427,7 +3427,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
     SV *listsv = Nullsv;
     register char *e;
     UV n;
-    bool optimize_invert = TRUE;
+    bool optimize_invert   = TRUE;
+    AV* unicode_alternate  = 0;
 
     ret = reganode(pRExC_state, ANYOF, 0);
 
@@ -4028,18 +4029,38 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                         /* If folding and foldable and a single
                          * character, insert also the folded version
                          * to the charclass. */
-                        if (f != value && foldlen == UNISKIP(f))
-                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+                        if (f != value) {
+                             if (foldlen == UNISKIP(f))
+                                 Perl_sv_catpvf(aTHX_ listsv,
+                                                "%04"UVxf"\n", f);
+                             else {
+                                 /* Any multicharacter foldings
+                                  * require the following transform:
+                                  * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
+                                  * where E folds into "pq" and F folds
+                                  * into "rst", all other characters
+                                  * fold to single characters.  We save
+                                  * away these multicharacter foldings,
+                                  * to be later saved as part of the
+                                  * additional "s" data. */
+                                 SV *sv;
+
+                                 if (!unicode_alternate)
+                                     unicode_alternate = newAV();
+                                 sv = newSVpvn((char*)foldbuf, foldlen);
+                                 SvUTF8_on(sv);
+                                 av_push(unicode_alternate, sv);
+                             }
+                        }
 
                         /* If folding and the value is one of the Greek
                          * sigmas insert a few more sigmas to make the
                          * folding rules of the sigmas to work right.
                          * Note that not all the possible combinations
                          * are handled here: some of them are handled
-                         * handled by the standard folding rules, and
-                         * some of them (literal or EXACTF cases) are
-                         * handled during runtime in
-                         * regexec.c:S_find_byclass(). */
+                         * by the standard folding rules, and some of
+                         * them (literal or EXACTF cases) are handled
+                         * during runtime in regexec.c:S_find_byclass(). */
                         if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
                              Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                             (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
@@ -4094,8 +4115,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        AV *av = newAV();
        SV *rv;
 
+       /* The 0th element stores the character class description
+        * in its textual form: used later (regexec.c:Perl_regclass_swatch())
+        * to initialize the appropriate swash (which gets stored in
+        * the 1st element), and also useful for dumping the regnode.
+        * The 2nd element stores the multicharacter foldings,
+        * used later (regexec.c:s_reginclasslen()). */
        av_store(av, 0, listsv);
        av_store(av, 1, NULL);
+       av_store(av, 2, (SV*)unicode_alternate);
        rv = newRV_noinc((SV*)av);
        n = add_data(pRExC_state, 1, "s");
        RExC_rx->data->data[n] = (void*)rv;
@@ -4625,7 +4653,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 
        {
            SV *lv;
-           SV *sw = regclass_swash(o, FALSE, &lv);
+           SV *sw = regclass_swash(o, FALSE, &lv, 0);
        
            if (lv) {
                if (sw) {
@@ -4714,16 +4742,26 @@ Perl_re_intuit_string(pTHX_ regexp *prog)
 void
 Perl_pregfree(pTHX_ struct regexp *r)
 {
-    DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifdef DEBUGGING
+    SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+#endif
 
     if (!r || (--r->refcnt > 0))
        return;
-    DEBUG_r(PerlIO_printf(Perl_debug_log,
-                     "%sFreeing REx:%s `%s%.60s%s%s'\n",
-                     PL_colors[4],PL_colors[5],PL_colors[0],
-                     r->precomp,
-                     PL_colors[1],
-                     (strlen(r->precomp) > 60 ? "..." : "")));
+    DEBUG_r({
+         bool utf8 = r->reganch & ROPT_UTF8;
+         char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
+                                 UNI_DISPLAY_ISPRINT);
+        int len = SvCUR(dsv);
+        if (!PL_colorset)
+             reginitcolors();
+        PerlIO_printf(Perl_debug_log,
+                      "%sFreeing REx:%s `%s%*.*s%s%s'\n",
+                      PL_colors[4],PL_colors[5],PL_colors[0],
+                      len, len, s,
+                      PL_colors[1],
+                      len > 60 ? "..." : "");
+    });
 
     if (r->precomp)
        Safefree(r->precomp);
@@ -4779,7 +4817,7 @@ Perl_pregfree(pTHX_ struct regexp *r)
                new_comppad = NULL;
                break;
            case 'n':
-               break;
+               break;
            default:
                Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
            }
index 16cf957..9053242 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -365,7 +365,9 @@ typedef struct re_scream_pos_data_s
  *   n - Root of op tree for (?{EVAL}) item
  *   o - Start op for (?{EVAL}) item
  *   p - Pad for (?{EVAL} item
- *   s - swash for unicode-style character class
+ *   s - swash for unicode-style character class, and the multicharacter
+ *       strings resulting from casefolding the single-character entries
+ *       in the character class
  * 20010712 mjd@plover.com
  * (Remember to update re_dup() and pregfree() if you add any items.)
  */
index fe9ad4b..ee8f602 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1535,7 +1535,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     SV* oreplsv = GvSV(PL_replgv);
     bool do_utf8 = DO_UTF8(sv);
 #ifdef DEBUGGING
-    SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+    SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
+    SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
 #endif
 
     PL_regcc = 0;
@@ -1552,7 +1553,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     }
 
     minlen = prog->minlen;
-    if (strend - startpos < minlen) {
+    if (strend - startpos < minlen &&
+       !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */
+       ) {
         DEBUG_r(PerlIO_printf(Perl_debug_log,
                              "String too short [regexec_flags]...\n"));
        goto phooey;
@@ -1621,20 +1624,26 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     }
 
     DEBUG_r({
-        char *s   = do_utf8 ? sv_uni_display(dsv, sv, 60, 0) : startpos;
-        int   len = do_utf8 ? strlen(s) : strend - startpos;
+        char *s0   = UTF ?
+          pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
+                         UNI_DISPLAY_ISPRINT) :
+          prog->precomp;
+        int   len0 = UTF ? SvCUR(dsv0) : prog->prelen;
+        char *s1   = do_utf8 ? sv_uni_display(dsv1, sv, 60,
+                                              UNI_DISPLAY_ISPRINT) : startpos;
+        int   len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
         if (!PL_colorset)
             reginitcolors();
         PerlIO_printf(Perl_debug_log,
-                      "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
+                      "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
                       PL_colors[4],PL_colors[5],PL_colors[0],
-                      prog->precomp,
+                      len0, len0, s0,
                       PL_colors[1],
-                      (strlen(prog->precomp) > 60 ? "..." : ""),
+                      len0 > 60 ? "..." : "",
                       PL_colors[0],
-                      (int)(len > 60 ? 60 : len),
-                      s, PL_colors[1],
-                      (len > 60 ? "..." : "")
+                      (int)(len1 > 60 ? 60 : len1),
+                      s1, PL_colors[1],
+                      (len1 > 60 ? "..." : "")
              );
     });
 
@@ -1805,8 +1814,24 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
            strend = HOPc(strend, -(minlen - 1));
        DEBUG_r({
            SV *prop = sv_newmortal();
+           char *s0;
+           char *s1;
+           int len0;
+           int len1;
+
            regprop(prop, c);
-           PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), UTF ? sv_uni_display(dsv, sv, 60, 0) : s);
+           s0 = UTF ?
+             pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
+                            UNI_DISPLAY_ISPRINT) :
+             SvPVX(prop);
+           len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
+           s1 = UTF ?
+             sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_ISPRINT) : s;
+           len1 = UTF ? SvCUR(dsv1) : strend - s;
+           PerlIO_printf(Perl_debug_log,
+                         "Matching stclass `%*.*s' against `%*.*s'\n",
+                         len0, len0, s0,
+                         len1, len1, s1);
        });
        if (find_byclass(prog, c, s, strend, startpos, 0))
            goto got_it;
@@ -2369,11 +2394,13 @@ S_regmatch(pTHX_ regnode *prog)
            break;
        case ANYOF:
            if (do_utf8) {
-               if (!reginclass(scan, (U8*)locinput, do_utf8))
+               STRLEN inclasslen = PL_regeol - locinput;
+
+               if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
                    sayNO;
                if (locinput >= PL_regeol)
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += inclasslen;
                nextchr = UCHARAT(locinput);
            }
            else {
@@ -4107,10 +4134,11 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
 */
 
 SV *
-Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
+Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
 {
-    SV *sw = NULL;
-    SV *si = NULL;
+    SV *sw  = NULL;
+    SV *si  = NULL;
+    SV *alt = NULL;
 
     if (PL_regdata && PL_regdata->count) {
        U32 n = ARG(node);
@@ -4118,10 +4146,14 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
        if (PL_regdata->what[n] == 's') {
            SV *rv = (SV*)PL_regdata->data[n];
            AV *av = (AV*)SvRV((SV*)rv);
-           SV **a;
+           SV **a, **b;
        
-           si = *av_fetch(av, 0, FALSE);
-           a  =  av_fetch(av, 1, FALSE);
+           /* See the end of regcomp.c:S_reglass() for
+            * documentation of these array elements. */
+
+           si  = *av_fetch(av, 0, FALSE);
+           a   =  av_fetch(av, 1, FALSE);
+           b   =  av_fetch(av, 2, FALSE);
        
            if (a)
                sw = *a;
@@ -4129,11 +4161,15 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
                sw = swash_init("utf8", "", si, 1, 0);
                (void)av_store(av, 1, sw);
            }
+           if (b)
+               alt = *b;
        }
     }
        
-    if (initsvp)
-       *initsvp = si;
+    if (listsvp)
+       *listsvp = si;
+    if (altsvp)
+       *altsvp  = alt;
 
     return sw;
 }
@@ -4143,16 +4179,20 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
  */
 
 STATIC bool
-S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
+S_reginclasslen(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
 {
     char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c;
     STRLEN len = 0;
+    STRLEN plen;
 
     c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
 
+    plen = lenp ? *lenp : UNISKIP(c);
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
+        if (lenp)
+           *lenp = 0;
        if (do_utf8 && !ANYOF_RUNTIME(n)) {
            if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
                match = TRUE;
@@ -4160,24 +4200,46 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
        if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
            match = TRUE;
        if (!match) {
-           SV *sw = regclass_swash(n, TRUE, 0);
+           AV *av;
+           SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
        
            if (sw) {
                if (swash_fetch(sw, p, do_utf8))
                    match = TRUE;
                else if (flags & ANYOF_FOLD) {
-                   U8 foldbuf[UTF8_MAXLEN_FOLD+1];
-                   STRLEN foldlen;
-
-                   to_utf8_fold(p, foldbuf, &foldlen);
-                   if (swash_fetch(sw, foldbuf, do_utf8))
-                       match = TRUE;
-                   to_utf8_upper(p, foldbuf, &foldlen);
-                   if (swash_fetch(sw, foldbuf, do_utf8))
-                       match = TRUE;
+                   U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+                   STRLEN tmplen;
+
+                   if (!match && lenp && av) {
+                       I32 i;
+                     
+                       for (i = 0; i <= av_len(av); i++) {
+                           SV* sv = *av_fetch(av, i, FALSE);
+                           STRLEN len;
+                           char *s = SvPV(sv, len);
+                       
+                           if (len <= plen && memEQ(s, p, len)) {
+                               *lenp = len;
+                               match = TRUE;
+                               break;
+                           }
+                       }
+                   }
+                   if (!match) {
+                       to_utf8_fold(p, tmpbuf, &tmplen);
+                       if (swash_fetch(sw, tmpbuf, do_utf8))
+                           match = TRUE;
+                   }
+                   if (!match) {
+                       to_utf8_upper(p, tmpbuf, &tmplen);
+                       if (swash_fetch(sw, tmpbuf, do_utf8))
+                           match = TRUE;
+                   }
                }
            }
        }
+       if (match && lenp && *lenp == 0)
+           *lenp = UNISKIP(c);
     }
     if (!match && c < 256) {
        if (ANYOF_BITMAP_TEST(n, c))
@@ -4238,6 +4300,12 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
     return (flags & ANYOF_INVERT) ? !match : match;
 }
 
+STATIC bool
+S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
+{
+    return S_reginclasslen(aTHX_ n, p, 0, do_utf8);
+}
+
 STATIC U8 *
 S_reghop(pTHX_ U8 *s, I32 off)
 {
index bc2ed37..a504186 100755 (executable)
@@ -2602,7 +2602,8 @@ print "# some Unicode properties\n";
     print "SS" =~
        /\N{LATIN SMALL LETTER SHARP S}/i   ? "ok 840\n" : "not ok 840\n";
 
-# Fix coming up.
+# These are a bit tricky.  Since the LATIN SMALL LETTER SHARP S is U+00DF,
+# the ANYOF reduces to a byte.  The Unicodeness needs to be caught earlier.
 #    print "ss" =~
 #      /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n";
 #
diff --git a/utf8.c b/utf8.c
index 93c1128..0a25c03 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1663,7 +1663,8 @@ Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 Build to the scalar dsv a displayable version of the string spv,
 length len, the displayable version being at most pvlim bytes long
 (if longer, the rest is truncated and "..." will be appended).
-The flags argument is currently unused but available for future extensions.
+The flags argument can have UNI_DISPLAY_ISPRINT set to display
+isprint() characters as themselves.
 The pointer to the PV of the dsv is returned.
 
 =cut */
@@ -1681,7 +1682,10 @@ Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
              break;
         }
         u = utf8_to_uvchr((U8*)s, 0);
-        Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
+        if ((flags & UNI_DISPLAY_ISPRINT) && u < 256 && isprint(u))
+            Perl_sv_catpvf(aTHX_ dsv, "%c", u);
+        else
+            Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
     }
     if (truncated)
         sv_catpvn(dsv, "...", 3);
diff --git a/utf8.h b/utf8.h
index d907d26..96f1b74 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -193,3 +193,5 @@ END_EXTERN_C
 #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
 #define UNICODE_GREEK_SMALL_LETTER_SIGMA       0x03C3
 
+#define UNI_DISPLAY_ISPRINT    0x0001
+