Integrate mainline
Nick Ing-Simmons [Sat, 30 Dec 2000 19:46:45 +0000 (19:46 +0000)]
p4raw-id: //depot/perlio@8271

22 files changed:
Changes
MANIFEST
doop.c
embed.h
embed.pl
lib/unicode/distinct.pm [new file with mode: 0644]
mg.c
op.c
op.h
patchlevel.h
perl.h
pod/perlapi.pod
pp.c
pp_ctl.c
pp_hot.c
proto.h
regcomp.c
regcomp.h
regexec.c
sv.c
t/op/split.t
utf8.c

diff --git a/Changes b/Changes
index dcb702f..b042b34 100644 (file)
--- a/Changes
+++ b/Changes
@@ -32,6 +32,392 @@ Version v5.7.1              Development release working toward v5.8
 --------------
 
 ____________________________________________________________________________
+[  8268] By: jhi                                   on 2000/12/30  17:18:40
+        Log: Add a test for Unicode split //.  The #8267 was the cure.
+     Branch: perl
+          ! t/op/split.t
+____________________________________________________________________________
+[  8267] By: jhi                                   on 2000/12/30  17:14:19
+        Log: Subject: more UTF8 test suites and an UTF8 patch
+             From: Inaba Hiroto <inaba@st.rim.or.jp>
+             Date: Sat, 30 Dec 2000 14:27:10 +0900
+             Message-ID: <3A4D722D.243AFD88@st.rim.or.jp> 
+             
+             Just the patch part for now, and the pragma renamed
+             as unicode::distinct.
+     Branch: perl
+          + lib/unicode/distinct.pm
+          ! MANIFEST doop.c embed.h embed.pl mg.c op.c op.h perl.h
+          ! pod/perlapi.pod pp.c pp_ctl.c pp_hot.c proto.h regcomp.c
+          ! regcomp.h regexec.c sv.c toke.c utf8.c
+____________________________________________________________________________
+[  8266] By: nick                                  on 2000/12/30  16:40:49
+        Log: Integrate mainline
+     Branch: perlio
+         +> lib/Pod/Text/Overstrike.pm
+         !> MANIFEST doio.c hints/dec_osf.sh hv.c lib/Pod/Man.pm
+         !> lib/Pod/Text/Color.pm lib/Pod/Text/Termcap.pm op.c
+         !> pod/pod2text.PL sv.c t/lib/syslfs.t t/op/join.t t/op/lfs.t
+         !> t/pragma/constant.t t/pragma/sub_lval.t t/pragma/utf8.t util.c
+         !> vms/vms.c vms/vmsish.h vms/vmspipe.com
+____________________________________________________________________________
+[  8265] By: jhi                                   on 2000/12/30  07:28:55
+        Log: The sv_catsv() fix, take two.
+     Branch: perl
+          ! sv.c t/op/join.t
+____________________________________________________________________________
+[  8264] By: jhi                                   on 2000/12/30  06:19:18
+        Log: Undo all the join-related changes since #8248: relevant
+             portions of 8248, 8249, 8250, 8251, 8260, 8263 must go.
+             The new sv_catsv() doesn't fly so it must go back to
+             the drawing board.
+     Branch: perl
+          ! sv.c t/op/join.t t/pragma/utf8.t
+____________________________________________________________________________
+[  8263] By: jhi                                   on 2000/12/30  01:08:32
+        Log: (Retracted by #8264)  Tweak sv_catsv() some more.
+     Branch: perl
+          ! sv.c
+____________________________________________________________________________
+[  8262] By: jhi                                   on 2000/12/30  00:45:14
+        Log: Retract #8261.
+     Branch: perl
+          ! hv.c util.c
+____________________________________________________________________________
+[  8261] By: jhi                                   on 2000/12/30  00:38:32
+        Log: (Retracted by #8261). (Unsuccessful memory access tweaks.)
+     Branch: perl
+          ! hv.c util.c
+____________________________________________________________________________
+[  8260] By: jhi                                   on 2000/12/29  22:51:33
+        Log: (Retracted by #8264)  More fixing for #8251.
+     Branch: perl
+          ! sv.c
+____________________________________________________________________________
+[  8259] By: jhi                                   on 2000/12/29  18:27:30
+        Log: Subject: podlators 1.06 released
+             From: Russ Allbery <rra@stanford.edu>
+             Date: 25 Dec 2000 05:09:30 -0800
+             Message-ID: <ylwvcosl4l.fsf@windlord.stanford.edu>
+     Branch: perl
+          + lib/Pod/Text/Overstrike.pm
+          ! MANIFEST lib/Pod/Man.pm lib/Pod/Text/Color.pm
+          ! lib/Pod/Text/Termcap.pm pod/pod2text.PL
+____________________________________________________________________________
+[  8258] By: jhi                                   on 2000/12/29  18:20:45
+        Log: Make the large file tests more robust/talkative as suggested by
+             
+             Subject: Re: [ID 20001229.001] Not OK: perl v5.7.0 +DEVEL8221 on i686-linux 2.4.0-test13pre4-ac2 -2 (UNINSTALLED)
+             From: andreas.koenig@anima.de (Andreas J. Koenig)
+             Date: 29 Dec 2000 14:23:01 +0100
+             Message-ID: <m3snn7uzt6.fsf@ak-71.mind.de>
+     Branch: perl
+          ! t/lib/syslfs.t t/op/lfs.t
+____________________________________________________________________________
+[  8257] By: jhi                                   on 2000/12/29  17:48:04
+        Log: Further VMS piping fixes from Charles Lane:
+             
+             In summary, error messages produced when a subprocess terminated
+             abnormally were being sent not just to the parent process, but to
+             grandparents, because of default values for error output that were
+             not completely overridden when the subprocess was started.
+             
+             This patch fixes this behavior by defining user-mode (i.e., temporary
+             for the duration of the program) logical names for SYS$OUTPUT and
+             SYS$ERROR when they are (re)opened inside Perl.  And a bunch of other
+             changes to make it so that the user-mode logicals are the ones that
+             control where Perl's error messages go if it terminates abnormally.
+             
+             I also added some gratuitous fixes to the indentation of braces in
+             the piping code.  It just looked ugly, before.
+     Branch: perl
+          ! doio.c vms/vms.c vms/vmsish.h vms/vmspipe.com
+____________________________________________________________________________
+[  8256] By: jhi                                   on 2000/12/29  17:45:12
+        Log: Subject: [PATCH] Idea: Declare multiple constants at once (fwd)
+             From: "Casey R. Tweten" <crt@kiski.net>
+             Date: Fri, 29 Dec 2000 12:03:00 -0500 (EST)
+             Message-ID: <Pine.OSF.4.21.0012291201150.17186-100000@home.kiski.net>
+             
+             Tests for for #8240.
+     Branch: perl
+          ! t/pragma/constant.t
+____________________________________________________________________________
+[  8255] By: jhi                                   on 2000/12/29  17:43:07
+        Log: Subject: Re: [PATCH] Interesting syntax idea
+             From: Simon Cozens <simon@cozens.net>
+             Date: Fri, 29 Dec 2000 14:34:04 +0000
+             Message-ID: <20001229143404.A7762@deep-dark-truthful-mirror.perlhacker.org>
+             
+             Tests for #8254.
+     Branch: perl
+          ! t/pragma/sub_lval.t
+____________________________________________________________________________
+[  8254] By: jhi                                   on 2000/12/29  17:42:11
+        Log: Subject: [PATCH] Interesting syntax idea
+             From: Simon Cozens <simon@cozens.net>
+             Date: Wed, 27 Dec 2000 05:08:57 +0000
+             Message-ID: <20001227050857.A11296@deep-dark-truthful-mirror.perlhacker.org>
+             
+             Make opens + bareword assigns do typeglob assigns.
+     Branch: perl
+          ! op.c
+____________________________________________________________________________
+[  8253] By: jhi                                   on 2000/12/29  17:36:45
+        Log: Output the (apparent) version of gcc, as suggested by
+             
+             Subject: [ID 20001226.001] dec_osf.sh mis-parses gcc version 2.95.2 as less than 2.95, causing installation-failure
+             From: jhbrown@ai.mit.edu (Jeremy H. Brown)
+             Date: 26 Dec 2000 04:29:17 -0500
+             Message-Id: <uv6puify1hu.fsf@suspiria.ai.mit.edu>
+             
+             (The patch as such didn't any more apply as the misparsing had
+             already been addressed in #6474.)  Also change the wording about
+             gcc 2.95.2, for Jeremy it didn't break sdbm, for me it did.
+     Branch: perl
+          ! hints/dec_osf.sh
+____________________________________________________________________________
+[  8252] By: nick                                  on 2000/12/29  12:14:31
+        Log: Integrate mainline
+     Branch: perlio
+         !> (integrate 27 files)
+____________________________________________________________________________
+[  8251] By: jhi                                   on 2000/12/29  08:45:46
+        Log: (Retracted by #8264)
+             
+             (Fixed by #8260.)
+             
+             sv_catsv() needs one more byte space than seems reasonable.
+             (for join() and PERL_DESTRUCT_LEVEL=2, built with debugging).
+             Curiouser and curiouser.
+     Branch: perl
+          ! sv.c
+____________________________________________________________________________
+[  8250] By: jhi                                   on 2000/12/29  07:57:52
+        Log: More split // UTF-8 tests.
+     Branch: perl
+          ! t/pragma/utf8.t
+____________________________________________________________________________
+[  8249] By: jhi                                   on 2000/12/29  07:54:51
+        Log: (Retracted by #8264)  More join() testing which was good because
+             it revealed a bug in #8248 (the UTF8_EIGHT_BIT_LO() was wrong).
+     Branch: perl
+          ! pp.c t/op/join.t utf8.c utf8.h
+____________________________________________________________________________
+[  8248] By: jhi                                   on 2000/12/29  07:08:32
+        Log: (Retracted by #8264)  Externally: join() was still quite UTF-8-unaware.
+             Internally: sv_catsv() wasn't quite okay on UTF-8, it assumed
+             that the only cases to care about are byte+byte and byte+character.
+             
+             TODO: See how well pp_concat() could be implemented in terms
+             of sv_catsv().
+     Branch: perl
+          ! doop.c sv.c t/op/join.t utf8.h
+____________________________________________________________________________
+[  8247] By: jhi                                   on 2000/12/29  06:35:23
+        Log: Signedness nit.
+     Branch: perl
+          ! pp_hot.c
+____________________________________________________________________________
+[  8246] By: jhi                                   on 2000/12/29  01:23:31
+        Log: Subject: Re: [ID 20001226.002] Not OK: perl v5.7.0 +DEVEL8221 on i86pc-solaris 2.8 (UNINSTALLED)
+             From: Lupe Christoph <lupe@lupe-christoph.de>
+             Date: Thu, 28 Dec 2000 23:00:00 +0100
+             Message-ID: <20001228230000.F2574@alanya.lupe-christoph.de>
+             
+             Suggest rsync --delete --dry-run.
+     Branch: perl
+          ! pod/perlhack.pod
+____________________________________________________________________________
+[  8245] By: jhi                                   on 2000/12/28  23:57:05
+        Log: The maxiters upper limit sanity check (guarding against
+             non-progress) assumed bytes instead of characters in s///
+             and split().
+     Branch: perl
+          ! pp.c pp_hot.c
+____________________________________________________________________________
+[  8244] By: jhi                                   on 2000/12/28  23:34:08
+        Log: Make some panic messages a bit more logical.
+     Branch: perl
+          ! doop.c pod/perldiag.pod pp.c pp_hot.c
+____________________________________________________________________________
+[  8243] By: jhi                                   on 2000/12/28  22:59:16
+        Log: Subject: [PATCH] lvalue AUTOLOAD. No, really.
+             From: Simon Cozens <simon@cozens.net>
+             Date: Wed, 27 Dec 2000 02:30:03 +0000
+             Message-ID: <20001227023003.A7677@deep-dark-truthful-mirror.perlhacker.org>
+     Branch: perl
+          ! pp.c t/pragma/sub_lval.t
+____________________________________________________________________________
+[  8242] By: jhi                                   on 2000/12/28  22:56:53
+        Log: Subject: [PATCH blead] Fix B::Terse indentation
+             From: Daniel Chetlin <daniel@chetlin.com>
+             Date: Wed, 27 Dec 2000 06:43:30 -0800
+             Message-ID: <20001227064329.B9573@darkstar>
+     Branch: perl
+          ! ext/B/B.pm ext/B/B/Terse.pm t/lib/b.t
+____________________________________________________________________________
+[  8241] By: jhi                                   on 2000/12/28  22:48:59
+        Log: The latter patch from the
+             
+             Subject: [PATCH: perl@8211] directory depth typo in one win32 Makefile
+             From: Peter Prymmer <pvhp@forte.com> 
+             Date: Fri, 22 Dec 2000 14:52:12 -0800 (PST)
+             Message-ID: <Pine.OSF.4.10.10012221433140.195493-100000@aspara.forte.com>
+             
+             (the former patch from the above should have been
+             taken care of by Nick I-S)
+     Branch: perl
+          ! win32/Makefile
+____________________________________________________________________________
+[  8240] By: jhi                                   on 2000/12/28  22:45:22
+        Log: Subject: Re: [PATCH] Idea: Declare multiple constants at once
+             From: "Casey R. Tweten" <crt@kiski.net>
+             Date: Fri, 22 Dec 2000 10:35:53 -0500 (EST)
+             Message-ID: <Pine.OSF.4.21.0012221032030.28992-100000@home.kiski.net>
+     Branch: perl
+          ! lib/constant.pm
+____________________________________________________________________________
+[  8239] By: jhi                                   on 2000/12/28  22:37:45
+        Log: Subject: [PATCH] Re: [ID 19991001.003] sort(sub(arg)) misparsed as sort sub args
+             From: Simon Cozens <simon@cozens.net>
+             Date: Wed, 27 Dec 2000 14:12:44 +0000
+             Message-ID: <20001227141244.A13344@deep-dark-truthful-mirror.perlhacker.org>
+     Branch: perl
+          ! t/op/method.t t/op/sort.t toke.c
+____________________________________________________________________________
+[  8238] By: jhi                                   on 2000/12/28  22:30:32
+        Log: Subject: [PATCH perl@8229] Call.pm
+             From: "Paul Marquess" <Paul.Marquess@btinternet.com>
+             Date: Mon, 25 Dec 2000 10:47:15 -0000
+             Message-ID: <000201c06e60$0b967760$a20a140a@bfs.phone.com>
+     Branch: perl
+          ! ext/Filter/Util/Call/Call.pm
+____________________________________________________________________________
+[  8237] By: jhi                                   on 2000/12/28  22:19:21
+        Log: Subject: [PATCH blead] Fix problem with `&' prototype
+             From: Daniel Chetlin <daniel@chetlin.com>
+             Date: Wed, 27 Dec 2000 15:55:32 -0800
+             Message-ID: <20001227155532.D9573@darkstar>
+     Branch: perl
+          ! op.c t/comp/proto.t
+____________________________________________________________________________
+[  8236] By: jhi                                   on 2000/12/28  22:09:25
+        Log: Subject: Re: [PATCH blead] Fix segfault in gv_handler/mg_find
+             From: Daniel Chetlin <daniel@chetlin.com>
+             Date: Sun, 24 Dec 2000 04:09:49 -0800
+             Message-ID: <20001224040949.B3090@darkstar>
+     Branch: perl
+          ! gv.c mg.c t/op/attrs.t
+____________________________________________________________________________
+[  8235] By: jhi                                   on 2000/12/28  22:07:11
+        Log: Subject: [PATCH] Win32::Spawn() didn't inherit cwd and env correctly
+             From: Jan Dubois <jand@ActiveState.com>
+             Date: Tue, 26 Dec 2000 20:57:31 -0800
+             Message-ID: <reti4ts0php3anruv0qcjru3tl850g3sfd@4ax.com>
+     Branch: perl
+          ! win32/win32.c
+____________________________________________________________________________
+[  8234] By: jhi                                   on 2000/12/28  21:52:42
+        Log: Subject: Re: [PATCH] Warn on use of reference as array elem
+             From: Simon Cozens <simon@cozens.net>
+             Date: Thu, 28 Dec 2000 20:33:13 +0000
+             Message-ID: <20001228203313.A2607@deep-dark-truthful-mirror.perlhacker.org>
+     Branch: perl
+          ! pod/perldiag.pod pp_hot.c t/pragma/warn/pp_hot
+____________________________________________________________________________
+[  8233] By: jhi                                   on 2000/12/28  19:40:49
+        Log: Integrate perlio.
+     Branch: perl
+         !> ext/Encode/Makefile.PL ext/Encode/compile
+____________________________________________________________________________
+[  8232] By: nick                                  on 2000/12/23  16:06:00
+        Log: Encode's Makefile.PL fix not good for dmake $(MAKEFILE) is set to -f Makefile
+             and fails to make '-f'. (Also handle case where xxxx.c files have not been deleted.)
+     Branch: perlio
+          ! ext/Encode/Makefile.PL
+____________________________________________________________________________
+[  8231] By: nick                                  on 2000/12/23  14:30:34
+        Log: "Compiled" encode build cleanup
+             - Makefile uses catfile $(MAKEFILE) etc. for platform issues.
+             - .c files do not export sub-tables
+     Branch: perlio
+          ! ext/Encode/Makefile.PL ext/Encode/compile
+____________________________________________________________________________
+[  8230] By: nick                                  on 2000/12/23  12:50:37
+        Log: Integrate mainline
+     Branch: perlio
+         !> INSTALL lib/CGI.pm lib/CPAN.pm lib/Pod/Select.pm
+         !> lib/Text/ParseWords.pm lib/Win32.pod pod/perl.pod
+         !> pod/perl5004delta.pod pod/perl5005delta.pod
+         !> pod/perl56delta.pod pod/perldelta.pod pod/perldiag.pod
+         !> pod/perlembed.pod pod/perlfaq4.pod pod/perllocale.pod
+         !> pod/perlmodlib.pod pod/perlrequick.pod pod/perlretut.pod
+         !> pod/perlsub.pod
+____________________________________________________________________________
+[  8229] By: jhi                                   on 2000/12/22  15:32:12
+        Log: Integrate perlio.
+     Branch: perl
+         +> win32/distclean.bat
+         !> MANIFEST t/io/utf8.t
+____________________________________________________________________________
+[  8228] By: jhi                                   on 2000/12/22  15:29:40
+        Log: Subject: [PATCH 5.6.1-TRIAL1 and @8223]; was Re: Perlbug 20000322.006 status +update
+             From: Robin Barker <rmb1@cise.npl.co.uk>
+             Date: Fri, 22 Dec 2000 12:17:38 GMT
+             Message-Id: <200012221217.MAA21332@tempest.npl.co.uk>
+     Branch: perl
+          ! lib/CGI.pm lib/CPAN.pm lib/Pod/Select.pm
+          ! lib/Text/ParseWords.pm lib/Win32.pod pod/perl.pod
+          ! pod/perl5004delta.pod pod/perl5005delta.pod
+          ! pod/perl56delta.pod pod/perldelta.pod pod/perldiag.pod
+          ! pod/perlembed.pod pod/perlfaq4.pod pod/perllocale.pod
+          ! pod/perlmodlib.pod pod/perlrequick.pod pod/perlretut.pod
+          ! pod/perlsub.pod
+____________________________________________________________________________
+[  8227] By: jhi                                   on 2000/12/22  15:24:28
+        Log: Subject: Re: A Configure option like 'otherlibdirs' but for *pre*pending?
+             From: "John L. Allen" <allen@grumman.com>
+             Date: Thu, 21 Dec 2000 14:39:58 -0500 (EST)
+             Message-ID: <Pine.SOL.3.91.1001221133227.23511A-100000@gateway.grumman.com>
+             
+             Document APPLLIB_EXP.
+     Branch: perl
+          ! INSTALL
+____________________________________________________________________________
+[  8226] By: nick                                  on 2000/12/21  22:11:50
+        Log: Handy script for when one forgets to "dmake clean" 
+     Branch: perlio
+          + win32/distclean.bat
+          ! MANIFEST
+____________________________________________________________________________
+[  8225] By: nick                                  on 2000/12/21  21:54:04
+        Log: CRLF platform issue with io/utf8 fix.
+     Branch: perlio
+          ! t/io/utf8.t
+____________________________________________________________________________
+[  8224] By: nick                                  on 2000/12/21  21:02:20
+        Log: Integrate mainline
+     Branch: perlio
+         !> Changes Configure Makefile.SH Porting/Glossary
+         !> Porting/config.sh Porting/config_H configure.com
+         !> epoc/config.sh ext/Thread/Thread.xs patchlevel.h
+         !> pod/perlfaq3.pod pod/perlfunc.pod pod/perltoc.pod pp_sys.c
+         !> sv.c t/io/fs.t t/op/misc.t t/op/utf8decode.t t/pragma/utf8.t
+         !> win32/config.bc win32/config.gc win32/config.vc
+____________________________________________________________________________
+[  8223] By: jhi                                   on 2000/12/21  17:09:16
+        Log: Update Changes.
+     Branch: perl
+          ! Changes patchlevel.h
+____________________________________________________________________________
+[  8222] By: jhi                                   on 2000/12/21  16:24:01
+        Log: Metaconfig unit changes for #8221.
+     Branch: metaconfig/U/perl
+         +> issymlink.U
+          - testsyml.U
+          ! Mksymlinks.U
+____________________________________________________________________________
 [  8221] By: jhi                                   on 2000/12/21  16:23:48
         Log: Rename testsyml to issymlink.
      Branch: perl
index 7445fd7..18ae760 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -1072,6 +1072,7 @@ lib/unicode/To/Upper.pl                           Unicode character database
 lib/unicode/UCD301.html                                Unicode character database
 lib/unicode/UCDFF301.html                      Unicode character database
 lib/unicode/Unicode.301                                Unicode character database
+lib/unicode/distinct.pm                Perl pragma to strictly distinguish UTF8 data and non-UTF data
 lib/unicode/mktables.PL                                Unicode character database generator
 lib/unicode/syllables.txt                      Unicode character database
 lib/utf8.pm                                    Pragma to control Unicode support
diff --git a/doop.c b/doop.c
index 3548556..8256b93 100644 (file)
--- a/doop.c
+++ b/doop.c
 #endif
 #endif
 
-#define HALF_UTF8_UPGRADE(start,end) \
-    STMT_START {                               \
-      if ((start)<(end)) {                     \
-       U8* NeWsTr;                             \
-       STRLEN LeN = (end) - (start);           \
-       NeWsTr = bytes_to_utf8(start, &LeN);    \
-       Safefree(start);                        \
-       (start) = NeWsTr;                       \
-       (end) = (start) + LeN;                  \
-      }                                                \
-    } STMT_END
-
 STATIC I32
 S_do_trans_simple(pTHX_ SV *sv)
 {
@@ -41,7 +29,6 @@ S_do_trans_simple(pTHX_ SV *sv)
     U8 *send;
     U8 *dstart;
     I32 matches = 0;
-    I32 sutf = SvUTF8(sv);
     STRLEN len;
     short *tbl;
     I32 ch;
@@ -54,7 +41,7 @@ S_do_trans_simple(pTHX_ SV *sv)
     send = s + len;
 
     /* First, take care of non-UTF8 input strings, because they're easy */
-    if (!sutf) {
+    if (!SvUTF8(sv)) {
        while (s < send) {
            if ((ch = tbl[*s]) >= 0) {
                matches++;
@@ -79,7 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv)
        c = utf8_to_uv(s, send - s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
             matches++;
-            d = uv_to_utf8(d,ch);
+            if (ch < 0x80)
+                *d++ = ch;
+            else
+                d = uv_to_utf8(d,ch);
             s += ulen;
         }
        else { /* No match -> copy */
@@ -88,8 +78,7 @@ S_do_trans_simple(pTHX_ SV *sv)
         }
     }
     *d = '\0';
-    sv_setpvn(sv, (const char*)dstart, d - dstart);
-    Safefree(dstart);
+    sv_setpvn(sv, (char*)dstart, d - dstart);
     SvUTF8_on(sv);
     SvSETMAGIC(sv);
     return matches;
@@ -101,7 +90,6 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
     U8 *s;
     U8 *send;
     I32 matches = 0;
-    I32 hasutf = SvUTF8(sv);
     STRLEN len;
     short *tbl;
 
@@ -112,22 +100,20 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
     s = (U8*)SvPV(sv, len);
     send = s + len;
 
-    while (s < send) {
-        if (hasutf && *s & 0x80)
-            s += UTF8SKIP(s);
-        else {
-            UV c;
-            STRLEN ulen;
-            ulen = 1;
-            if (hasutf)
-                c = utf8_to_uv(s, send - s, &ulen, 0);
-            else
-                c = *s;
-            if (c < 0x100 && tbl[c] >= 0)
+    if (!SvUTF8(sv))
+       while (s < send) {
+            if (tbl[*s++] >= 0)
                 matches++;
-            s += ulen;
-        }
-    }
+       }
+    else
+       while (s < send) {
+           UV c;
+           STRLEN ulen;
+           c = utf8_to_uv(s, send - s, &ulen, 0);
+           if (c < 0x100 && tbl[c] >= 0)
+               matches++;
+           s += ulen;
+       }
 
     return matches;
 }
@@ -139,7 +125,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
     U8 *send;
     U8 *d;
     U8 *dstart;
-    I32 hasutf = SvUTF8(sv);
+    I32 isutf8;
     I32 matches = 0;
     STRLEN len;
     short *tbl;
@@ -150,64 +136,96 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
        Perl_croak(aTHX_ "panic: do_trans_complex");
 
     s = (U8*)SvPV(sv, len);
+    isutf8 = SvUTF8(sv);
     send = s + len;
 
-    Newz(0, d, len*2+1, U8);
-    dstart = d;
-
-    if (PL_op->op_private & OPpTRANS_SQUASH) {
-       U8* p = send;
-
-       while (s < send) {
-            if (hasutf && *s & 0x80)
-                s += UTF8SKIP(s);
-            else {
-               if ((ch = tbl[*s]) >= 0) {
+    if (!isutf8) {
+       dstart = d = s;
+       if (PL_op->op_private & OPpTRANS_SQUASH) {
+           U8* p = send;
+           while (s < send) {
+               if ((ch = tbl[*s]) >= 0) {
                    *d = ch;
                    matches++;
-           if (p != d - 1 || *p != *d)
-                       p = d++;
-               }
-               else if (ch == -1)      /* -1 is unmapped character */
-                   *d++ = *s;          /* -2 is delete character */
-               s++;
-            }
+                   if (p != d - 1 || *p != *d)
+                       p = d++;
+               }
+               else if (ch == -1)      /* -1 is unmapped character */
+                   *d++ = *s;  
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s++;
+           }
        }
+       else {
+           while (s < send) {
+               if ((ch = tbl[*s]) >= 0) {
+                   matches++;
+                   *d++ = ch;
+               }
+               else if (ch == -1)      /* -1 is unmapped character */
+                   *d++ = *s;
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s++;
+           }
+       }
+       SvCUR_set(sv, d - dstart);
     }
-    else {
-       while (s < send) {
-           UV comp;
-            if (hasutf && *s & 0x80)
-                comp = utf8_to_uv_simple(s, NULL);
-           else
-                comp = *s;
-           
-           ch = tbl[comp];
-           
-           if (ch == -1) { /* -1 is unmapped character */
-                ch = comp;
-               matches--;
-           }
-
-           if (ch >= 0)
-               d = uv_to_utf8(d, ch);
-           
-           matches++;
-
-           s += hasutf && *s & 0x80 ? UNISKIP(*s) : 1;
-            
+    else { /* isutf8 */
+       Newz(0, d, len*2+1, U8);
+       dstart = d;
+
+       if (PL_op->op_private & OPpTRANS_SQUASH) {
+           U8* p = send;
+           UV pch = 0xfeedface;
+           while (s < send) {
+               STRLEN len;
+               UV comp = utf8_to_uv_simple(s, &len);
+
+               if (comp > 0xff)
+                   d = uv_to_utf8(d, comp);    /* always unmapped */
+               else if ((ch = tbl[comp]) >= 0) {
+                   matches++;
+                   if (ch != pch) {
+                       d = uv_to_utf8(d, ch);
+                       pch = ch;
+                   }
+                   s += len;
+                   continue;
+               }
+               else if (ch == -1)      /* -1 is unmapped character */
+                   d = uv_to_utf8(d, comp);
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s += len;
+               pch = 0xfeedface;
+           }
        }
+       else {
+           while (s < send) {
+               STRLEN len;
+               UV comp = utf8_to_uv_simple(s, &len);
+               if (comp > 0xff)
+                   d = uv_to_utf8(d, comp);    /* always unmapped */
+               else if ((ch = tbl[comp]) >= 0) {
+                   d = uv_to_utf8(d, ch);
+                   matches++;
+               }
+               else if (ch == -1) {    /* -1 is unmapped character */
+                   d = uv_to_utf8(d, comp);
+               }
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s += len;
+           }
+       }
+       *d = '\0';
+       sv_setpvn(sv, (char*)dstart, d - dstart);
+       SvUTF8_on(sv);
     }
-
-    *d = '\0';
-
-    sv_setpvn(sv, (const char*)dstart, d - dstart);
-    Safefree(dstart);
-    if (hasutf)
-        SvUTF8_on(sv);
     SvSETMAGIC(sv);
     return matches;
-
 }
 
 STATIC I32
@@ -217,7 +235,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     U8 *send;
     U8 *d;
     U8 *start;
-    U8 *dstart;
+    U8 *dstart, *dend;
     I32 matches = 0;
     STRLEN len;
 
@@ -228,11 +246,19 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     UV extra = none + 1;
     UV final;
     UV uv;
-    I32 isutf;
-    I32 howmany;
+    I32 isutf8;
+    U8 hibit = 0;
 
-    isutf = SvUTF8(sv);
     s = (U8*)SvPV(sv, len);
+    isutf8 = SvUTF8(sv);
+    if (!isutf8) {
+       U8 *t = s, *e = s + len;
+       while (t < e)
+           if ((hibit = *t++ & 0x80))
+               break;
+       if (hibit)
+           s = bytes_to_utf8(s, &len);
+    }
     send = s + len;
     start = s;
 
@@ -241,41 +267,46 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
        final = SvUV(*svp);
 
     /* d needs to be bigger than s, in case e.g. upgrading is required */
-    Newz(0, d, len*2+1, U8);
+    New(0, d, len*3+UTF8_MAXLEN, U8);
+    dend = d + len * 3;
     dstart = d;
+
     while (s < send) {
        if ((uv = swash_fetch(rv, s)) < none) {
            s += UTF8SKIP(s);
            matches++;
-            if ((uv & 0x80) && !isutf++)
-                HALF_UTF8_UPGRADE(dstart,d);
            d = uv_to_utf8(d, uv);
        }
        else if (uv == none) {
-           int i;
-           i = UTF8SKIP(s);
-            if (i > 1 && !isutf++)
-                HALF_UTF8_UPGRADE(dstart,d);
+           int i = UTF8SKIP(s);
            while(i--)
                *d++ = *s++;
        }
        else if (uv == extra) {
-           int i;
-           i = UTF8SKIP(s);
+           int i = UTF8SKIP(s);
            s += i;
            matches++;
-            if (i > 1 && !isutf++)
-                HALF_UTF8_UPGRADE(dstart,d);
            d = uv_to_utf8(d, final);
        }
        else
            s += UTF8SKIP(s);
+
+       if (d >= dend) {
+           STRLEN clen = d - dstart;
+           STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+           Renew(dstart, nlen+UTF8_MAXLEN, U8);
+           d = dstart + clen;
+           dend = dstart + nlen;
+       }
     }
     *d = '\0';
-    sv_setpvn(sv, (const char*)dstart, d - dstart);
+    sv_setpvn(sv, (char*)dstart, d - dstart);
     SvSETMAGIC(sv);
-    if (isutf)
-        SvUTF8_on(sv);
+    SvUTF8_on(sv);
+    if (hibit)
+       Safefree(start);
+    if (!isutf8 && !(PL_hints & HINT_UTF8))
+       sv_utf8_downgrade(sv, TRUE);
 
     return matches;
 }
@@ -284,7 +315,7 @@ STATIC I32
 S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
 {
     U8 *s;
-    U8 *send;
+    U8 *start, *send;
     I32 matches = 0;
     STRLEN len;
 
@@ -293,10 +324,17 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
     SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
     UV none = svp ? SvUV(*svp) : 0x7fffffff;
     UV uv;
+    U8 hibit = 0;
 
     s = (U8*)SvPV(sv, len);
-    if (!SvUTF8(sv))
-        s = bytes_to_utf8(s, &len);
+    if (!SvUTF8(sv)) {
+       U8 *t = s, *e = s + len;
+       while (t < e)
+           if ((hibit = *t++ & 0x80))
+               break;
+       if (hibit)
+           start = s = bytes_to_utf8(s, &len);
+    }
     send = s + len;
 
     while (s < send) {
@@ -304,6 +342,8 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
            matches++;
        s += UTF8SKIP(s);
     }
+    if (hibit)
+        Safefree(start);
 
     return matches;
 }
@@ -312,7 +352,7 @@ STATIC I32
 S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 {
     U8 *s;
-    U8 *send;
+    U8 *start, *send;
     U8 *d;
     I32 matches = 0;
     I32 squash   = PL_op->op_private & OPpTRANS_SQUASH;
@@ -325,41 +365,45 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     UV final;
     UV uv;
     STRLEN len;
-    U8 *dst;
-    I32 isutf = SvUTF8(sv);
+    U8 *dstart, *dend;
+    I32 isutf8;
+    U8 hibit = 0;
 
     s = (U8*)SvPV(sv, len);
+    isutf8 = SvUTF8(sv);
+    if (!isutf8) {
+       U8 *t = s, *e = s + len;
+       while (t < e)
+           if ((hibit = *t++ & 0x80))
+               break;
+       if (hibit)
+           s = bytes_to_utf8(s, &len);
+    }
     send = s + len;
+    start = s;
 
     svp = hv_fetch(hv, "FINAL", 5, FALSE);
     if (svp)
        final = SvUV(*svp);
 
-    Newz(0, d, len*2+1, U8);
-       dst = d;
+    New(0, d, len*3+UTF8_MAXLEN, U8);
+    dend = d + len * 3;
+    dstart = d;
 
     if (squash) {
        UV puv = 0xfeedface;
        while (s < send) {
-            if (SvUTF8(sv))
-               uv = swash_fetch(rv, s);
-           else {
-               U8 tmpbuf[2];
-               uv = *s++;
-               if (uv < 0x80)
-                   tmpbuf[0] = uv;
-               else {
-                   tmpbuf[0] = (( uv >>  6)         | 0xc0);
-                   tmpbuf[1] = (( uv        & 0x3f) | 0x80);
-               }
-               uv = swash_fetch(rv, tmpbuf);
+           uv = swash_fetch(rv, s);
+           
+           if (d >= dend) {
+               STRLEN clen = d - dstart, nlen = dend - dstart + len;
+               Renew(dstart, nlen+UTF8_MAXLEN, U8);
+               d = dstart + clen;
+               dend = dstart + nlen;
            }
-
            if (uv < none) {
                matches++;
                if (uv != puv) {
-                    if ((uv & 0x80) && !isutf++)
-                        HALF_UTF8_UPGRADE(dst,d);
                    d = uv_to_utf8(d, uv);
                    puv = uv;
                }
@@ -367,9 +411,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               STRLEN ulen;
-               *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
-               s += ulen;
+               int i = UTF8SKIP(s);
+               while(i--)
+                   *d++ = *s++;
                puv = 0xfeedface;
                continue;
            }
@@ -388,18 +432,12 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     }
     else {
        while (s < send) {
-            if (SvUTF8(sv))
-               uv = swash_fetch(rv, s);
-           else {
-               U8 tmpbuf[2];
-               uv = *s++;
-               if (uv < 0x80)
-                   tmpbuf[0] = uv;
-               else {
-                   tmpbuf[0] = (( uv >>  6)         | 0xc0);
-                   tmpbuf[1] = (( uv        & 0x3f) | 0x80);
-               }
-               uv = swash_fetch(rv, tmpbuf);
+           uv = swash_fetch(rv, s);
+           if (d >= dend) {
+               STRLEN clen = d - dstart, nlen = dend - dstart + len;
+               Renew(dstart, nlen+UTF8_MAXLEN, U8);
+               d = dstart + clen;
+               dend = dstart + nlen;
            }
            if (uv < none) {
                matches++;
@@ -408,9 +446,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               STRLEN ulen;
-               *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
-               s += ulen;
+               int i = UTF8SKIP(s);
+               while(i--)
+                   *d++ = *s++;
                continue;
            }
            else if (uv == extra && !del) {
@@ -423,12 +461,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            s += UTF8SKIP(s);
        }
     }
-    if (dst)
-       sv_usepvn(sv, (char*)dst, d - dst);
-    else {
-       *d = '\0';
-       SvCUR_set(sv, d - (U8*)SvPVX(sv));
-    }
+    *d = '\0';
+    sv_setpvn(sv, (char*)dstart, d - dstart);
+    SvUTF8_on(sv);
+    if (hibit)
+       Safefree(start);
+    if (!isutf8 && !(PL_hints & HINT_UTF8))
+       sv_utf8_downgrade(sv, TRUE);
     SvSETMAGIC(sv);
 
     return matches;
diff --git a/embed.h b/embed.h
index 3b54154..414a642 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define regcp_set_to           S_regcp_set_to
 #define cache_re               S_cache_re
 #define reghop                 S_reghop
+#define reghop3                        S_reghop3
 #define reghopmaybe            S_reghopmaybe
+#define reghopmaybe3           S_reghopmaybe3
 #define find_byclass           S_find_byclass
 #endif
 #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT)
 #define regcp_set_to(a)                S_regcp_set_to(aTHX_ a)
 #define cache_re(a)            S_cache_re(aTHX_ a)
 #define reghop(a,b)            S_reghop(aTHX_ a,b)
+#define reghop3(a,b,c)         S_reghop3(aTHX_ a,b,c)
 #define reghopmaybe(a,b)       S_reghopmaybe(aTHX_ a,b)
+#define reghopmaybe3(a,b,c)    S_reghopmaybe3(aTHX_ a,b,c)
 #define find_byclass(a,b,c,d,e,f)      S_find_byclass(aTHX_ a,b,c,d,e,f)
 #endif
 #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT)
 #define cache_re               S_cache_re
 #define S_reghop               CPerlObj::S_reghop
 #define reghop                 S_reghop
+#define S_reghop3              CPerlObj::S_reghop3
+#define reghop3                        S_reghop3
 #define S_reghopmaybe          CPerlObj::S_reghopmaybe
 #define reghopmaybe            S_reghopmaybe
+#define S_reghopmaybe3         CPerlObj::S_reghopmaybe3
+#define reghopmaybe3           S_reghopmaybe3
 #define S_find_byclass         CPerlObj::S_find_byclass
 #define find_byclass           S_find_byclass
 #endif
index 32f3ddc..7b83635 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -2407,7 +2407,9 @@ s |char*|regcppop
 s      |char*|regcp_set_to     |I32 ss
 s      |void   |cache_re       |regexp *prog
 s      |U8*    |reghop         |U8 *pos|I32 off
+s      |U8*    |reghop3        |U8 *pos|I32 off|U8 *lim
 s      |U8*    |reghopmaybe    |U8 *pos|I32 off
+s      |U8*    |reghopmaybe3   |U8 *pos|I32 off|U8 *lim
 s      |char*  |find_byclass   |regexp * prog|regnode *c|char *s|char *strend|char *startpos|I32 norun
 #endif
 
diff --git a/lib/unicode/distinct.pm b/lib/unicode/distinct.pm
new file mode 100644 (file)
index 0000000..6471ac8
--- /dev/null
@@ -0,0 +1,35 @@
+package unicode:distinct;
+
+our $VERSION = '0.01';
+
+$unicode::distinct::hint_bits = 0x01000000;
+
+sub import {
+    $^H |= $unicode::distinct::hint_bits;
+}
+
+sub unimport {
+    $^H &= ~$unicode::distinct::hint_bits;
+}
+
+1;
+__END__
+
+=head1 NAME
+
+unicode::distinct - Perl pragma to strictly distinguish UTF8 data and non-UTF data.
+
+=head1 SYNOPSIS
+
+    use unicode::distinct;
+    no unicode::distinct;
+
+=head1 DESCRIPTION
+
+ *NOT YET*
+
+=head1 SEE ALSO
+
+L<perlunicode>, L<utf8>
+
+=cut
diff --git a/mg.c b/mg.c
index 821c325..340c1e8 100644 (file)
--- a/mg.c
+++ b/mg.c
@@ -366,6 +366,11 @@ Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg)
                    i = t;
                else                    /* @- */
                    i = s;
+               
+               if (i > 0 && DO_UTF8(PL_reg_sv)) {
+                   char *b = rx->subbeg;
+                   i = Perl_utf8_length(aTHX_ (U8*)b, (U8*)(b+i));
+               }
                sv_setiv(sv,i);
            }
     }
@@ -1410,7 +1415,14 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
 {
     STRLEN len;
     char *tmps = SvPV(sv,len);
-    sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+    if (DO_UTF8(sv)) {
+       sv_utf8_upgrade(LvTARG(sv));
+       sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+       SvUTF8_on(LvTARG(sv));
+    }
+    else
+        sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+
     return 0;
 }
 
diff --git a/op.c b/op.c
index 28e7e98..8879854 100644 (file)
--- a/op.c
+++ b/op.c
@@ -6772,6 +6772,8 @@ Perl_peep(pTHX_ register OP *o)
            if (!fields || !GvHV(*fields))
                break;
            key = SvPV(*svp, keylen);
+           if (SvUTF8(*svp))
+               keylen = -keylen;
            indsvp = hv_fetch(GvHV(*fields), key, keylen, FALSE);
            if (!indsvp) {
                Perl_croak(aTHX_ "No such pseudo-hash field \"%s\" in variable %s of type %s",
@@ -6837,6 +6839,8 @@ Perl_peep(pTHX_ register OP *o)
                 key_op = (SVOP*)key_op->op_sibling) {
                svp = cSVOPx_svp(key_op);
                key = SvPV(*svp, keylen);
+               if (SvUTF8(*svp))
+                   keylen = -keylen;
                indsvp = hv_fetch(GvHV(*fields), key, keylen, FALSE);
                if (!indsvp) {
                    Perl_croak(aTHX_ "No such pseudo-hash field \"%s\" "
diff --git a/op.h b/op.h
index 55b85a5..e0fc3bc 100644 (file)
--- a/op.h
+++ b/op.h
@@ -250,6 +250,9 @@ struct pmop {
 #define PMdf_USED      0x01            /* pm has been used once already */
 #define PMdf_TAINTED   0x02            /* pm compiled from tainted pattern */
 #define PMdf_UTF8      0x04            /* pm compiled from utf8 data */
+#define PMdf_DYN_UTF8  0x08
+
+#define PMdf_CMP_UTF8  (PMdf_UTF8|PMdf_DYN_UTF8)
 
 #define PMf_RETAINT    0x0001          /* taint $1 etc. if target tainted */
 #define PMf_ONCE       0x0002          /* use pattern only once per reset */
index ee006c3..4037587 100644 (file)
@@ -70,7 +70,7 @@
 #if !defined(PERL_PATCHLEVEL_H_IMPLICIT) && !defined(LOCAL_PATCH_COUNT)
 static char    *local_patches[] = {
         NULL
-       ,"DEVEL8221"
+       ,"DEVEL8268"
        ,NULL
 };
 
diff --git a/perl.h b/perl.h
index cccf728..77ef4c9 100644 (file)
--- a/perl.h
+++ b/perl.h
@@ -2698,6 +2698,7 @@ enum {            /* pass one of these to get_vtbl */
 
 #define HINT_FILETEST_ACCESS   0x00400000
 #define HINT_UTF8              0x00800000
+#define HINT_UTF8_DISTINCT     0x01000000
 
 /* Various states of an input record separator SV (rs, nrs) */
 #define RsSNARF(sv)   (! SvOK(sv))
index f7ad2d3..ba6a836 100644 (file)
@@ -2582,8 +2582,9 @@ Found in file sv.c
 
 =item sv_catsv
 
-Concatenates the string from SV C<ssv> onto the end of the string in SV
-C<dsv>.  Handles 'get' magic, but not 'set' magic.  See C<sv_catsv_mg>.
+Concatenates the string from SV C<ssv> onto the end of the string in
+SV C<dsv>.  Modifies C<dsv> but not C<ssv>.  Handles 'get' magic, but
+not 'set' magic.  See C<sv_catsv_mg>.
 
        void    sv_catsv(SV* dsv, SV* ssv)
 
@@ -3283,8 +3284,7 @@ Found in file utf8.c
 
 Returns the character value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
-C<retlen> will be set to the length, in bytes, of that character,
-and the pointer C<s> will be advanced to the end of the character.
+C<retlen> will be set to the length, in bytes, of that character.
 
 If C<s> does not point to a well-formed UTF8 character, the behaviour
 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
@@ -3306,8 +3306,7 @@ Found in file utf8.c
 
 Returns the character value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+length, in bytes, of that character.
 
 If C<s> does not point to a well-formed UTF8 character, zero is
 returned and retlen is set, if possible, to -1.
diff --git a/pp.c b/pp.c
index 1150697..ff4508c 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -5739,9 +5739,9 @@ PP(pp_split)
     AV *ary;
     register IV limit = POPi;                  /* note, negative is forever */
     SV *sv = POPs;
-    bool do_utf8 = DO_UTF8(sv);
     STRLEN len;
     register char *s = SvPV(sv, len);
+    bool do_utf8 = DO_UTF8(sv);
     char *strend = s + len;
     register PMOP *pm;
     register REGEXP *rx;
@@ -5878,7 +5878,7 @@ PP(pp_split)
        SV *csv = CALLREG_INTUIT_STRING(aTHX_ rx);
 
        len = rx->minlen;
-       if (len == 1 && !tail) {
+       if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) {
            STRLEN n_a;
            char c = *SvPV(csv, n_a);
            while (--limit) {
@@ -5895,7 +5895,10 @@ PP(pp_split)
                XPUSHs(dstr);
                /* The rx->minlen is in characters but we want to step
                 * s ahead by bytes. */
-               s = m + (do_utf8 ? SvCUR(csv) : len);
+               if (do_utf8)
+                   s = (char*)utf8_hop((U8*)m, len);
+               else
+                   s = m + len; /* Fake \n at the end */
            }
        }
        else {
@@ -5914,7 +5917,10 @@ PP(pp_split)
                XPUSHs(dstr);
                /* The rx->minlen is in characters but we want to step
                 * s ahead by bytes. */
-               s = m + (do_utf8 ? SvCUR(csv) : len); /* Fake \n at the end */
+               if (do_utf8)
+                   s = (char*)utf8_hop((U8*)m, len);
+               else
+                   s = m + len; /* Fake \n at the end */
            }
        }
     }
index aff5815..dd4bae9 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -115,7 +115,9 @@ PP(pp_regcomp)
 
            pm->op_pmflags = pm->op_pmpermflags;        /* reset case sensitivity */
            if (DO_UTF8(tmpstr))
-               pm->op_pmdynflags |= PMdf_UTF8;
+               pm->op_pmdynflags |= PMdf_DYN_UTF8;
+           else
+               pm->op_pmdynflags &= ~PMdf_DYN_UTF8;
            pm->op_pmregexp = CALLREGCOMP(aTHX_ t, t + len, pm);
            PL_reginterp_cnt = 0;               /* XXXX Be extra paranoid - needed
                                           inside tie/overload accessors.  */
@@ -185,12 +187,12 @@ PP(pp_substcont)
            SvPVX(targ) = SvPVX(dstr);
            SvCUR_set(targ, SvCUR(dstr));
            SvLEN_set(targ, SvLEN(dstr));
+           if (DO_UTF8(dstr))
+               SvUTF8_on(targ);
            SvPVX(dstr) = 0;
            sv_free(dstr);
 
            TAINT_IF(cx->sb_rxtainted & 1);
-           if (pm->op_pmdynflags & PMdf_UTF8)
-               SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */
            PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1)));
 
            (void)SvPOK_only_UTF8(targ);
@@ -1439,8 +1441,13 @@ Perl_die_where(pTHX_ char *message, STRLEN msglen)
                    }
                }
            }
-           else
+           else {
                sv_setpvn(ERRSV, message, msglen);
+               if (PL_hints & HINT_UTF8)
+                   SvUTF8_on(ERRSV);
+               else
+                   SvUTF8_off(ERRSV);
+           }
        }
        else
            message = SvPVx(ERRSV, msglen);
@@ -2754,7 +2761,7 @@ Perl_sv_compile_2op(pTHX_ SV *sv, OP** startop, char *code, AV** avp)
 #else
     SAVEVPTR(PL_op);
 #endif
-    PL_hints = 0;
+    PL_hints &= HINT_UTF8;
 
     PL_op = &dummy;
     PL_op->op_type = OP_ENTEREVAL;
index c7555c4..f9c5960 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -140,30 +140,31 @@ PP(pp_concat)
   djSP; dATARGET; tryAMAGICbin(concat,opASSIGN);
   {
     dPOPTOPssrl;
-    STRLEN len;
-    U8 *s;
+    STRLEN len, llen, rlen;
+    U8 *s, *l, *r;
     bool left_utf8;
     bool right_utf8;
 
-    if (TARG == right && SvGMAGICAL(right))
-        mg_get(right);
-    if (SvGMAGICAL(left))
+    r = (U8*)SvPV(right,rlen);
+
+    if (TARG != left)
+        l = (U8*)SvPV(left,llen);
+    else if (SvGMAGICAL(left))
         mg_get(left);
 
     left_utf8  = DO_UTF8(left);
     right_utf8 = DO_UTF8(right);
 
-    if (left_utf8 != right_utf8) {
+    if (left_utf8 != right_utf8 && !IN_BYTE) {
         if (TARG == right && !right_utf8) {
             sv_utf8_upgrade(TARG); /* Now straight binary copy */
             SvUTF8_on(TARG);
         }
         else {
             /* Set TARG to PV(left), then add right */
-            U8 *l, *c, *olds = NULL;
+            U8 *c, *olds = NULL;
             STRLEN targlen;
-           s = (U8*)SvPV(right,len);
-           right_utf8 |= DO_UTF8(right);
+           s = r; len = rlen;
             if (TARG == right) {
                /* Take a copy since we're about to overwrite TARG */
                olds = s = (U8*)savepvn((char*)s, len);
@@ -174,12 +175,12 @@ PP(pp_concat)
                else
                    sv_setpv(left, ""); /* Suppress warning. */
            }
-            l = (U8*)SvPV(left, targlen);
-           left_utf8 |= DO_UTF8(left);
             if (TARG != left)
-                sv_setpvn(TARG, (char*)l, targlen);
-            if (!left_utf8)
-                sv_utf8_upgrade(TARG);
+                sv_setpvn(TARG, (char*)l, llen);
+            if (!left_utf8) {
+               SvUTF8_off(TARG);
+               sv_utf8_upgrade(TARG);
+           }
             /* Extend TARG to length of right (s) */
             targlen = SvCUR(TARG) + len;
             if (!right_utf8) {
@@ -207,17 +208,16 @@ PP(pp_concat)
     }
 
     if (TARG != left) {
-       s = (U8*)SvPV(left,len);
        if (TARG == right) {
-           sv_insert(TARG, 0, 0, (char*)s, len);
+           sv_insert(TARG, 0, 0, (char*)l, llen);
            SETs(TARG);
            RETURN;
        }
-       sv_setpvn(TARG, (char *)s, len);
+       sv_setpvn(TARG, (char *)l, llen);
     }
     else if (!SvOK(TARG) && SvTYPE(TARG) <= SVt_PVMG)
        sv_setpv(TARG, "");     /* Suppress warning. */
-    s = (U8*)SvPV(right,len);
+    s = r; len = rlen;
     if (SvOK(TARG)) {
 #if defined(PERL_Y2KWARN)
        if ((SvIOK(right) || SvNOK(right)) && ckWARN(WARN_Y2K)) {
@@ -235,7 +235,7 @@ PP(pp_concat)
     }
     else
        sv_setpvn(TARG, (char *)s, len);        /* suppress warning */
-    if (left_utf8)
+    if (left_utf8 && !IN_BYTE)
        SvUTF8_on(TARG);
     SETTARG;
     RETURN;
@@ -1240,7 +1240,8 @@ play_it_again:
        if (update_minmatch++)
            minmatch = had_zerolen;
     }
-    if (rx->reganch & RE_USE_INTUIT) {
+    if (rx->reganch & RE_USE_INTUIT &&
+       DO_UTF8(TARG) == ((rx->reganch & ROPT_UTF8) != 0)) {
        s = CALLREG_INTUIT_START(aTHX_ rx, TARG, s, strend, r_flags, NULL);
 
        if (!s)
@@ -1337,7 +1338,13 @@ yup:                                     /* Confirmed by INTUIT */
     if (global) {
        rx->subbeg = truebase;
        rx->startp[0] = s - truebase;
-       rx->endp[0] = s - truebase + rx->minlen;
+       if (DO_UTF8(PL_reg_sv)) {
+           char *t = (char*)utf8_hop((U8*)s, rx->minlen);
+           rx->endp[0] = t - truebase;
+       }
+       else {
+           rx->endp[0] = s - truebase + rx->minlen;
+       }
        rx->sublen = strend - truebase;
        goto gotcha;
     }
@@ -2000,6 +2007,8 @@ PP(pp_subst)
     if (CALLREGEXEC(aTHX_ rx, s, strend, orig, 0, TARG, NULL,
                    r_flags | REXEC_CHECKED))
     {
+       bool isutf8;
+
        if (force_on_match) {
            force_on_match = 0;
            s = SvPV_force(TARG, len);
@@ -2045,6 +2054,7 @@ PP(pp_subst)
        SvPVX(TARG) = SvPVX(dstr);
        SvCUR_set(TARG, SvCUR(dstr));
        SvLEN_set(TARG, SvLEN(dstr));
+       isutf8 = DO_UTF8(dstr);
        SvPVX(dstr) = 0;
        sv_free(dstr);
 
@@ -2053,6 +2063,8 @@ PP(pp_subst)
        PUSHs(sv_2mortal(newSViv((I32)iters)));
 
        (void)SvPOK_only(TARG);
+       if (isutf8)
+           SvUTF8_on(TARG);
        TAINT_IF(rxtainted);
        SvSETMAGIC(TARG);
        SvTAINT(TARG);
diff --git a/proto.h b/proto.h
index 1bcb5cd..4c5499e 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1147,7 +1147,9 @@ STATIC char*      S_regcppop(pTHX);
 STATIC char*   S_regcp_set_to(pTHX_ I32 ss);
 STATIC void    S_cache_re(pTHX_ regexp *prog);
 STATIC U8*     S_reghop(pTHX_ U8 *pos, I32 off);
+STATIC U8*     S_reghop3(pTHX_ U8 *pos, I32 off, U8 *lim);
 STATIC U8*     S_reghopmaybe(pTHX_ U8 *pos, I32 off);
+STATIC U8*     S_reghopmaybe3(pTHX_ U8 *pos, I32 off, U8 *lim);
 STATIC char*   S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun);
 #endif
 
index 8748271..bbd91c6 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -127,6 +127,7 @@ typedef struct RExC_state_t {
     I32                extralen;
     I32                seen_zerolen;
     I32                seen_evals;
+    I32                utf8;
 #if ADD_TO_REGEXEC
     char       *starttry;              /* -Dr: where regtry was called. */
 #define RExC_starttry  (pRExC_state->starttry)
@@ -148,6 +149,7 @@ typedef struct RExC_state_t {
 #define RExC_extralen  (pRExC_state->extralen)
 #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
 #define RExC_seen_evals        (pRExC_state->seen_evals)
+#define RExC_utf8      (pRExC_state->utf8)
 
 #define        ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 #define        ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
@@ -229,8 +231,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define SCF_DO_STCLASS         (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 #define SCF_WHILEM_VISITED_POS 0x2000
 
-#define RF_utf8                8
-#define UTF (PL_reg_flags & RF_utf8)
+#define UTF RExC_utf8
 #define LOC (RExC_flags16 & PMf_LOCALE)
 #define FOLD (RExC_flags16 & PMf_FOLD)
 
@@ -469,7 +470,7 @@ S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *c
     ANYOF_CLASS_ZERO(cl);
     for (value = 0; value < 256; ++value)
        ANYOF_BITMAP_SET(cl, value);
-    cl->flags = ANYOF_EOS;
+    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
     if (LOC)
        cl->flags |= ANYOF_LOCALE;
 }
@@ -483,6 +484,8 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
     for (value = 0; value <= ANYOF_MAX; value += 2)
        if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
            return 1;
+    if (!(cl->flags & ANYOF_UNICODE_ALL))
+       return 0;
     for (value = 0; value < 256; ++value)
        if (!ANYOF_BITMAP_TEST(cl, value))
            return 0;
@@ -530,6 +533,16 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl,
     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
     if (!(and_with->flags & ANYOF_EOS))
        cl->flags &= ~ANYOF_EOS;
+
+    if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
+       cl->flags &= ~ANYOF_UNICODE_ALL;
+       cl->flags |= ANYOF_UNICODE;
+       ARG_SET(cl, ARG(and_with));
+    }
+    if (!(and_with->flags & ANYOF_UNICODE_ALL))
+       cl->flags &= ~ANYOF_UNICODE_ALL;
+    if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
+       cl->flags &= ~ANYOF_UNICODE;
 }
 
 /* 'OR' a given class with another one.  Can create false positives */
@@ -580,6 +593,16 @@ S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, str
     }
     if (or_with->flags & ANYOF_EOS)
        cl->flags |= ANYOF_EOS;
+
+    if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
+       ARG(cl) != ARG(or_with)) {
+       cl->flags |= ANYOF_UNICODE_ALL;
+       cl->flags &= ~ANYOF_UNICODE;
+    }
+    if (or_with->flags & ANYOF_UNICODE_ALL) {
+       cl->flags |= ANYOF_UNICODE_ALL;
+       cl->flags &= ~ANYOF_UNICODE;
+    }
 }
 
 /* REx optimizer.  Converts nodes into quickier variants "in place".
@@ -787,15 +810,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
        }
        else if (OP(scan) == EXACT) {
            I32 l = STR_LEN(scan);
+           UV uc = *((U8*)STRING(scan));
            if (UTF) {
-               unsigned char *s = (unsigned char *)STRING(scan);
-               unsigned char *e = s + l;
-               I32 newl = 0;
-               while (s < e) {
-                   newl++;
-                   s += UTF8SKIP(s);
-               }
-               l = newl;
+               U8 *s = (U8*)STRING(scan);
+               l = utf8_length(s, s + l);
+               uc = utf8_to_uv_simple(s, NULL);
            }
            min += l;
            if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
@@ -815,21 +834,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                /* Check whether it is compatible with what we know already! */
                int compat = 1;
 
-               if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
-                   && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
+               if (uc >= 0x100 ||
+                   !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
+                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
                    && (!(data->start_class->flags & ANYOF_FOLD)
-                       || !ANYOF_BITMAP_TEST(data->start_class,
-                                             PL_fold[*(U8*)STRING(scan)])))
+                       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
                    compat = 0;
                ANYOF_CLASS_ZERO(data->start_class);
                ANYOF_BITMAP_ZERO(data->start_class);
                if (compat)
-                   ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+                   ANYOF_BITMAP_SET(data->start_class, uc);
                data->start_class->flags &= ~ANYOF_EOS;
            }
            else if (flags & SCF_DO_STCLASS_OR) {
                /* false positive possible if the class is case-folded */
-               ANYOF_BITMAP_SET(data->start_class, *STRING(scan));     
+               if (uc < 0x100)
+                   ANYOF_BITMAP_SET(data->start_class, uc);    
                data->start_class->flags &= ~ANYOF_EOS;
                cl_and(data->start_class, &and_with);
            }
@@ -837,19 +857,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
        }
        else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
            I32 l = STR_LEN(scan);
+           UV uc = *((U8*)STRING(scan));
 
            /* Search for fixed substrings supports EXACT only. */
            if (flags & SCF_DO_SUBSTR) 
                scan_commit(pRExC_state, data);
            if (UTF) {
-               unsigned char *s = (unsigned char *)STRING(scan);
-               unsigned char *e = s + l;
-               I32 newl = 0;
-               while (s < e) {
-                   newl++;
-                   s += UTF8SKIP(s);
-               }
-               l = newl;
+               U8 *s = (U8 *)STRING(scan);
+               l = utf8_length(s, s + l);
+               uc = utf8_to_uv_simple(s, NULL);
            }
            min += l;
            if (data && (flags & SCF_DO_SUBSTR))
@@ -858,15 +874,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                /* Check whether it is compatible with what we know already! */
                int compat = 1;
 
-               if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
-                   && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
-                   && !ANYOF_BITMAP_TEST(data->start_class, 
-                                         PL_fold[*(U8*)STRING(scan)]))
+               if (uc >= 0x100 ||
+                   !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
+                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
+                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
                    compat = 0;
                ANYOF_CLASS_ZERO(data->start_class);
                ANYOF_BITMAP_ZERO(data->start_class);
                if (compat) {
-                   ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+                   ANYOF_BITMAP_SET(data->start_class, uc);
                    data->start_class->flags &= ~ANYOF_EOS;
                    data->start_class->flags |= ANYOF_FOLD;
                    if (OP(scan) == EXACTFL)
@@ -877,7 +893,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                if (data->start_class->flags & ANYOF_FOLD) {
                    /* false positive possible if the class is case-folded.
                       Assume that the locale settings are the same... */
-                   ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); 
+                   if (uc < 0x100)
+                       ANYOF_BITMAP_SET(data->start_class, uc);
                    data->start_class->flags &= ~ANYOF_EOS;
                }
                cl_and(data->start_class, &and_with);
@@ -1580,11 +1597,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        FAIL("NULL regexp argument");
 
     /* XXXX This looks very suspicious... */
-    if (pm->op_pmdynflags & PMdf_UTF8) {
-       PL_reg_flags |= RF_utf8;
-    }
+    if (pm->op_pmdynflags & PMdf_CMP_UTF8)
+        RExC_utf8 = 1;
     else
-       PL_reg_flags = 0;
+        RExC_utf8 = 0;
 
     RExC_precomp = savepvn(exp, xend - exp);
     DEBUG_r(if (!PL_colorset) reginitcolors());
@@ -1705,9 +1721,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        /* Starting-point info. */
       again:
        if (PL_regkind[(U8)OP(first)] == EXACT) {
-           if (OP(first) == EXACT);    /* Empty, get anchored substr later. */
-           else if ((OP(first) == EXACTF || OP(first) == EXACTFL)
-                    && !UTF)
+           if (OP(first) == EXACT)
+               ;       /* Empty, get anchored substr later. */
+           else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
                r->regstclass = first;
        }
        else if (strchr((char*)PL_simple,OP(first)))
@@ -3164,6 +3180,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
     SV *listsv;
     register char *e;
     UV n;
+    bool dont_optimize_invert = FALSE;
 
     ret = reganode(pRExC_state, ANYOF, 0);
 
@@ -3350,6 +3367,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isALNUM(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
                    break;
                case ANYOF_NALNUM:
@@ -3360,6 +3378,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isALNUM(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
                    break;
                case ANYOF_ALNUMC:
@@ -3370,6 +3389,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isALNUMC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
                    break;
                case ANYOF_NALNUMC:
@@ -3380,6 +3400,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isALNUMC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
                    break;
                case ANYOF_ALPHA:
@@ -3390,6 +3411,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isALPHA(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
                    break;
                case ANYOF_NALPHA:
@@ -3400,6 +3422,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isALPHA(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
                    break;
                case ANYOF_ASCII:
@@ -3415,6 +3438,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                ANYOF_BITMAP_SET(ret, value);
 #endif /* EBCDIC */
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
                    break;
                case ANYOF_NASCII:
@@ -3430,6 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                ANYOF_BITMAP_SET(ret, value);
 #endif /* EBCDIC */
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
                    break;
                case ANYOF_BLANK:
@@ -3440,6 +3465,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isBLANK(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
                    break;
                case ANYOF_NBLANK:
@@ -3450,6 +3476,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isBLANK(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
                    break;
                case ANYOF_CNTRL:
@@ -3460,6 +3487,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isCNTRL(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
                    break;
                case ANYOF_NCNTRL:
@@ -3470,6 +3498,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isCNTRL(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
                    break;
                case ANYOF_DIGIT:
@@ -3480,6 +3509,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                        for (value = '0'; value <= '9'; value++)
                            ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
                    break;
                case ANYOF_NDIGIT:
@@ -3492,6 +3522,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                        for (value = '9' + 1; value < 256; value++)
                            ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
                    break;
                case ANYOF_GRAPH:
@@ -3502,6 +3533,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isGRAPH(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
                    break;
                case ANYOF_NGRAPH:
@@ -3512,6 +3544,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isGRAPH(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
                    break;
                case ANYOF_LOWER:
@@ -3522,6 +3555,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isLOWER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
                    break;
                case ANYOF_NLOWER:
@@ -3532,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isLOWER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
                    break;
                case ANYOF_PRINT:
@@ -3542,6 +3577,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isPRINT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
                    break;
                case ANYOF_NPRINT:
@@ -3552,6 +3588,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isPRINT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
                    break;
                case ANYOF_PSXSPC:
@@ -3562,6 +3599,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isPSXSPC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
                    break;
                case ANYOF_NPSXSPC:
@@ -3572,6 +3610,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isPSXSPC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
                    break;
                case ANYOF_PUNCT:
@@ -3582,6 +3621,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isPUNCT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
                    break;
                case ANYOF_NPUNCT:
@@ -3592,6 +3632,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isPUNCT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
                    break;
                case ANYOF_SPACE:
@@ -3602,6 +3643,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isSPACE(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
                    break;
                case ANYOF_NSPACE:
@@ -3612,6 +3654,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isSPACE(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
                    break;
                case ANYOF_UPPER:
@@ -3622,6 +3665,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isUPPER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
                    break;
                case ANYOF_NUPPER:
@@ -3632,6 +3676,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isUPPER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
                    break;
                case ANYOF_XDIGIT:
@@ -3642,6 +3687,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isXDIGIT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
                    break;
                case ANYOF_NXDIGIT:
@@ -3652,6 +3698,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isXDIGIT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   dont_optimize_invert = TRUE;
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
                    break;
                default:
@@ -3755,12 +3802,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
     }
 
     /* optimize inverted simple patterns (e.g. [^a-z]) */
-    if (!SIZE_ONLY &&
+    if (!SIZE_ONLY && !dont_optimize_invert &&
        /* If the only flag is inversion. */
        (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
        for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
            ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
-       ANYOF_FLAGS(ret) = 0;
+       ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
     }
 
     if (!SIZE_ONLY) { 
@@ -4218,6 +4265,8 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 
        if (flags & ANYOF_UNICODE)
            sv_catpv(sv, "{unicode}");
+       else if (flags & ANYOF_UNICODE_ALL)
+           sv_catpv(sv, "{all-unicode}");
 
        {
            SV *lv;
index c8094e1..066e31f 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -192,6 +192,7 @@ struct regnode_charclass_class {
 
 /* There is a character or a range past 0xff */
 #define ANYOF_UNICODE          0x20
+#define ANYOF_UNICODE_ALL      0x40    /* Can match any char past 0xff */
 
 /* Are there any runtime flags on in this node? */
 #define ANYOF_RUNTIME(s)       (ANYOF_FLAGS(s) & 0x0f)
index bdbdb59..be683a3 100644 (file)
--- a/regexec.c
+++ b/regexec.c
  */
 
 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
-#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
+#define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
 
 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
-#define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
-#define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
+#define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
+#define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
 #define HOPc(pos,off) ((char*)HOP(pos,off))
 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
 
+#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
+#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
+#define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
+#define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
+#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
+#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
+
 static void restore_pos(pTHXo_ void *arg);
 
 
@@ -354,11 +361,15 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                      (strend - strpos > 60 ? "..." : ""))
        );
 
-    if (prog->minlen > strend - strpos) {
+    if (prog->reganch & ROPT_UTF8)
+       PL_reg_flags |= RF_utf8;
+
+    if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
        DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
        goto fail;
     }
     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
+    PL_regeol = strend;
     check = prog->check_substr;
     if (prog->reganch & ROPT_ANCH) {   /* Match at beg-of-str or after \n */
        ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
@@ -377,8 +388,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
            /* Substring at constant offset from beg-of-str... */
            I32 slen;
 
-           PL_regeol = strend;                 /* Used in HOP() */
-           s = HOPc(strpos, prog->check_offset_min);
+           s = HOP3c(strpos, prog->check_offset_min, strend);
            if (SvTAIL(check)) {
                slen = SvCUR(check);    /* >= 1 */
 
@@ -412,7 +422,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        if (!ml_anch) {
            I32 end = prog->check_offset_max + CHR_SVLEN(check)
                                         - (SvTAIL(check) != 0);
-           I32 eshift = strend - s - end;
+           I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 
            if (end_shift < eshift)
                end_shift = eshift;
@@ -451,8 +461,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
            *data->scream_olds = s;
     }
     else
-       s = fbm_instr((unsigned char*)s + start_shift,
-                     (unsigned char*)strend - end_shift,
+       s = fbm_instr(HOP3(s, start_shift, strend),
+                     HOP3(strend, -end_shift, strbeg),
                      check, PL_multiline ? FBMrf_MULTILINE : 0);
 
     /* Update the count-of-usability, remove useless subpatterns,
@@ -491,34 +501,33 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        if (check == prog->float_substr) {
          do_other_anchored:
            {
-               char *last = s - start_shift, *last1, *last2;
+               char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
                char *s1 = s;
 
-               tmp = PL_bostr;
                t = s - prog->check_offset_max;
                if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
                    && (!(prog->reganch & ROPT_UTF8)
-                       || (PL_bostr = strpos, /* Used in regcopmaybe() */
-                           (t = reghopmaybe_c(s, -(prog->check_offset_max)))
+                       || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
                            && t > strpos)))
                    /* EMPTY */;
                else
                    t = strpos;
-               t += prog->anchored_offset;
+               t = HOP3c(t, prog->anchored_offset, strend);
                if (t < other_last)     /* These positions already checked */
                    t = other_last;
-               PL_bostr = tmp;
-               last2 = last1 = strend - prog->minlen;
+               last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
                if (last < last1)
                    last1 = last;
  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
                /* On end-of-str: see comment below. */
                s = fbm_instr((unsigned char*)t,
-                             (unsigned char*)last1 + prog->anchored_offset
-                                + SvCUR(prog->anchored_substr)
-                                - (SvTAIL(prog->anchored_substr)!=0),
-                             prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
-               DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
+                             HOP3(HOP3(last1, prog->anchored_offset, strend)
+                                  + SvCUR(prog->anchored_substr),
+                                  -(SvTAIL(prog->anchored_substr)!=0), strbeg),
+                             prog->anchored_substr,
+                             PL_multiline ? FBMrf_MULTILINE : 0);
+               DEBUG_r(PerlIO_printf(Perl_debug_log,
+                       "%s anchored substr `%s%.*s%s'%s",
                        (s ? "Found" : "Contradicts"),
                        PL_colors[0],
                          (int)(SvCUR(prog->anchored_substr)
@@ -533,17 +542,16 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                    }
                    DEBUG_r(PerlIO_printf(Perl_debug_log,
                        ", trying floating at offset %ld...\n",
-                       (long)(s1 + 1 - i_strpos)));
-                   PL_regeol = strend;                 /* Used in HOP() */
-                   other_last = last1 + prog->anchored_offset + 1;
-                   s = HOPc(last, 1);
+                       (long)(HOP3c(s1, 1, strend) - i_strpos)));
+                   other_last = HOP3c(last1, prog->anchored_offset+1, strend);
+                   s = HOP3c(last, 1, strend);
                    goto restart;
                }
                else {
                    DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
                          (long)(s - i_strpos)));
-                   t = s - prog->anchored_offset;
-                   other_last = s + 1;
+                   t = HOP3c(s, -prog->anchored_offset, strbeg);
+                   other_last = HOP3c(s, 1, strend);
                    s = s1;
                    if (t == strpos)
                        goto try_at_start;
@@ -555,11 +563,12 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                char *last, *last1;
                char *s1 = s;
 
-               t = s - start_shift;
-               last1 = last = strend - prog->minlen + prog->float_min_offset;
-               if (last - t > prog->float_max_offset)
-                   last = t + prog->float_max_offset;
-               s = t + prog->float_min_offset;
+               t = HOP3c(s, -start_shift, strbeg);
+               last1 = last =
+                   HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
+               if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
+                   last = HOP3c(t, prog->float_max_offset, strend);
+               s = HOP3c(t, prog->float_min_offset, strend);
                if (s < other_last)
                    s = other_last;
  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
@@ -587,8 +596,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                        ", trying anchored starting at offset %ld...\n",
                        (long)(s1 + 1 - i_strpos)));
                    other_last = last + 1;
-                   PL_regeol = strend;                 /* Used in HOP() */
-                   s = HOPc(t, 1);
+                   s = HOP3c(t, 1, strend);
                    goto restart;
                }
                else {
@@ -604,13 +612,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     }
 
     t = s - prog->check_offset_max;
-    tmp = PL_bostr;
     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
         && (!(prog->reganch & ROPT_UTF8)
-           || (PL_bostr = strpos, /* Used in regcopmaybe() */
-               ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
-                && t > strpos)))) {
-       PL_bostr = tmp;
+           || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
+                && t > strpos))) {
        /* Fixed substring is found far enough so that the match
           cannot start at strpos. */
       try_at_offset:
@@ -668,7 +673,6 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        ++BmUSEFUL(prog->check_substr); /* hooray/5 */
     }
     else {
-       PL_bostr = tmp;
        /* The found string does not prohibit matching at strpos,
           - no optimization of calling REx engine can be performed,
           unless it was an MBOL and we are not after MBOL,
@@ -721,13 +725,16 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
           regstclass does not come from lookahead...  */
        /* If regstclass takes bytelength more than 1: If charlength==1, OK.
           This leaves EXACTF only, which is dealt with in find_byclass().  */
+       U8* str = (U8*)STRING(prog->regstclass);
        int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
-                   ? STR_LEN(prog->regstclass)
+                   ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
                    : 1);
        char *endpos = (prog->anchored_substr || ml_anch)
-               ? s + (prog->minlen? cl_l : 0)
-               : (prog->float_substr ? check_at - start_shift + cl_l
-                                     : strend) ;
+               ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
+               : (prog->float_substr
+                  ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
+                          cl_l, strend)
+                  : strend);
        char *startpos = strbeg;
 
        t = s;
@@ -754,8 +761,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                if (prog->anchored_substr == check) {
                    DEBUG_r( what = "anchored" );
                  hop_and_restart:
-                   PL_regeol = strend; /* Used in HOP() */
-                   s = HOPc(t, 1);
+                   s = HOP3c(t, 1, strend);
                    if (s + start_shift + end_shift > strend) {
                        /* XXXX Should be taken into account earlier? */
                        DEBUG_r( PerlIO_printf(Perl_debug_log,
@@ -854,8 +860,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
        case EXACTF:
            m = STRING(c);
            ln = STR_LEN(c);
-           c1 = *(U8*)m;
-           c2 = PL_fold[c1];
+           if (UTF) {
+               c1 = to_utf8_lower((U8*)m);
+               c2 = to_utf8_upper((U8*)m);
+           }
+           else {
+               c1 = *(U8*)m;
+               c2 = PL_fold[c1];
+           }
            goto do_exactf;
        case EXACTFL:
            m = STRING(c);
@@ -867,27 +879,45 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 
            if (norun && e < s)
                e = s;                  /* Due to minlen logic of intuit() */
-           /* Here it is NOT UTF!  */
-           if (c1 == c2) {
-               while (s <= e) {
-                   if ( *(U8*)s == c1
-                        && (ln == 1 || !(OP(c) == EXACTF
-                                         ? ibcmp(s, m, ln)
-                                         : ibcmp_locale(s, m, ln)))
-                        && (norun || regtry(prog, s)) )
-                       goto got_it;
-                   s++;
-               }
-           } else {
-               while (s <= e) {
-                   if ( (*(U8*)s == c1 || *(U8*)s == c2)
-                        && (ln == 1 || !(OP(c) == EXACTF
-                                         ? ibcmp(s, m, ln)
-                                         : ibcmp_locale(s, m, ln)))
-                        && (norun || regtry(prog, s)) )
-                       goto got_it;
-                   s++;
-               }
+
+           if (do_utf8) {
+               STRLEN len;
+               if (c1 == c2)
+                   while (s <= e) {
+                       if ( utf8_to_uv_simple((U8*)s, &len) == c1
+                            && regtry(prog, s) )
+                           goto got_it;
+                       s += len;
+                   }
+               else
+                   while (s <= e) {
+                       UV c = utf8_to_uv_simple((U8*)s, &len);
+                       if ( (c == c1 || c == c2) && regtry(prog, s) )
+                           goto got_it;
+                       s += len;
+                   }
+           }
+           else {
+               if (c1 == c2)
+                   while (s <= e) {
+                       if ( *(U8*)s == c1
+                            && (ln == 1 || !(OP(c) == EXACTF
+                                             ? ibcmp(s, m, ln)
+                                             : ibcmp_locale(s, m, ln)))
+                            && (norun || regtry(prog, s)) )
+                           goto got_it;
+                       s++;
+                   }
+               else
+                   while (s <= e) {
+                       if ( (*(U8*)s == c1 || *(U8*)s == c2)
+                            && (ln == 1 || !(OP(c) == EXACTF
+                                             ? ibcmp(s, m, ln)
+                                             : ibcmp_locale(s, m, ln)))
+                            && (norun || regtry(prog, s)) )
+                           goto got_it;
+                       s++;
+                   }
            }
            break;
        case BOUNDL:
@@ -898,7 +928,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                if (s == startpos)
                    tmp = '\n';
                else {
-                   U8 *r = reghop((U8*)s, -1);
+                   U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
                    
                    tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
                }
@@ -940,7 +970,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                if (s == startpos)
                    tmp = '\n';
                else {
-                   U8 *r = reghop((U8*)s, -1);
+                   U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
                    
                    tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
                }
@@ -1346,6 +1376,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     I32 scream_pos = -1;               /* Internal iterator of scream. */
     char *scream_olds;
     SV* oreplsv = GvSV(PL_replgv);
+    bool do_utf8 = DO_UTF8(sv);
 
     PL_regcc = 0;
 
@@ -1361,12 +1392,22 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     }
 
     minlen = prog->minlen;
-    if (strend - startpos < minlen) goto phooey;
+    if (do_utf8) {
+      if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
+    }
+    else {
+      if (strend - startpos < minlen) goto phooey;
+    }
 
     if (startpos == strbeg)    /* is ^ valid at stringarg? */
        PL_regprev = '\n';
     else {
-       PL_regprev = (U32)stringarg[-1];
+        if (prog->reganch & ROPT_UTF8 && do_utf8) {
+           U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
+           PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0);
+       }
+       else
+           PL_regprev = (U32)stringarg[-1];
        if (!PL_multiline && PL_regprev == '\n')
            PL_regprev = '\0';          /* force ^ to NOT match */
     }
@@ -1454,7 +1495,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
 
            if (minlen)
                dontbother = minlen - 1;
-           end = HOPc(strend, -dontbother) - 1;
+           end = HOP3c(strend, -dontbother, strbeg) - 1;
            /* for multiline we only have to try after newlines */
            if (prog->check_substr) {
                if (s == startpos)
@@ -1500,7 +1541,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
        int did_match = 0;
 #endif
 
-       if (UTF) {
+       if (do_utf8) {
            while (s < strend) {
                if (*s == ch) {
                    DEBUG_r( did_match = 1 );
@@ -1529,18 +1570,19 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                              "Did not find anchored character...\n"));
     }
     /*SUPPRESS 560*/
-    else if (prog->anchored_substr != Nullsv
-            || (prog->float_substr != Nullsv 
-                && prog->float_max_offset < strend - s)) {
+    else if (do_utf8 == (UTF!=0) &&
+            (prog->anchored_substr != Nullsv
+             || (prog->float_substr != Nullsv 
+                 && prog->float_max_offset < strend - s))) {
        SV *must = prog->anchored_substr 
            ? prog->anchored_substr : prog->float_substr;
        I32 back_max = 
            prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
        I32 back_min = 
            prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
-       char *last = HOPc(strend,       /* Cannot start after this */
+       char *last = HOP3c(strend,      /* Cannot start after this */
                          -(I32)(CHR_SVLEN(must)
-                                - (SvTAIL(must) != 0) + back_min));
+                                - (SvTAIL(must) != 0) + back_min), strbeg);
        char *last1;            /* Last position checked before */
 #ifdef DEBUGGING
        int did_match = 0;
@@ -1558,9 +1600,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
        strend = HOPc(strend, -dontbother);
        while ( (s <= last) &&
                ((flags & REXEC_SCREAM) 
-                ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
+                ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
                                    end_shift, &scream_pos, 0))
-                : (s = fbm_instr((unsigned char*)HOP(s, back_min),
+                : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
                                  (unsigned char*)strend, must, 
                                  PL_multiline ? FBMrf_MULTILINE : 0))) ) {
            DEBUG_r( did_match = 1 );
@@ -1574,7 +1616,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                last1 = HOPc(s, -back_min);
                s = t;          
            }
-           if (UTF) {
+           if (do_utf8) {
                while (s <= last1) {
                    if (regtry(prog, s))
                        goto got_it;
@@ -1655,7 +1697,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
            dontbother = minlen - 1;
        strend -= dontbother;              /* this one's always in bytes! */
        /* We don't know much -- general case. */
-       if (UTF) {
+       if (do_utf8) {
            for (;;) {
                if (regtry(prog, s))
                    goto got_it;
@@ -1926,20 +1968,25 @@ S_regmatch(pTHX_ regnode *prog)
            SV *prop = sv_newmortal();
            int docolor = *PL_colors[0];
            int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
-           int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
+           int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
            /* The part of the string before starttry has one color
               (pref0_len chars), between starttry and current
               position another one (pref_len - pref0_len chars),
               after the current position the third one.
               We assume that pref0_len <= pref_len, otherwise we
               decrease pref0_len.  */
-           int pref_len = (locinput - PL_bostr > (5 + taill) - l 
-                           ? (5 + taill) - l : locinput - PL_bostr);
-           int pref0_len = pref_len  - (locinput - PL_reg_starttry);
+           int pref_len = (locinput - PL_bostr) > (5 + taill) - l 
+               ? (5 + taill) - l : locinput - PL_bostr;
+           int pref0_len;
 
+           while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
+               pref_len++;
+           pref0_len = pref_len  - (locinput - PL_reg_starttry);
            if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
                l = ( PL_regeol - locinput > (5 + taill) - pref_len 
                      ? (5 + taill) - pref_len : PL_regeol - locinput);
+           while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
+               l--;
            if (pref0_len < 0)
                pref0_len = 0;
            if (pref0_len > pref_len)
@@ -2013,7 +2060,7 @@ S_regmatch(pTHX_ regnode *prog)
                sayNO;
            break;
        case SANY:
-           if (DO_UTF8(PL_reg_sv)) {
+           if (do_utf8) {
                locinput += PL_utf8skip[nextchr];
                if (locinput > PL_regeol)
                    sayNO;
@@ -2025,20 +2072,46 @@ S_regmatch(pTHX_ regnode *prog)
            nextchr = UCHARAT(++locinput);
            break;
        case REG_ANY:
-           if (DO_UTF8(PL_reg_sv)) {
+           if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
+               sayNO;
+           if (do_utf8) {
                locinput += PL_utf8skip[nextchr];
                if (locinput > PL_regeol)
                    sayNO;
                nextchr = UCHARAT(locinput);
-               break;
            }
-           if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
-               sayNO;
-           nextchr = UCHARAT(++locinput);
+           else
+               nextchr = UCHARAT(++locinput);
            break;
        case EXACT:
            s = STRING(scan);
            ln = STR_LEN(scan);
+           if (do_utf8 != (UTF!=0)) {
+               char *l = locinput;
+               char *e = s + ln;
+               STRLEN len;
+               if (do_utf8)
+                   while (s < e) {
+                       if (l >= PL_regeol)
+                           sayNO;
+                       if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len))
+                           sayNO;
+                       s++;
+                       l += len;
+                   }
+               else
+                   while (s < e) {
+                       if (l >= PL_regeol)
+                           sayNO;
+                       if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len))
+                           sayNO;
+                       s += len;
+                       l++;
+                   }
+               locinput = l;
+               nextchr = UCHARAT(locinput);
+               break;
+           }
            /* Inline the first character, for speed. */
            if (UCHARAT(s) != nextchr)
                sayNO;
@@ -2056,21 +2129,19 @@ S_regmatch(pTHX_ regnode *prog)
            s = STRING(scan);
            ln = STR_LEN(scan);
 
-           if (UTF) {
+           if (do_utf8) {
                char *l = locinput;
-               char *e = s + ln;
+               char *e;
+               e = s + ln;
                c1 = OP(scan) == EXACTF;
                while (s < e) {
-                   if (l >= PL_regeol)
-                       sayNO;
-                   if (utf8_to_uv((U8*)s, e - s, 0, 0) !=
-                       (c1 ?
-                        toLOWER_utf8((U8*)l) :
-                        toLOWER_LC_utf8((U8*)l)))
-                   {
+                   if (l >= PL_regeol) {
                        sayNO;
                    }
-                   s += UTF8SKIP(s);
+                   if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
+                       (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
+                           sayNO;
+                   s += UTF ? UTF8SKIP(s) : 1;
                    l += UTF8SKIP(l);
                }
                locinput = l;
@@ -2201,7 +2272,7 @@ S_regmatch(pTHX_ regnode *prog)
        case SPACE:
            if (!nextchr)
                sayNO;
-           if (DO_UTF8(PL_reg_sv)) {
+           if (do_utf8) {
                if (nextchr & 0x80) {
                    if (!(OP(scan) == SPACE
                          ? swash_fetch(PL_utf8_space, (U8*)locinput)
@@ -2231,7 +2302,7 @@ S_regmatch(pTHX_ regnode *prog)
        case NSPACE:
            if (!nextchr && locinput >= PL_regeol)
                sayNO;
-           if (DO_UTF8(PL_reg_sv)) {
+           if (do_utf8) {
                if (OP(scan) == NSPACE
                    ? swash_fetch(PL_utf8_space, (U8*)locinput)
                    : isSPACE_LC_utf8((U8*)locinput))
@@ -2253,7 +2324,7 @@ S_regmatch(pTHX_ regnode *prog)
        case DIGIT:
            if (!nextchr)
                sayNO;
-           if (DO_UTF8(PL_reg_sv)) {
+           if (do_utf8) {
                if (!(OP(scan) == DIGIT
                      ? swash_fetch(PL_utf8_digit, (U8*)locinput)
                      : isDIGIT_LC_utf8((U8*)locinput)))
@@ -2275,7 +2346,7 @@ S_regmatch(pTHX_ regnode *prog)
        case NDIGIT:
            if (!nextchr && locinput >= PL_regeol)
                sayNO;
-           if (DO_UTF8(PL_reg_sv)) {
+           if (do_utf8) {
                if (OP(scan) == NDIGIT
                    ? swash_fetch(PL_utf8_digit, (U8*)locinput)
                    : isDIGIT_LC_utf8((U8*)locinput))
@@ -2315,7 +2386,7 @@ S_regmatch(pTHX_ regnode *prog)
                break;
 
            s = PL_bostr + ln;
-           if (UTF && OP(scan) != REF) {       /* REF can do byte comparison */
+           if (do_utf8 && OP(scan) != REF) {   /* REF can do byte comparison */
                char *l = locinput;
                char *e = PL_bostr + PL_regendp[n];
                /*
@@ -2420,7 +2491,6 @@ S_regmatch(pTHX_ regnode *prog)
                        I32 onpar = PL_regnpar;
 
                        pm.op_pmflags = 0;
-                       pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
                        re = CALLREGCOMP(aTHX_ t, t + len, &pm);
                        if (!(SvFLAGS(ret) 
                              & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
@@ -3035,13 +3105,23 @@ S_regmatch(pTHX_ regnode *prog)
            * when we know what character comes next.
            */
            if (PL_regkind[(U8)OP(next)] == EXACT) {
-               c1 = (U8)*STRING(next);
-               if (OP(next) == EXACTF)
-                   c2 = PL_fold[c1];
-               else if (OP(next) == EXACTFL)
-                   c2 = PL_fold_locale[c1];
-               else
-                   c2 = c1;
+               U8 *s = (U8*)STRING(next);
+               if (!UTF) {
+                   c2 = c1 = *s;
+                   if (OP(next) == EXACTF)
+                       c2 = PL_fold[c1];
+                   else if (OP(next) == EXACTFL)
+                       c2 = PL_fold_locale[c1];
+               }
+               else { /* UTF */
+                   if (OP(next) == EXACTF) {
+                       c1 = to_utf8_lower(s);
+                       c2 = to_utf8_upper(s);
+                   }
+                   else {
+                       c2 = c1 = utf8_to_uv_simple(s, NULL);
+                   }
+               }
            }
            else
                c1 = c2 = -1000;
@@ -3054,29 +3134,65 @@ S_regmatch(pTHX_ regnode *prog)
                locinput = PL_reginput;
                REGCP_SET(lastcp);
                if (c1 != -1000) {
-                   char *e = locinput + n - ln; /* Should not check after this */
+                   char *e; /* Should not check after this */
                    char *old = locinput;
 
-                   if (e >= PL_regeol || (n == REG_INFTY))
+                   if  (n == REG_INFTY) {
                        e = PL_regeol - 1;
+                       if (do_utf8)
+                           while (UTF8_IS_CONTINUATION(*(U8*)e))
+                               e--;
+                   }
+                   else if (do_utf8) {
+                       int m = n - ln;
+                       for (e = locinput;
+                            m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
+                           e += UTF8SKIP(e);
+                   }
+                   else {
+                       e = locinput + n - ln;
+                       if (e >= PL_regeol)
+                           e = PL_regeol - 1;
+                   }
                    while (1) {
+                       int count;
                        /* Find place 'next' could work */
-                       if (c1 == c2) {
-                           while (locinput <= e && *locinput != c1)
-                               locinput++;
-                       } else {
-                           while (locinput <= e 
-                                  && *locinput != c1
-                                  && *locinput != c2)
-                               locinput++;                         
+                       if (!do_utf8) {
+                           if (c1 == c2) {
+                               while (locinput <= e && *locinput != c1)
+                                   locinput++;
+                           } else {
+                               while (locinput <= e 
+                                      && *locinput != c1
+                                      && *locinput != c2)
+                                   locinput++;
+                           }
+                           count = locinput - old;
+                       }
+                       else {
+                           STRLEN len;
+                           if (c1 == c2) {
+                               for (count = 0;
+                                    locinput <= e &&
+                                        utf8_to_uv_simple((U8*)locinput, &len) != c1;
+                                    count++)
+                                   locinput += len;
+                               
+                           } else {
+                               for (count = 0; locinput <= e; count++) {
+                                   UV c = utf8_to_uv_simple((U8*)locinput, &len);
+                                   if (c == c1 || c == c2)
+                                       break;
+                                   locinput += len;                        
+                               }
+                           }
                        }
                        if (locinput > e) 
                            sayNO;
                        /* PL_reginput == old now */
                        if (locinput != old) {
                            ln = 1;     /* Did some */
-                           if (regrepeat(scan, locinput - old) <
-                                locinput - old)
+                           if (regrepeat(scan, count) < count)
                                sayNO;
                        }
                        /* PL_reginput == locinput now */
@@ -3084,15 +3200,24 @@ S_regmatch(pTHX_ regnode *prog)
                        PL_reginput = locinput; /* Could be reset... */
                        REGCP_UNWIND(lastcp);
                        /* Couldn't or didn't -- move forward. */
-                       old = locinput++;
+                       old = locinput;
+                       if (do_utf8)
+                           locinput += UTF8SKIP(locinput);
+                       else
+                           locinput++;
                    }
                }
                else
                while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
+                   UV c;
+                   if (c1 != -1000) {
+                       if (do_utf8)
+                           c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+                       else
+                           c = UCHARAT(PL_reginput); 
+                   }
                    /* If it could work, try it. */
-                   if (c1 == -1000 ||
-                       UCHARAT(PL_reginput) == c1 ||
-                       UCHARAT(PL_reginput) == c2)
+                   if (c1 == -1000 || c == c1 || c == c2)
                    {
                        TRYPAREN(paren, n, PL_reginput);
                        REGCP_UNWIND(lastcp);
@@ -3122,11 +3247,16 @@ S_regmatch(pTHX_ regnode *prog)
                }
                REGCP_SET(lastcp);
                if (paren) {
+                   UV c;
                    while (n >= ln) {
+                       if (c1 != -1000) {
+                           if (do_utf8)
+                               c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+                           else
+                               c = UCHARAT(PL_reginput); 
+                       }
                        /* If it could work, try it. */
-                       if (c1 == -1000 ||
-                           UCHARAT(PL_reginput) == c1 ||
-                           UCHARAT(PL_reginput) == c2)
+                       if (c1 == -1000 || c == c1 || c == c2)
                            {
                                TRYPAREN(paren, n, PL_reginput);
                                REGCP_UNWIND(lastcp);
@@ -3137,11 +3267,16 @@ S_regmatch(pTHX_ regnode *prog)
                    }
                }
                else {
+                   UV c;
                    while (n >= ln) {
+                       if (c1 != -1000) {
+                           if (do_utf8)
+                               c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+                           else
+                               c = UCHARAT(PL_reginput); 
+                       }
                        /* If it could work, try it. */
-                       if (c1 == -1000 ||
-                           UCHARAT(PL_reginput) == c1 ||
-                           UCHARAT(PL_reginput) == c2)
+                       if (c1 == -1000 || c == c1 || c == c2)
                            {
                                TRYPAREN(paren, n, PL_reginput);
                                REGCP_UNWIND(lastcp);
@@ -3401,9 +3536,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
       loceol = scan + max;
     switch (OP(p)) {
     case REG_ANY:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && *scan != '\n') {
+           while (scan < loceol && hardcount < max && *scan != '\n') {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3413,9 +3548,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case SANY:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol) {
+           while (hardcount < max && scan < loceol) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3444,7 +3579,8 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case ANYOF:
        if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) {
+           while (hardcount < max && scan < loceol &&
+                  reginclass(p, (U8*)scan, do_utf8)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3454,9 +3590,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case ALNUM:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+           while (hardcount < max && scan < loceol &&
+                  swash_fetch(PL_utf8_alnum, (U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3467,9 +3604,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        break;
     case ALNUML:
        PL_reg_flags |= RF_tainted;
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
+           while (hardcount < max && scan < loceol &&
+                  isALNUM_LC_utf8((U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3479,9 +3617,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case NALNUM:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+           while (hardcount < max && scan < loceol &&
+                  !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3492,9 +3631,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        break;
     case NALNUML:
        PL_reg_flags |= RF_tainted;
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
+           while (hardcount < max && scan < loceol &&
+                  !isALNUM_LC_utf8((U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3504,9 +3644,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case SPACE:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol &&
+           while (hardcount < max && scan < loceol &&
                   (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
                scan += UTF8SKIP(scan);
                hardcount++;
@@ -3518,9 +3658,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        break;
     case SPACEL:
        PL_reg_flags |= RF_tainted;
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol &&
+           while (hardcount < max && scan < loceol &&
                   (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
                scan += UTF8SKIP(scan);
                hardcount++;
@@ -3531,9 +3671,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case NSPACE:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol &&
+           while (hardcount < max && scan < loceol &&
                   !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
                scan += UTF8SKIP(scan);
                hardcount++;
@@ -3545,9 +3685,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
     case NSPACEL:
        PL_reg_flags |= RF_tainted;
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol &&
+           while (hardcount < max && scan < loceol &&
                   !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
                scan += UTF8SKIP(scan);
                hardcount++;
@@ -3558,9 +3698,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case DIGIT:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
+           while (hardcount < max && scan < loceol &&
+                  swash_fetch(PL_utf8_digit,(U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3570,9 +3711,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case NDIGIT:
-       if (DO_UTF8(PL_reg_sv)) {
+       if (do_utf8) {
            loceol = PL_regeol;
-           while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
+           while (hardcount < max && scan < loceol &&
+                  !swash_fetch(PL_utf8_digit,(U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -3623,7 +3765,7 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
        return 0;
 
     start = PL_reginput;
-    if (UTF) {
+    if (DO_UTF8(PL_reg_sv)) {
        while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
            if (!count++) {
                l = 0;
@@ -3701,16 +3843,21 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
 {
     char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
+    UV c;
+    STRLEN len;
+
+    if (do_utf8)
+       c = utf8_to_uv_simple(p, &len);
+    else
+       c = *p;
 
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
        if (do_utf8 && !ANYOF_RUNTIME(n)) {
-           STRLEN len;
-           UV c = utf8_to_uv_simple(p, &len);
-
            if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
                match = TRUE;
        }
-
+       if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
+           match = TRUE;
        if (!match) {
            SV *sw = regclass_swash(n, TRUE, 0);
        
@@ -3724,17 +3871,15 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
                        PL_reg_flags |= RF_tainted;
                        uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
                    }
-               else
-                   uv_to_utf8(tmpbuf, toLOWER_utf8(p));
+                   else
+                       uv_to_utf8(tmpbuf, toLOWER_utf8(p));
                    if (swash_fetch(sw, tmpbuf))
                        match = TRUE;
                }
            }
        }
     }
-    else {
-       U8 c = *p;
-
+    if (!match && c < 256) {
        if (ANYOF_BITMAP_TEST(n, c))
            match = TRUE;
        else if (flags & ANYOF_FOLD) {
@@ -3796,18 +3941,24 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
 STATIC U8 *
 S_reghop(pTHX_ U8 *s, I32 off)
 {                               
+    return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
+}
+
+STATIC U8 *
+S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
+{                               
     if (off >= 0) {
-       while (off-- && s < (U8*)PL_regeol) {
+       while (off-- && s < lim) {
            /* XXX could check well-formedness here */
            s += UTF8SKIP(s);
        }
     }
     else {
        while (off++) {
-           if (s > (U8*)PL_bostr) {
+           if (s > lim) {
                s--;
                if (UTF8_IS_CONTINUED(*s)) {
-                   while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
+                   while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
                        s--;
                }
                /* XXX could check well-formedness here */
@@ -3818,10 +3969,16 @@ S_reghop(pTHX_ U8 *s, I32 off)
 }
 
 STATIC U8 *
-S_reghopmaybe(pTHX_ U8* s, I32 off)
+S_reghopmaybe(pTHX_ U8 *s, I32 off)
+{                               
+    return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
+}
+
+STATIC U8 *
+S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
 {
     if (off >= 0) {
-       while (off-- && s < (U8*)PL_regeol) {
+       while (off-- && s < lim) {
            /* XXX could check well-formedness here */
            s += UTF8SKIP(s);
        }
@@ -3830,10 +3987,10 @@ S_reghopmaybe(pTHX_ U8* s, I32 off)
     }
     else {
        while (off++) {
-           if (s > (U8*)PL_bostr) {
+           if (s > lim) {
                s--;
                if (UTF8_IS_CONTINUED(*s)) {
-                   while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
+                   while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
                        s--;
                }
                /* XXX could check well-formedness here */
diff --git a/sv.c b/sv.c
index 662b974..3a32525 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -3517,7 +3517,7 @@ Perl_sv_setpvn(pTHX_ register SV *sv, register const char *ptr, register STRLEN
     Move(ptr,dptr,len,char);
     dptr[len] = '\0';
     SvCUR_set(sv, len);
-    (void)SvPOK_only(sv);              /* validate pointer */
+    (void)SvPOK_only_UTF8(sv);         /* validate pointer */
     SvTAINT(sv);
 }
 
@@ -3561,7 +3561,7 @@ Perl_sv_setpv(pTHX_ register SV *sv, register const char *ptr)
     SvGROW(sv, len + 1);
     Move(ptr,SvPVX(sv),len+1,char);
     SvCUR_set(sv, len);
-    (void)SvPOK_only(sv);              /* validate pointer */
+    (void)SvPOK_only_UTF8(sv);         /* validate pointer */
     SvTAINT(sv);
 }
 
@@ -3611,7 +3611,7 @@ Perl_sv_usepvn(pTHX_ register SV *sv, register char *ptr, register STRLEN len)
     SvCUR_set(sv, len);
     SvLEN_set(sv, len+1);
     *SvEND(sv) = '\0';
-    (void)SvPOK_only(sv);              /* validate pointer */
+    (void)SvPOK_only_UTF8(sv);         /* validate pointer */
     SvTAINT(sv);
 }
 
@@ -4658,13 +4658,24 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2)
 
     /* do not utf8ize the comparands as a side-effect */
     if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) {
+       if (PL_hints & HINT_UTF8_DISTINCT)
+           return FALSE;
+
        if (SvUTF8(sv1)) {
-           pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
-           pv2tmp = TRUE;
+           (void)utf8_to_bytes((U8*)(pv1 = savepvn(pv1, cur1)), &cur1);
+           if (cur1 < 0) {
+               Safefree(pv1);
+               return 0;
+           }
+           pv1tmp = TRUE;
        }
        else {
-           pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1);
-           pv1tmp = TRUE;
+           (void)utf8_to_bytes((U8*)(pv2 = savepvn(pv2, cur2)), &cur2);
+           if (cur2 < 0) {
+               Safefree(pv2);
+               return 0;
+           }
+           pv2tmp = TRUE;
        }
     }
 
@@ -4714,6 +4725,9 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2)
 
     /* do not utf8ize the comparands as a side-effect */
     if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) {
+       if (PL_hints & HINT_UTF8_DISTINCT)
+           return SvUTF8(sv1) ? 1 : -1;
+
        if (SvUTF8(sv1)) {
            pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
            pv2tmp = TRUE;
index 9a6586d..90c38e0 100755 (executable)
@@ -1,6 +1,6 @@
 #!./perl
 
-print "1..29\n";
+print "1..30\n";
 
 $FS = ':';
 
@@ -127,3 +127,8 @@ print "ok 28\n";
 $_ = join ':', split /(?=\w)/, "rm b";
 print "not" if $_ ne "r:m :b";
 print "ok 29\n";
+
+# unicode splittage
+@ary = map {ord} split //, v1.20.300.4000.50000.4000.300.20.1;
+print "not " unless "@ary" eq "1 20 300 4000 50000 4000 300 20 1";
+print "ok 30\n";
diff --git a/utf8.c b/utf8.c
index 24dc692..f65c94f 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -163,9 +163,13 @@ bool
 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 {
     U8* x = s;
-    U8* send = s + len;
+    U8* send;
     STRLEN c;
 
+    if (!len)
+       len = strlen(s);
+    send = s + len;
+
     while (x < send) {
         c = is_utf8_char(x);
        if (!c)