From: David Mitchell <davem@iabyn.com>
Date: Mon, 3 May 2010 12:57:58 +0000 (+0100)
Subject: tries: don't allocate memory at runtime
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2e64971a6530d2645969bc489f564bfd3ce64993;p=p5sagit%2Fp5-mst-13.2.git

tries: don't allocate memory at runtime

This is an indirect fix for
    [perl #74484] Regex causing exponential runtime+mem usage

The trie runtime code was doing more SAVETMPS than FREETMPS and was thus
growing a large tmps stack on heavy backtracking. Rather than fixing this
directly, I rewrote part of the trie code so that it no longer needs to
allocate memory in S_regmatch (it still does in find_byclass()).

The basic issue is that multiple branches in the trie may trigger an
accept state; for example:

    "abcd" =~ /xyz/abcd.*X|ab.*Y|/

here, words (branches) 2 and 3 are accept states. The original approach
was, at run time, to create a list of accepted word numbers and the
character positions of the end of each of those words. Then run the rest
of the pattern for each word in the list in turn (in word index order).
This requires memory for the list to be allocated and freed.

The new approach involves creating extra info at compile time; in
particular, for each word, a pointer to the previous accepted word (if
any) in the state tree. For example for the above pattern, part of the
state tree may be

      q    b    c    d
    1 -> 2 -> 3 -> 4 -> 5
            (#3)       (#2)

(e.g. at state 1, if the next char is 'a', we transition to state 2).
Here, state 3 is an accept state with word #3, and 5 is an accept state
with word #2. So we build a table indexed by word number, which has
wordinfo[2] = 3, wordinfo[3] = 0, thus building the word chain 2->3->0.

At run time we run the trie to completion, and remember the word
associated with the longest accept state (word #2 above). Then by following
back the chain of .prev fields, we can produce a list of all accepting
words. We then iteratively find the smallest-numbered (ie LH-most) word in
the chain, and run with it. On failure and backtrack, we find the
next-smallest and so on.

Since we are no longer recording the end-position of each word in the
string, we have to recalculate this for each backtrack. We initially
record the end-position of the shortest accepting word, and given that we
know the length of each word, we can calculate the new position each time
as an offset from that first word. Depending on unicode and folding, that
calculation can be cheap or expensive.

This algorithm is optimised for the typical case where there are a small
number (<= 2) accepting states.

This patch creates a new compile-time array, trie->wordinfo[], indexed by
word number, which contains relevant info about each word. This also
supersedes the old trie->newword[] array, whose function of recording
"overspills" of multiple words per accept state, is now handled as part of
the wordinfo[].prev chain.
---

diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t
index 46e6ec0..c24c32f 100644
--- a/ext/re/t/regop.t
+++ b/ext/re/t/regop.t
@@ -41,7 +41,9 @@ foreach my $testout ( @tests ) {
         s/\s+$//;
         ok( $testout=~/\Q$_\E/, "$_: /$pattern/" )
             or do {
-                !$diaged++ and diag("$_: /$pattern/\n$testout");
+                !$diaged++ and diag("PATTERN: /$pattern/\n\n"
+		    . "EXPECTED:\n$_\n\n"
+		    . "WITHIN GOT:\n$testout");
             };
     }
 }
@@ -152,16 +154,17 @@ minlen 3
 #       #   8| W   4 @   0 
 #       #   9| W   5 @   0 
 #       #   A| W   6 @   0 
+#     word_info N:(prev,char)= 1:(0,1) 2:(0,1) 3:(0,1) 4:(0,1) 5:(0,1) 6:(0,1)
 # Final program:
-#    1: EXACT <ABC>(3)
-#    3: TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20)
+#    1: EXACT <ABC> (3)
+#    3: TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP] (20)
 #       <P> 
 #       <G> 
 #       <E> 
 #       <B> 
 #       <A> 
 #       <D> 
-#   20: END(0)
+#   20: END (0)
 # anchored "ABC" at 0 (checking anchored) minlen 4 
 # Offsets: [20]
 # 	1:4[3] 3:4[15] 19:32[0] 20:34[0] 
@@ -172,10 +175,10 @@ minlen 3
 #    0 <> <ABCD>               |  1:EXACT <ABC>(3)
 #    3 <ABC> <D>               |  3:TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20)
 #    3 <ABC> <D>               |    State:    4 Accepted:    0 Charid:  7 CP:  44 After State:    a
-#    4 <ABCD> <>               |    State:    a Accepted:    1 Charid:  6 CP:   0 After State:    0
+#    4 <ABCD> <>               |    State:    a Accepted:    1 Charid:  7 CP:   0 After State:    0
 #                                   got 1 possible matches
-#                                   only one match left: #6 <D>
-#    4 <ABCD> <>               | 20:END(0)
+#                                   TRIE matched word #6, continuing
+#    4 <ABCD> <>               | 20:  END(0)
 # Match successful!
 # %MATCHED%
 # Freeing REx: "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)"
@@ -183,7 +186,6 @@ minlen 3
 EXACT <ABC>
 TRIEC-EXACT
 [A-EGP]
-only one match left: #6 <D>
 S:4/10
 W:6
 L:1/1
diff --git a/regcomp.c b/regcomp.c
index 1a815c6..f665f0b 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -878,6 +878,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
     U32 state;
     SV *sv=sv_newmortal();
     int colwidth= widecharmap ? 6 : 4;
+    U16 word;
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_DUMP_TRIE;
@@ -947,6 +948,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
         }
         PerlIO_printf( Perl_debug_log, "\n" );
     }
+    PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
+    for (word=1; word <= trie->wordcount; word++) {
+	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
+	    (int)word, (int)(trie->wordinfo[word].prev),
+	    (int)(trie->wordinfo[word].len));
+    }
+    PerlIO_printf(Perl_debug_log, "\n" );
 }    
 /*
   Dumps a fully constructed but uncompressed trie in list form.
@@ -1077,6 +1085,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
 
 #endif
 
+
 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
   startbranch: the first branch in the whole branch sequence
   first      : start branch of sequence of branch-exact nodes.
@@ -1257,8 +1266,6 @@ is the recommended Unicode-aware way of saying
     U16 dupe= trie->states[ state ].wordnum;                    \
     regnode * const noper_next = regnext( noper );              \
                                                                 \
-    if (trie->wordlen)                                          \
-        trie->wordlen[ curword ] = wordlen;                     \
     DEBUG_r({                                                   \
         /* store the word for dumping */                        \
         SV* tmp;                                                \
@@ -1270,6 +1277,9 @@ is the recommended Unicode-aware way of saying
     });                                                         \
                                                                 \
     curword++;                                                  \
+    trie->wordinfo[curword].prev   = 0;                         \
+    trie->wordinfo[curword].len    = wordlen;                   \
+    trie->wordinfo[curword].accept = state;                     \
                                                                 \
     if ( noper_next < tail ) {                                  \
         if (!trie->jump)                                        \
@@ -1282,16 +1292,11 @@ is the recommended Unicode-aware way of saying
     }                                                           \
                                                                 \
     if ( dupe ) {                                               \
-        /* So it's a dupe. This means we need to maintain a   */\
-        /* linked-list from the first to the next.            */\
-        /* we only allocate the nextword buffer when there    */\
-        /* a dupe, so first time we have to do the allocation */\
-        if (!trie->nextword)                                    \
-            trie->nextword = (U16 *)					\
-		PerlMemShared_calloc( word_count + 1, sizeof(U16));	\
-        while ( trie->nextword[dupe] )                          \
-            dupe= trie->nextword[dupe];                         \
-        trie->nextword[dupe]= curword;                          \
+        /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
+        /* chain, so that when the bits of chain are later    */\
+        /* linked together, the dups appear in the chain      */\
+	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
+	trie->wordinfo[dupe].prev = curword;                    \
     } else {                                                    \
         /* we haven't inserted this word yet.                */ \
         trie->states[ state ].wordnum = curword;                \
@@ -1329,6 +1334,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     regnode *jumper = NULL;
     regnode *nextbranch = NULL;
     regnode *convert = NULL;
+    U32 *prev_states; /* temp array mapping each state to previous one */
     /* we just use folder as a flag in utf8 */
     const U8 * const folder = ( flags == EXACTF
                        ? PL_fold
@@ -1364,6 +1370,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
     if (!(UTF && folder))
 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
+    trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
+                       trie->wordcount+1, sizeof(reg_trie_wordinfo));
+
     DEBUG_r({
         trie_words = newAV();
     });
@@ -1496,7 +1505,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
 		(int)trie->minlen, (int)trie->maxlen )
     );
-    trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
 
     /*
         We now know what we are dealing with in terms of unique chars and
@@ -1520,6 +1528,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     */
 
 
+    Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
+    prev_states[1] = 0;
+
     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
         /*
             Second Pass -- Array Of Lists Representation
@@ -1590,6 +1601,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                         }
                         if ( ! newstate ) {
                             newstate = next_alloc++;
+			    prev_states[newstate] = state;
                             TRIE_LIST_PUSH( state, charid, newstate );
                             transcount++;
                         }
@@ -1773,6 +1785,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                         if ( !trie->trans[ state + charid ].next ) {
                             trie->trans[ state + charid ].next = next_alloc;
                             trie->trans[ state ].check++;
+			    prev_states[TRIE_NODENUM(next_alloc)]
+				    = TRIE_NODENUM(state);
                             next_alloc += trie->uniquecharcount;
                         }
                         state = trie->trans[ state + charid ].next;
@@ -1920,9 +1934,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 	PerlMemShared_realloc( trie->trans, trie->lasttrans
 			       * sizeof(reg_trie_trans) );
 
-    /* and now dump out the compressed format */
-    DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
-
     {   /* Modify the program and insert the new TRIE node*/ 
         U8 nodetype =(U8)(flags & 0xFF);
         char *str=NULL;
@@ -2052,6 +2063,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 		    break;
 		}
 	    }
+	    trie->prefixlen = (state-1);
             if (str) {
                 regnode *n = convert+NODE_SZ_STR(convert);
                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
@@ -2147,6 +2159,42 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
         });
     } /* end node insert */
+
+    /*  Finish populating the prev field of the wordinfo array.  Walk back
+     *  from each accept state until we find another accept state, and if
+     *  so, point the first word's .prev field at the second word. If the
+     *  second already has a .prev field set, stop now. This will be the
+     *  case either if we've already processed that word's accept state,
+     *  or that that state had multiple words, and the overspill words
+     *  were already linked up earlier.
+     */
+    {
+	U16 word;
+	U32 state;
+	U16 prev;
+
+	for (word=1; word <= trie->wordcount; word++) {
+	    prev = 0;
+	    if (trie->wordinfo[word].prev)
+		continue;
+	    state = trie->wordinfo[word].accept;
+	    while (state) {
+		state = prev_states[state];
+		if (!state)
+		    break;
+		prev = trie->states[state].wordnum;
+		if (prev)
+		    break;
+	    }
+	    trie->wordinfo[word].prev = prev;
+	}
+	Safefree(prev_states);
+    }
+
+
+    /* and now dump out the compressed format */
+    DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
+
     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
 #ifdef DEBUGGING
     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
@@ -9571,12 +9619,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
                         PerlMemShared_free(trie->trans);
                         if (trie->bitmap)
                             PerlMemShared_free(trie->bitmap);
-                        if (trie->wordlen)
-                            PerlMemShared_free(trie->wordlen);
                         if (trie->jump)
                             PerlMemShared_free(trie->jump);
-                        if (trie->nextword)
-                            PerlMemShared_free(trie->nextword);
+			PerlMemShared_free(trie->wordinfo);
                         /* do this last!!!! */
                         PerlMemShared_free(ri->data->data[n]);
 		    }
diff --git a/regcomp.h b/regcomp.h
index 20b4401..a20d6e1 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -586,6 +586,15 @@ struct _reg_trie_state {
   } trans;
 };
 
+/* info per word; indexed by wordnum */
+typedef struct {
+    U16  prev;	/* previous word in acceptance chain; eg in
+		 * zzz|abc|ab/ after matching the chars abc, the
+		 * accepted word is #2, and the previous accepted
+		 * word is #3 */
+    U32 len;	/* how many chars long is this word? */
+    U32 accept;	/* accept state for this word */
+} reg_trie_wordinfo;
 
 
 typedef struct _reg_trie_state    reg_trie_state;
@@ -603,15 +612,14 @@ struct _reg_trie_data {
     reg_trie_state  *states;         /* state data */
     reg_trie_trans  *trans;          /* array of transition elements */
     char            *bitmap;         /* stclass bitmap */
-    U32             *wordlen;        /* array of lengths of words */
     U16 	    *jump;           /* optional 1 indexed array of offsets before tail 
                                         for the node following a given word. */
-    U16	            *nextword;       /* optional 1 indexed array to support linked list
-                                        of duplicate wordnums */
+    reg_trie_wordinfo *wordinfo;     /* array of info per word */
     U16             uniquecharcount; /* unique chars in trie (width of trans table) */
     U32             startstate;      /* initial state - used for common prefix optimisation */
     STRLEN          minlen;          /* minimum length of words in trie - build/opt only? */
     STRLEN          maxlen;          /* maximum length of words in trie - build/opt only? */
+    U32             prefixlen;       /* #chars in common prefix */
     U32             statecount;      /* Build only - number of states in the states array 
                                         (including the unused zero state) */
     U32             wordcount;       /* Build only */
diff --git a/regexec.c b/regexec.c
index 7222efe..4aa68ef 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1736,7 +1736,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                         }
                                             
                         if ( word ) {
-                            U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
+                            U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
                             if (!leftmost || lpos < leftmost) {
                                 DEBUG_r(accepted_word=word);
                                 leftmost= lpos;
@@ -1810,7 +1810,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                         }
                     }
                     if ( aho->states[ state ].wordnum ) {
-                        U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
+                        U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
                         if (!leftmost || lpos < leftmost) {
                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
                             leftmost = lpos;
@@ -2505,9 +2505,6 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
 #define REPORT_CODE_OFF 32
 
 
-/* Make sure there is a test for this +1 options in re_tests */
-#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
-
 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
 
@@ -3069,6 +3066,50 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             }
             /* FALL THROUGH */
 	case TRIE:
+	    /* the basic plan of execution of the trie is:
+	     * At the beginning, run though all the states, and
+	     * find the longest-matching word. Also remember the position
+	     * of the shortest matching word. For example, this pattern:
+	     *    1  2 3 4    5
+	     *    ab|a|x|abcd|abc
+	     * when matched against the string "abcde", will generate
+	     * accept states for all words except 3, with the longest
+	     * matching word being 4, and the shortest being 1 (with
+	     * the position being after char 1 of the string).
+	     *
+	     * Then for each matching word, in word order (i.e. 1,2,4,5),
+	     * we run the remainder of the pattern; on each try setting
+	     * the current position to the character following the word,
+	     * returning to try the next word on failure.
+	     *
+	     * We avoid having to build a list of words at runtime by
+	     * using a compile-time structure, wordinfo[].prev, which
+	     * gives, for each word, the previous accepting word (if any).
+	     * In the case above it would contain the mappings 1->2, 2->0,
+	     * 3->0, 4->5, 5->1.  We can use this table to generate, from
+	     * the longest word (4 above), a list of all words, by
+	     * following the list of prev pointers; this gives us the
+	     * unordered list 4,5,1,2. Then given the current word we have
+	     * just tried, we can go through the list and find the
+	     * next-biggest word to try (so if we just failed on word 2,
+	     * the next in the list is 4).
+	     *
+	     * Since at runtime we don't record the matching position in
+	     * the string for each word, we have to work that out for
+	     * each word we're about to process. The wordinfo table holds
+	     * the character length of each word; given that we recorded
+	     * at the start: the position of the shortest word and its
+	     * length in chars, we just need to move the pointer the
+	     * difference between the two char lengths. Depending on
+	     * Unicode status and folding, that's cheap or expensive.
+	     *
+	     * This algorithm is optimised for the case where are only a
+	     * small number of accept states, i.e. 0,1, or maybe 2.
+	     * With lots of accepts states, and having to try all of them,
+	     * it becomes quadratic on number of accept states to find all
+	     * the next words.
+	     */
+
 	    {
                 /* what type of TRIE am I? (utf8 makes this contextual) */
                 DECL_TRIE_TYPE(scan);
@@ -3105,76 +3146,62 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		STRLEN len = 0;
 		STRLEN foldlen = 0;
 		U8 *uscan = (U8*)NULL;
-		STRLEN bufflen=0;
-		SV *sv_accept_buff = NULL;
 		U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+		U32 charcount = 0; /* how many input chars we have matched */
+		U32 accepted = 0; /* have we seen any accepting states? */
 
-	    	ST.accepted = 0; /* how many accepting states we have seen */
 		ST.B = next;
 		ST.jump = trie->jump;
 		ST.me = scan;
-	        /*
-        	   traverse the TRIE keeping track of all accepting states
-        	   we transition through until we get to a failing node.
-        	*/
+		ST.firstpos = NULL;
+		ST.longfold = FALSE; /* char longer if folded => it's harder */
+		ST.nextword = 0;
+
+		/* fully traverse the TRIE; note the position of the
+		   shortest accept state and the wordnum of the longest
+		   accept state */
 
 		while ( state && uc <= (U8*)PL_regeol ) {
                     U32 base = trie->states[ state ].trans.base;
                     UV uvc = 0;
                     U16 charid;
-                    /* We use charid to hold the wordnum as we don't use it
-                       for charid until after we have done the wordnum logic. 
-                       We define an alias just so that the wordnum logic reads
-                       more naturally. */
-
-#define got_wordnum charid
-                    got_wordnum = trie->states[ state ].wordnum;
-
-		    if ( got_wordnum ) {
-			if ( ! ST.accepted ) {
-			    ENTER;
-			    SAVETMPS; /* XXX is this necessary? dmq */
-			    bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
-			    sv_accept_buff=newSV(bufflen *
-					    sizeof(reg_trie_accepted) - 1);
-			    SvCUR_set(sv_accept_buff, 0);
-			    SvPOK_on(sv_accept_buff);
-			    sv_2mortal(sv_accept_buff);
-			    SAVETMPS;
-			    ST.accept_buff =
-				(reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
-			}
-			do {
-			    if (ST.accepted >= bufflen) {
-				bufflen *= 2;
-				ST.accept_buff =(reg_trie_accepted*)
-				    SvGROW(sv_accept_buff,
-				       	bufflen * sizeof(reg_trie_accepted));
+		    U16 wordnum;
+                    wordnum = trie->states[ state ].wordnum;
+
+		    if (wordnum) { /* it's an accept state */
+			if (!accepted) {
+			    accepted = 1;
+			    /* record first match position */
+			    if (ST.longfold) {
+				ST.firstpos = (U8*)locinput;
+				ST.firstchars = 0;
 			    }
-			    SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
-				+ sizeof(reg_trie_accepted));
-
-
-			    ST.accept_buff[ST.accepted].wordnum = got_wordnum;
-			    ST.accept_buff[ST.accepted].endpos = uc;
-			    ++ST.accepted;
-		        } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
+			    else {
+				ST.firstpos = uc;
+				ST.firstchars = charcount;
+			    }
+			}
+			if (!ST.nextword || wordnum < ST.nextword)
+			    ST.nextword = wordnum;
+			ST.topword = wordnum;
 		    }
-#undef got_wordnum 
 
 		    DEBUG_TRIE_EXECUTE_r({
 		                DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
 			        PerlIO_printf( Perl_debug_log,
-			            "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
+			            "%*s  %sState: %4"UVxf" Accepted: %c ",
 			            2+depth * 2, "", PL_colors[4],
-			            (UV)state, (UV)ST.accepted );
+			            (UV)state, (accepted ? 'Y' : 'N'));
 		    });
 
+		    /* read a char and goto next state */
 		    if ( base ) {
 			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
 					     uscan, len, uvc, charid, foldlen,
 					     foldbuf, uniflags);
-
+			charcount++;
+			if (foldlen>0)
+			    ST.longfold = TRUE;
 			if (charid &&
 			     (base + charid > trie->uniquecharcount )
 			     && (base + charid - 1 - trie->uniquecharcount
@@ -3200,77 +3227,38 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		            charid, uvc, (UV)state, PL_colors[5] );
 		    );
 		}
-		if (!ST.accepted )
+		if (!accepted)
 		   sayNO;
 
+		/* calculate total number of accept states */
+		{
+		    U16 w = ST.topword;
+		    accepted = 0;
+		    while (w) {
+			w = trie->wordinfo[w].prev;
+			accepted++;
+		    }
+		    ST.accepted = accepted;
+		}
+
 		DEBUG_EXECUTE_r(
 		    PerlIO_printf( Perl_debug_log,
 			"%*s  %sgot %"IVdf" possible matches%s\n",
 			REPORT_CODE_OFF + depth * 2, "",
 			PL_colors[4], (IV)ST.accepted, PL_colors[5] );
 		);
+		goto trie_first_try; /* jump into the fail handler */
 	    }}
-            goto trie_first_try; /* jump into the fail handler */
 	    /* NOTREACHED */
-	case TRIE_next_fail: /* we failed - try next alterative */
+
+	case TRIE_next_fail: /* we failed - try next alternative */
             if ( ST.jump) {
                 REGCP_UNWIND(ST.cp);
 	        for (n = *PL_reglastparen; n > ST.lastparen; n--)
 		    PL_regoffs[n].end = -1;
 	        *PL_reglastparen = n;
 	    }
-          trie_first_try:
-            if (do_cutgroup) {
-                do_cutgroup = 0;
-                no_final = 0;
-            }
-
-            if ( ST.jump) {
-                ST.lastparen = *PL_reglastparen;
-	        REGCP_SET(ST.cp);
-            }	        
-	    if ( ST.accepted == 1 ) {
-		/* only one choice left - just continue */
-		DEBUG_EXECUTE_r({
-		    AV *const trie_words
-			= MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
-		    SV ** const tmp = av_fetch( trie_words, 
-		        ST.accept_buff[ 0 ].wordnum-1, 0 );
-		    SV *sv= tmp ? sv_newmortal() : NULL;
-		    
-		    PerlIO_printf( Perl_debug_log,
-			"%*s  %sonly one match left: #%d <%s>%s\n",
-			REPORT_CODE_OFF+depth*2, "", PL_colors[4],
-			ST.accept_buff[ 0 ].wordnum,
-			tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 
-	                        PL_colors[0], PL_colors[1],
-	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
-                            ) 
-			: "not compiled under -Dr",
-			PL_colors[5] );
-		});
-		PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
-		/* in this case we free tmps/leave before we call regmatch
-		   as we wont be using accept_buff again. */
-		
-		locinput = PL_reginput;
-		nextchr = UCHARAT(locinput);
-    		if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum]) 
-    		    scan = ST.B;
-    		else
-    		    scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
-		if (!has_cutgroup) {
-		    FREETMPS;
-		    LEAVE;
-                } else {
-                    ST.accepted--;
-                    PUSH_YES_STATE_GOTO(TRIE_next, scan);
-                }
-		
-		continue; /* execute rest of RE */
-	    }
-	    
-	    if ( !ST.accepted-- ) {
+	    if (!--ST.accepted) {
 	        DEBUG_EXECUTE_r({
 		    PerlIO_printf( Perl_debug_log,
 			"%*s  %sTRIE failed...%s\n",
@@ -3278,86 +3266,129 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 			PL_colors[4],
 			PL_colors[5] );
 		});
-		FREETMPS;
-		LEAVE;
 		sayNO_SILENT;
-		/*NOTREACHED*/
-	    } 
+	    }
+	    {
+		/* Find next-highest word to process.  Note that this code
+		 * is O(N^2) per trie run (O(N) per branch), so keep tight */
+		register U32 min = 0;
+		register U32 word;
+		register U16 const nextword = ST.nextword;
+		register reg_trie_wordinfo * const wordinfo
+		    = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
+		for (word=ST.topword; word; word=wordinfo[word].prev) {
+		    if (word > nextword && (!min || word < min))
+			min = word;
+		}
+		ST.nextword = min;
+	    }
 
-	    /*
-	       There are at least two accepting states left.  Presumably
-	       the number of accepting states is going to be low,
-	       typically two. So we simply scan through to find the one
-	       with lowest wordnum.  Once we find it, we swap the last
-	       state into its place and decrement the size. We then try to
-	       match the rest of the pattern at the point where the word
-	       ends. If we succeed, control just continues along the
-	       regex; if we fail we return here to try the next accepting
-	       state
-	     */
+          trie_first_try:
+            if (do_cutgroup) {
+                do_cutgroup = 0;
+                no_final = 0;
+            }
 
-	    {
-		U32 best = 0;
-		U32 cur;
-		for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
-		    DEBUG_TRIE_EXECUTE_r(
-			PerlIO_printf( Perl_debug_log,
-			    "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
-			    REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
-			    (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
-			    ST.accept_buff[ cur ].wordnum, PL_colors[5] );
-		    );
+            if ( ST.jump) {
+                ST.lastparen = *PL_reglastparen;
+	        REGCP_SET(ST.cp);
+            }
 
-		    if (ST.accept_buff[cur].wordnum <
-			    ST.accept_buff[best].wordnum)
-			best = cur;
+	    /* find start char of end of current word */
+	    {
+		U32 chars; /* how many chars to skip */
+		U8 *uc = ST.firstpos;
+		reg_trie_data * const trie
+		    = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
+
+		assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
+			    >=  ST.firstchars);
+		chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
+			    - ST.firstchars;
+
+		if (ST.longfold) {
+		    /* the hard option - fold each char in turn and find
+		     * its folded length (which may be different */
+		    U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
+		    STRLEN foldlen;
+		    STRLEN len;
+		    U8 uvc;
+		    U8 *uscan;
+
+		    while (chars) {
+			if (do_utf8) {
+			    uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
+						    uniflags);
+			    uc += len;
+			}
+			else {
+			    uvc = *uc;
+			    uc++;
+			}
+			uvc = to_uni_fold(uvc, foldbuf, &foldlen);
+			uscan = foldbuf;
+			while (foldlen) {
+			    if (!--chars)
+				break;
+			    uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
+					    uniflags);
+			    uscan += len;
+			    foldlen -= len;
+			}
+		    }
 		}
+		else {
+		    if (do_utf8) 
+			while (chars--)
+			    uc += UTF8SKIP(uc);
+		    else
+			uc += chars;
+		}
+		PL_reginput = (char *)uc;
+	    }
 
-		DEBUG_EXECUTE_r({
-		    AV *const trie_words
-			= MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
-		    SV ** const tmp = av_fetch( trie_words, 
-		        ST.accept_buff[ best ].wordnum - 1, 0 );
-		    regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ? 
-		                    ST.B : 
-		                    ST.me + ST.jump[ST.accept_buff[best].wordnum];    
-		    SV *sv= tmp ? sv_newmortal() : NULL;
-		    
-		    PerlIO_printf( Perl_debug_log, 
-		        "%*s  %strying alternation #%d <%s> at node #%d %s\n",
-			REPORT_CODE_OFF+depth*2, "", PL_colors[4],
-			ST.accept_buff[best].wordnum,
-			tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 
-	                        PL_colors[0], PL_colors[1],
-	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
-                            ) : "not compiled under -Dr", 
-			    REG_NODE_NUM(nextop),
-			PL_colors[5] );
-		});
+	    scan = (ST.jump && ST.jump[ST.nextword]) 
+			? ST.me + ST.jump[ST.nextword]
+			: ST.B;
 
-		if ( best<ST.accepted ) {
-		    reg_trie_accepted tmp = ST.accept_buff[ best ];
-		    ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
-		    ST.accept_buff[ ST.accepted ] = tmp;
-		    best = ST.accepted;
-		}
-		PL_reginput = (char *)ST.accept_buff[ best ].endpos;
-		if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
-		    scan = ST.B;
-		} else {
-		    scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
-                }
-                PUSH_YES_STATE_GOTO(TRIE_next, scan);    
-                /* NOTREACHED */
+	    DEBUG_EXECUTE_r({
+		PerlIO_printf( Perl_debug_log,
+		    "%*s  %sTRIE matched word #%d, continuing%s\n",
+		    REPORT_CODE_OFF+depth*2, "", 
+		    PL_colors[4],
+		    ST.nextword,
+		    PL_colors[5]
+		    );
+	    });
+
+	    if (ST.accepted > 1 || has_cutgroup) {
+		PUSH_STATE_GOTO(TRIE_next, scan);
+		/* NOTREACHED */
 	    }
+	    /* only one choice left - just continue */
+	    DEBUG_EXECUTE_r({
+		AV *const trie_words
+		    = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
+		SV ** const tmp = av_fetch( trie_words,
+		    ST.nextword-1, 0 );
+		SV *sv= tmp ? sv_newmortal() : NULL;
+
+		PerlIO_printf( Perl_debug_log,
+		    "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
+		    REPORT_CODE_OFF+depth*2, "", PL_colors[4],
+		    ST.nextword,
+		    tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
+			    PL_colors[0], PL_colors[1],
+			    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
+			) 
+		    : "not compiled under -Dr",
+		    PL_colors[5] );
+	    });
+
+	    locinput = PL_reginput;
+	    nextchr = UCHARAT(locinput);
+	    continue; /* execute rest of RE */
 	    /* NOTREACHED */
-        case TRIE_next:
-	    /* we dont want to throw this away, see bug 57042*/
-	    if (oreplsv != GvSV(PL_replgv))
-		sv_setsv(oreplsv, GvSV(PL_replgv));
-            FREETMPS;
-	    LEAVE;
-	    sayYES;
 #undef  ST
 
 	case EXACT: {
diff --git a/regexp.h b/regexp.h
index 90e3406..a9dd2e1 100644
--- a/regexp.h
+++ b/regexp.h
@@ -490,13 +490,6 @@ and check for NULL.
 
 #define FBMrf_MULTILINE	1
 
-/* an accepting state/position*/
-struct _reg_trie_accepted {
-    U8   *endpos;
-    U16  wordnum;
-};
-typedef struct _reg_trie_accepted reg_trie_accepted;
-
 /* some basic information about the current match that is created by
  * Perl_regexec_flags and then passed to regtry(), regmatch() etc */
 
@@ -557,11 +550,15 @@ typedef struct regmatch_state {
 	    U32 lastparen;
 	    CHECKPOINT cp;
 
-	    reg_trie_accepted *accept_buff; /* accepting states we have seen */
-	    U32		accepted; /* how many accepting states we have seen */
+	    U32		accepted; /* how many accepting states left */
 	    U16         *jump;  /* positive offsets from me */
 	    regnode	*B;	/* node following the trie */
 	    regnode	*me;	/* Which node am I - needed for jump tries*/
+	    U8		*firstpos;/* pos in string of first trie match */
+	    U32		firstchars;/* len in chars of firstpos from start */
+	    U16		nextword;/* next word to try */
+	    U16		topword; /* longest accepted word */
+	    bool	longfold;/* saw a fold with a 1->n char mapping */
 	} trie;
 
         /* special types - these members are used to store state for special
diff --git a/t/op/svleak.t b/t/op/svleak.t
index 7b1f8f0..07c2efc 100644
--- a/t/op/svleak.t
+++ b/t/op/svleak.t
@@ -70,7 +70,4 @@ sub STORE	{ $_[0]->[$_[1]] = $_[2] }
 
 # [perl #74484]  repeated tries leaked SVs on the tmps stack
 
-{
-    local $TODO = 'not fixed yet';
-    leak_expr(5, 0, q{"YYYYYa" =~ /.+?(a(.+?)|b)/ }, "trie leak");
-}
+leak_expr(5, 0, q{"YYYYYa" =~ /.+?(a(.+?)|b)/ }, "trie leak");