From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Wed, 12 Aug 1998 15:42:35 +0000 (+0300)
Subject: apply minimal variant of patch (sent via private mail)
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8ada0baa1f731edbe470a7630cfeb30c131b4672;p=p5sagit%2Fp5-mst-13.2.git

apply minimal variant of patch (sent via private mail)
	Message-Id: <199808121242.PAA29761@comanche.spices>
	Subject: [PATCH] 5.004_02 or 5.005_51: fix regexp and tr character ranges in non-ASCII lands

p4raw-id: //depot/perl@1803
---

diff --git a/MANIFEST b/MANIFEST
index 91147fd..966cc80 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1116,6 +1116,7 @@ t/op/tie.t		See if tie/untie functions work
 t/op/tiearray.t		See if tie for arrays works
 t/op/tiehandle.t	See if tie for handles works
 t/op/time.t		See if time functions work
+t/op/tr.t		See if tr works
 t/op/undef.t		See if undef works
 t/op/universal.t	See if UNIVERSAL class works
 t/op/unshift.t		See if unshift works
diff --git a/perl.h b/perl.h
index 0f7fe6d..c406f38 100644
--- a/perl.h
+++ b/perl.h
@@ -209,6 +209,12 @@ register struct op *op asm(stringify(OP_IN_REGISTER));
 #   define LIBERAL 1
 #endif
 
+#if 'A' == 65 && 'I' == 73 && 'J' == 74 && 'Z' == 90
+#define ASCIIish
+#else
+#undef  ASCIIish
+#endif
+
 /*
  * The following contortions are brought to you on behalf of all the
  * standards, semi-standards, de facto standards, not-so-de-facto standards
diff --git a/pod/perllocale.pod b/pod/perllocale.pod
index 4401be2..0a85c0e 100644
--- a/pod/perllocale.pod
+++ b/pod/perllocale.pod
@@ -710,7 +710,7 @@ case-mapping with C<\l>, C<\L>,C<\u> or C<\U>.
 
 =item B<In-memory formatting function> (sprintf()):
 
-Result is tainted if "use locale" is in effect.
+Result is tainted if C<use locale> is in effect.
 
 =item B<Output formatting functions> (printf() and write()):
 
diff --git a/pod/perlop.pod b/pod/perlop.pod
index 35f9e5f..8e50ec3 100644
--- a/pod/perlop.pod
+++ b/pod/perlop.pod
@@ -1149,6 +1149,7 @@ the number of characters replaced or deleted.  If no string is
 specified via the =~ or !~ operator, the $_ string is transliterated.  (The
 string specified with =~ must be a scalar variable, an array element, a
 hash element, or an assignment to one of those, i.e., an lvalue.)
+
 A character range may be specified with a hyphen, so C<tr/A-J/0-9/> 
 does the same replacement as C<tr/ACEGIBDFHJ/0246813579/>.
 For B<sed> devotees, C<y> is provided as a synonym for C<tr>.  If the
@@ -1156,6 +1157,13 @@ SEARCHLIST is delimited by bracketing quotes, the REPLACEMENTLIST has
 its own pair of quotes, which may or may not be bracketing quotes,
 e.g., C<tr[A-Z][a-z]> or C<tr(+\-*/)/ABCD/>.
 
+Note also that the whole range idea is rather unportable between
+character sets--and even within character sets they may cause results
+you probably didn't expect.  A sound principle is to use only ranges
+that begin from and end at either alphabets of equal case (a-e, A-E),
+or digits (0-4).  Anything else is unsafe.  If in doubt, spell out the
+character sets in full.
+
 Options:
 
     c	Complement the SEARCHLIST.
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 1b49ba4..f696525 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -735,6 +735,13 @@ following all specify the same class of three characters: C<[-az]>,
 C<[az-]>, and C<[a\-z]>.  All are different from C<[a-z]>, which
 specifies a class containing twenty-six characters.)
 
+Note also that the whole range idea is rather unportable between
+character sets--and even within character sets they may cause results
+you probably didn't expect.  A sound principle is to use only ranges
+that begin from and end at either alphabets of equal case ([a-e],
+[A-E]), or digits ([0-9]).  Anything else is unsafe.  If in doubt,
+spell out the character sets in full.
+
 Characters may be specified using a metacharacter syntax much like that
 used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return,
 "\f" a form feed, etc.  More generally, \I<nnn>, where I<nnn> is a string
diff --git a/regcomp.c b/regcomp.c
index ef651fa..8db8b8a 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2245,9 +2245,24 @@ regclass(void)
 	    }
 	}
 	if (!SIZE_ONLY) {
-	    for ( ; lastvalue <= value; lastvalue++)
-		ANYOF_SET(opnd, lastvalue);
-	}
+#ifndef ASCIIish
+	    if ((isLOWER(lastvalue) && isLOWER(value)) ||
+		(isUPPER(lastvalue) && isUPPER(value))) {
+ 		if (isLOWER(lastvalue)) {
+ 		    for (i = lastvalue; i <= value; i++)
+			if (isLOWER(i))
+			    ANYOF_SET(opnd, i);
+ 		} else {
+ 		    for (i = lastvalue; i <= value; i++)
+			if (isUPPER(i))
+			    ANYOF_SET(opnd, i);
+		}
+	    }
+	    else
+#endif
+		for ( ; lastvalue <= value; lastvalue++)
+		    ANYOF_SET(opnd, lastvalue);
+        }
 	lastvalue = value;
     }
     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
diff --git a/t/pragma/locale.t b/t/pragma/locale.t
index 00baa66..7e3df8c 100755
--- a/t/pragma/locale.t
+++ b/t/pragma/locale.t
@@ -23,6 +23,9 @@ eval {
 # and mingw32 uses said silly CRT
 $have_setlocale = 0 if $^O eq 'MSWin32' && $Config{cc} =~ /^(cl|gcc)/i;
 
+# 103 (the last test) may fail but that is okay.
+# (It indicates something broken in the environment, not Perl)
+# Therefore .. only until 102, not 103.
 print "1..", ($have_setlocale ? 102 : 98), "\n";
 
 use vars qw($a
@@ -404,6 +407,7 @@ print "ok 101\n";
 
 # Test for read-onlys.
 
+print "# testing 102\n";
 {
     no locale;
     $a = "qwerty";
@@ -419,7 +423,7 @@ print "ok 102\n";
 # Thanks to Hallvard Furuseth <h.b.furuseth@usit.uio.no>
 # for inventing a way to test for ordering consistency
 # without requiring any particular order.
-# ++$jhi;#@iki.fi
+# <jhi@iki.fi>
 
 print "# testing 103\n";
 {
diff --git a/toke.c b/toke.c
index 2381be3..90fc7b8 100644
--- a/toke.c
+++ b/toke.c
@@ -904,6 +904,7 @@ scan_const(char *start)
 	    /* expand a range A-Z to the full set of characters.  AIE! */
 	    if (dorange) {
 		I32 i;				/* current expanded character */
+		I32 min;			/* first character in range */
 		I32 max;			/* last character in range */
 
 		i = d - SvPVX(sv);		/* remember current offset */
@@ -911,10 +912,26 @@ scan_const(char *start)
 		d = SvPVX(sv) + i;		/* restore d after the grow potentially has changed the ptr */
 		d -= 2;				/* eat the first char and the - */
 
-		max = (U8)d[1];			/* last char in range */
-
-		for (i = (U8)*d; i <= max; i++)
-		    *d++ = i;
+		min = (U8)*d;			/* first char in range */
+		max = (U8)d[1];			/* last char in range  */
+
+#ifndef ASCIIish
+		if ((isLOWER(min) && isLOWER(max)) ||
+		    (isUPPER(min) && isUPPER(max))) {
+		    if (isLOWER(min)) {
+			for (i = min; i <= max; i++)
+			    if (isLOWER(i))
+				*d++ = i;
+		    } else {
+			for (i = min; i <= max; i++)
+			    if (isUPPER(i))
+				*d++ = i;
+		    }
+		}
+		else
+#endif
+		    for (i = min; i <= max; i++)
+			*d++ = i;
 
 		/* mark the range as done, and continue */
 		dorange = FALSE;