[p5sagit/p5-mst-13.2.git] / t / lib / locale.t

#!./perl -wT

print "1..67\n";

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
}

use strict;
use POSIX qw(locale_h);

use vars qw($a
	    $English $German $French $Spanish
	    @C @English @German @French @Spanish
	    $Locale @Locale %iLocale %UPPER %lower @Neoalpha);

$a = 'abc %';

sub ok {
    my ($n, $result) = @_;

    print 'not ' unless ($result);
    print "ok $n\n";
}

# First we'll do a lot of taint checking for locales.
# This is the easiest to test, actually, as any locale,
# even the default locale will taint under 'use locale'.

sub is_tainted { # hello, camel two.
    my $dummy;
    not eval { $dummy = join("", @_), kill 0; 1 }
}

sub check_taint ($$) {
    ok $_[0], is_tainted($_[1]);
}

sub check_taint_not ($$) {
    ok $_[0], not is_tainted($_[1]);
}

use locale;	# engage locale and therefore locale taint.

check_taint_not   1, $a;

check_taint       2, uc($a);
check_taint       3, "\U$a";
check_taint       4, ucfirst($a);
check_taint       5, "\u$a";
check_taint       6, lc($a);
check_taint       7, "\L$a";
check_taint       8, lcfirst($a);
check_taint       9, "\l$a";

check_taint      10, sprintf('%e', 123.456);
check_taint      11, sprintf('%f', 123.456);
check_taint      12, sprintf('%g', 123.456);
check_taint_not  13, sprintf('%d', 123.456);
check_taint_not  14, sprintf('%x', 123.456);

$_ = $a;	# untaint $_

$_ = uc($a);	# taint $_

check_taint      15, $_;

/(\w)/;	# taint $&, $`, $', $+, $1.
check_taint      16, $&;
check_taint      17, $`;
check_taint      18, $';
check_taint      19, $+;
check_taint      20, $1;
check_taint_not  21, $2;

/(\W)/;	# taint $&, $`, $', $+, $1.
check_taint      22, $&;
check_taint      23, $`;
check_taint      24, $';
check_taint      25, $+;
check_taint      26, $1;
check_taint_not  27, $2;

/(\s)/;	# taint $&, $`, $', $+, $1.
check_taint      28, $&;
check_taint      29, $`;
check_taint      30, $';
check_taint      31, $+;
check_taint      32, $1;
check_taint_not  33, $2;

/(\S)/;	# taint $&, $`, $', $+, $1.
check_taint      34, $&;
check_taint      35, $`;
check_taint      36, $';
check_taint      37, $+;
check_taint      38, $1;
check_taint_not  39, $2;

$_ = $a;	# untaint $_

check_taint_not  40, $_;

/(b)/;		# this must not taint
check_taint_not  41, $&;
check_taint_not  42, $`;
check_taint_not  43, $';
check_taint_not  44, $+;
check_taint_not  45, $1;
check_taint_not  46, $2;

$_ = $a;	# untaint $_

check_taint_not  47, $_;

$b = uc($a);	# taint $b
s/(.+)/$b/;	# this must taint only the $_

check_taint      48, $_;
check_taint_not  49, $&;
check_taint_not  50, $`;
check_taint_not  51, $';
check_taint_not  52, $+;
check_taint_not  53, $1;
check_taint_not  54, $2;

$_ = $a;	# untaint $_

s/(.+)/b/;	# this must not taint
check_taint_not  55, $_;
check_taint_not  56, $&;
check_taint_not  57, $`;
check_taint_not  58, $';
check_taint_not  59, $+;
check_taint_not  60, $1;
check_taint_not  61, $2;

check_taint_not  62, $a;

# I think we've seen quite enough of taint.
# Let us do some *real* locale work now.

sub getalnum {
    sort grep /\w/, map { chr } 0..255
}

sub locatelocale ($$@) {
    my ($lcall, $alnum, @try) = @_;

    undef $$lcall;

    for (@try) {
	local $^W = 0; # suppress "Subroutine LC_ALL redefined"
	if (setlocale(LC_ALL, $_)) {
	    $$lcall = $_;
	    @$alnum = &getalnum;
	    last;
	}
    }

    @$alnum = () unless (defined $$lcall);
}

# Find some default locale

locatelocale(\$Locale, \@Locale, qw(C POSIX));

# Find some English locale

locatelocale(\$English, \@English,
	     qw(en_US.ISO8859-1 en_GB.ISO8859-1
		en en_US en_UK en_IE en_CA en_AU en_NZ
		english english.iso88591
		american american.iso88591
		british british.iso88591
		));

# Find some German locale

locatelocale(\$German, \@German,
	     qw(de_DE.ISO8859-1 de_AT.ISO8859-1 de_CH.ISO8859-1
		de de_DE de_AT de_CH
		german german.iso88591));

# Find some French locale

locatelocale(\$French, \@French,
	     qw(fr_FR.ISO8859-1 fr_BE.ISO8859-1 fr_CA.ISO8859-1 fr_CH.ISO8859-1
		fr fr_FR fr_BE fr_CA fr_CH
		french french.iso88591));

# Find some Spanish locale

locatelocale(\$Spanish, \@Spanish,
	     qw(es_AR.ISO8859-1 es_BO.ISO8859-1 es_CL.ISO8859-1
		es_CO.ISO8859-1 es_CR.ISO8859-1 es_EC.ISO8859-1
		es_ES.ISO8859-1 es_GT.ISO8859-1 es_MX.ISO8859-1
		es_NI.ISO8859-1 es_PA.ISO8859-1 es_PE.ISO8859-1
		es_PY.ISO8859-1 es_SV.ISO8859-1 es_UY.ISO8859-1 es_VE.ISO8859-1
		es es_AR es_BO es_CL
		es_CO es_CR es_EC
		es_ES es_GT es_MX
		es_NI es_PA es_PE
		es_PY es_SV es_UY es_VE
		spanish spanish.iso88591));

# Select the largest of the alpha(num)bets.

($Locale, @Locale) = ($English, @English)
    if (length(@English) > length(@Locale));
($Locale, @Locale) = ($German, @German)
    if (length(@German)  > length(@Locale));
($Locale, @Locale) = ($French, @French)
    if (length(@French)  > length(@Locale));
($Locale, @Locale) = ($Spanish, @Spanish)
    if (length(@Spanish) > length(@Locale));

print "# Locale = $Locale\n";
print "# Alnum_ = @Locale\n";

{
    local $^W = 0;
    setlocale(LC_ALL, $Locale);
}

{
    my $i = 0;

    for (@Locale) {
	$iLocale{$_} = $i++;
    }
}

# Sieve the uppercase and the lowercase.

for (@Locale) {
    if (/[^\d_]/) { # skip digits and the _
	if (lc eq $_) {
	    $UPPER{$_} = uc;
	} else {
	    $lower{$_} = lc;
	}
    }
}

# Cross-check the upper and the lower.
# Yes, this is broken when the upper<->lower changes the number of
# the glyphs (e.g. the German sharp-s aka double-s aka sz-ligature.
# But so far all the implementations do this wrong so we can do it wrong too.

for (keys %UPPER) {
    if (defined $lower{$UPPER{$_}}) {
	if ($_ ne $lower{$UPPER{$_}}) {
	    print 'not ';
	    last;
	}
    }
}
print "ok 63\n";

for (keys %lower) {
    if (defined $UPPER{$lower{$_}}) {
	if ($_ ne $UPPER{$lower{$_}}) {
	    print 'not ';
	    last;
	}
    }
}
print "ok 64\n";

# Find the alphabets that are not alphabets in the default locale.

{
    no locale;
    
    for (keys %UPPER, keys %lower) {
	push(@Neoalpha, $_) if (/\W/);
    }
}

@Neoalpha = sort @Neoalpha;

# Test \w.

{
    my $word = join('', @Neoalpha);

    $word =~ /^(\w*)$/;

    print 'not ' if ($1 ne $word);
}
print "ok 65\n";

# Find places where the collation order differs from the default locale.

{
    no locale;

    my @k = sort (keys %UPPER, keys %lower); 
    my ($i, $j, @d);

    for ($i = 0; $i < @k; $i++) {
	for ($j = $i + 1; $j < @k; $j++) {
	    if ($iLocale{$k[$j]} < $iLocale{$k[$i]}) {
		push(@d, [$k[$j], $k[$i]]);
	    }
	}
    }

    # Cross-check those places.

    for (@d) {
	($i, $j) = @$_;
	print 'not ' if ($i le $j or not (($i cmp $j) == 1));
    }
}
print "ok 66\n";

# Cross-check whole character set.

for (map { chr } 0..255) {
    if (/\w/ and /\W/) { print 'not '; last }
    if (/\d/ and /\D/) { print 'not '; last }
    if (/\s/ and /\S/) { print 'not '; last }
    if (/\w/ and /\D/ and not /_/ and
	not (exists $UPPER{$_} or exists $lower{$_})) {
	print 'not '; last
    }
}
print "ok 67\n";
Commit	Line	Data
92d69e20	1	#!./perl -wT
	2
	3	print "1..67\n";
	4
	5	BEGIN {
	6	chdir 't' if -d 't';
	7	@INC = '../lib';
	8	}
	9
	10	use strict;
	11	use POSIX qw(locale_h);
	12
	13	use vars qw($a
	14	$English $German $French $Spanish
	15	@C @English @German @French @Spanish
	16	$Locale @Locale %iLocale %UPPER %lower @Neoalpha);
	17
	18	$a = 'abc %';
	19
	20	sub ok {
	21	my ($n, $result) = @_;
	22
	23	print 'not ' unless ($result);
	24	print "ok $n\n";
	25	}
	26
	27	# First we'll do a lot of taint checking for locales.
	28	# This is the easiest to test, actually, as any locale,
	29	# even the default locale will taint under 'use locale'.
	30
	31	sub is_tainted { # hello, camel two.
	32	my $dummy;
	33	not eval { $dummy = join("", @_), kill 0; 1 }
	34	}
	35
	36	sub check_taint ($$) {
	37	ok $_[0], is_tainted($_[1]);
	38	}
	39
	40	sub check_taint_not ($$) {
	41	ok $_[0], not is_tainted($_[1]);
	42	}
	43
	44	use locale; # engage locale and therefore locale taint.
	45
	46	check_taint_not 1, $a;
	47
	48	check_taint 2, uc($a);
	49	check_taint 3, "\U$a";
	50	check_taint 4, ucfirst($a);
	51	check_taint 5, "\u$a";
	52	check_taint 6, lc($a);
	53	check_taint 7, "\L$a";
	54	check_taint 8, lcfirst($a);
	55	check_taint 9, "\l$a";
	56
	57	check_taint 10, sprintf('%e', 123.456);
	58	check_taint 11, sprintf('%f', 123.456);
	59	check_taint 12, sprintf('%g', 123.456);
	60	check_taint_not 13, sprintf('%d', 123.456);
	61	check_taint_not 14, sprintf('%x', 123.456);
	62
	63	$_ = $a; # untaint $_
	64
65	$_ = uc($a); # taint $_
66
67	check_taint 15, $_;
68
69	/(\w)/; # taint $&, $`, $', $+, $1.
70	check_taint 16, $&;
71	check_taint 17, $`;
72	check_taint 18, $';
73	check_taint 19, $+;
74	check_taint 20, $1;
75	check_taint_not 21, $2;
76
77	/(\W)/; # taint $&, $`, $', $+, $1.
78	check_taint 22, $&;
79	check_taint 23, $`;
80	check_taint 24, $';
81	check_taint 25, $+;
82	check_taint 26, $1;
83	check_taint_not 27, $2;
84
85	/(\s)/; # taint $&, $`, $', $+, $1.
86	check_taint 28, $&;
87	check_taint 29, $`;
88	check_taint 30, $';
89	check_taint 31, $+;
90	check_taint 32, $1;
91	check_taint_not 33, $2;
92
93	/(\S)/; # taint $&, $`, $', $+, $1.
94	check_taint 34, $&;
95	check_taint 35, $`;
96	check_taint 36, $';
97	check_taint 37, $+;
98	check_taint 38, $1;
99	check_taint_not 39, $2;
100
101	$_ = $a; # untaint $_
102
103	check_taint_not 40, $_;
104
105	/(b)/; # this must not taint
106	check_taint_not 41, $&;
107	check_taint_not 42, $`;
108	check_taint_not 43, $';
109	check_taint_not 44, $+;
110	check_taint_not 45, $1;
111	check_taint_not 46, $2;
112
113	$_ = $a; # untaint $_
114
115	check_taint_not 47, $_;
116
117	$b = uc($a); # taint $b
118	s/(.+)/$b/; # this must taint only the $_
119
120	check_taint 48, $_;
121	check_taint_not 49, $&;
122	check_taint_not 50, $`;
123	check_taint_not 51, $';
124	check_taint_not 52, $+;
125	check_taint_not 53, $1;
126	check_taint_not 54, $2;
127
128	$_ = $a; # untaint $_
129
130	s/(.+)/b/; # this must not taint
131	check_taint_not 55, $_;
132	check_taint_not 56, $&;
133	check_taint_not 57, $`;
134	check_taint_not 58, $';
135	check_taint_not 59, $+;
136	check_taint_not 60, $1;
137	check_taint_not 61, $2;
138
139	check_taint_not 62, $a;
140
141	# I think we've seen quite enough of taint.
142	# Let us do some real locale work now.
143
144	sub getalnum {
145	sort grep /\w/, map { chr } 0..255
146	}
147
148	sub locatelocale ($$@) {
149	my ($lcall, $alnum, @try) = @_;
150
151	undef $$lcall;
152
153	for (@try) {
154	local $^W = 0; # suppress "Subroutine LC_ALL redefined"
155	if (setlocale(LC_ALL, $_)) {
156	$$lcall = $_;
157	@$alnum = &getalnum;
158	last;
159	}
160	}
161
162	@$alnum = () unless (defined $$lcall);
163	}
164
165	# Find some default locale
166
167	locatelocale(\$Locale, \@Locale, qw(C POSIX));
168
169	# Find some English locale
170
171	locatelocale(\$English, \@English,
172	qw(en_US.ISO8859-1 en_GB.ISO8859-1
173	en en_US en_UK en_IE en_CA en_AU en_NZ
174	english english.iso88591
175	american american.iso88591
176	british british.iso88591
177	));
178
179	# Find some German locale
180
181	locatelocale(\$German, \@German,
182	qw(de_DE.ISO8859-1 de_AT.ISO8859-1 de_CH.ISO8859-1
183	de de_DE de_AT de_CH
184	german german.iso88591));
185
186	# Find some French locale
187
188	locatelocale(\$French, \@French,
189	qw(fr_FR.ISO8859-1 fr_BE.ISO8859-1 fr_CA.ISO8859-1 fr_CH.ISO8859-1
190	fr fr_FR fr_BE fr_CA fr_CH
191	french french.iso88591));
192
193	# Find some Spanish locale
194
195	locatelocale(\$Spanish, \@Spanish,
196	qw(es_AR.ISO8859-1 es_BO.ISO8859-1 es_CL.ISO8859-1
197	es_CO.ISO8859-1 es_CR.ISO8859-1 es_EC.ISO8859-1
198	es_ES.ISO8859-1 es_GT.ISO8859-1 es_MX.ISO8859-1
199	es_NI.ISO8859-1 es_PA.ISO8859-1 es_PE.ISO8859-1
200	es_PY.ISO8859-1 es_SV.ISO8859-1 es_UY.ISO8859-1 es_VE.ISO8859-1
201	es es_AR es_BO es_CL
202	es_CO es_CR es_EC
203	es_ES es_GT es_MX
204	es_NI es_PA es_PE
205	es_PY es_SV es_UY es_VE
206	spanish spanish.iso88591));
207
208	# Select the largest of the alpha(num)bets.
209
210	($Locale, @Locale) = ($English, @English)
211	if (length(@English) > length(@Locale));
212	($Locale, @Locale) = ($German, @German)
213	if (length(@German) > length(@Locale));
214	($Locale, @Locale) = ($French, @French)
215	if (length(@French) > length(@Locale));
216	($Locale, @Locale) = ($Spanish, @Spanish)
217	if (length(@Spanish) > length(@Locale));
218
219	print "# Locale = $Locale\n";
220	print "# Alnum_ = @Locale\n";
221
222	{
223	local $^W = 0;
224	setlocale(LC_ALL, $Locale);
225	}
226
227	{
228	my $i = 0;
229
230	for (@Locale) {
231	$iLocale{$_} = $i++;
232	}
233	}
234
235	# Sieve the uppercase and the lowercase.
236
237	for (@Locale) {
238	if (/[^\d_]/) { # skip digits and the _
239	if (lc eq $_) {
240	$UPPER{$_} = uc;
241	} else {
242	$lower{$_} = lc;
243	}
244	}
245	}
246
247	# Cross-check the upper and the lower.
248	# Yes, this is broken when the upper<->lower changes the number of
249	# the glyphs (e.g. the German sharp-s aka double-s aka sz-ligature.
250	# But so far all the implementations do this wrong so we can do it wrong too.
251
252	for (keys %UPPER) {
253	if (defined $lower{$UPPER{$_}}) {
254	if ($_ ne $lower{$UPPER{$_}}) {
255	print 'not ';
256	last;
257	}
258	}
259	}
260	print "ok 63\n";
261
262	for (keys %lower) {
263	if (defined $UPPER{$lower{$_}}) {
264	if ($_ ne $UPPER{$lower{$_}}) {
265	print 'not ';
266	last;
267	}
268	}
269	}
270	print "ok 64\n";
271
272	# Find the alphabets that are not alphabets in the default locale.
273
274	{
275	no locale;
276
277	for (keys %UPPER, keys %lower) {
278	push(@Neoalpha, $_) if (/\W/);
279	}
280	}
281
282	@Neoalpha = sort @Neoalpha;
283
284	# Test \w.
285
286	{
287	my $word = join('', @Neoalpha);
288
289	$word =~ /^(\w*)$/;
290
291	print 'not ' if ($1 ne $word);
292	}
293	print "ok 65\n";
294
295	# Find places where the collation order differs from the default locale.
296
297	{
298	no locale;
299
300	my @k = sort (keys %UPPER, keys %lower);
301	my ($i, $j, @d);
302
303	for ($i = 0; $i < @k; $i++) {
304	for ($j = $i + 1; $j < @k; $j++) {
305	if ($iLocale{$k[$j]} < $iLocale{$k[$i]}) {
306	push(@d, [$k[$j], $k[$i]]);
307	}
308	}
309	}
310
311	# Cross-check those places.
312
313	for (@d) {
314	($i, $j) = @$_;
315	print 'not ' if ($i le $j or not (($i cmp $j) == 1));
316	}
317	}
318	print "ok 66\n";
319
320	# Cross-check whole character set.
321
322	for (map { chr } 0..255) {
323	if (/\w/ and /\W/) { print 'not '; last }
324	if (/\d/ and /\D/) { print 'not '; last }
325	if (/\s/ and /\S/) { print 'not '; last }
326	if (/\w/ and /\D/ and not /_/ and
327	not (exists $UPPER{$_} or exists $lower{$_})) {
328	print 'not '; last
329	}
330	}
331	print "ok 67\n";