From: Jarkko Hietaniemi Date: Wed, 27 Mar 2002 23:48:15 +0000 (+0000) Subject: Try to handle UTF-8 locales. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ef4a39e599a07c97b0213b41bfa69f4a6f4a17ed;p=p5sagit%2Fp5-mst-13.2.git Try to handle UTF-8 locales. p4raw-id: //depot/perl@15561 --- diff --git a/lib/locale.t b/lib/locale.t index e9ba3fa..ee75c65 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -308,7 +308,7 @@ if ($^O eq 'os390') { $locales =~ s/Thai:th:th:11 tis620\n//; } -sub in_utf8 () { $^H & 0x08 } +sub in_utf8 () { $^H & 0x08 || (${^OPEN} || "") =~ /:utf8/ } if (in_utf8) { require "lib/locale/utf8"; @@ -430,8 +430,6 @@ if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) { setlocale(LC_ALL, "C"); -sub utf8locale { $_[0] =~ /utf-?8/i } - @Locale = sort @Locale; debug "# Locales = @Locale\n"; @@ -520,18 +518,19 @@ foreach $Locale (@Locale) { # Test \w. - if (utf8locale($Locale)) { - # utf8 and locales do not mix. - debug "# skipping UTF-8 locale '$Locale'\n"; - push @utf8locale, $Locale; - @utf8skip{99..102} = (); - } else { - my $word = join('', @Neoalpha); + my $word = join('', @Neoalpha); - $word =~ /^(\w+)$/; - - tryneoalpha($Locale, 99, $1 eq $word); + if ($Locale =~ /utf-?8/i) { + debug "# unknown whether locale and Unicode have the same \\w, skipping test 99 for locale '$Locale'\n"; + push @{$Okay{99}}, $Locale; + } else { + if ($word =~ /^(\w+)$/) { + tryneoalpha($Locale, 99, 1); + } else { + tryneoalpha($Locale, 99, 0); + } } + # Cross-check the whole 8-bit character set. for (map { chr } 0..255) { @@ -712,50 +711,44 @@ foreach $Locale (@Locale) { # case-insensitively the UPPER, and does the UPPER match # case-insensitively the lc of the UPPER. And vice versa. { - if (utf8locale($Locale)) { - # utf8 and locales do not mix. - debug "# skipping UTF-8 locale '$Locale'\n"; - push @utf8locale, $Locale; - $utf8skip{117}++; - } else { - use locale; - no utf8; - my $re = qr/[\[\(\{\*\+\?\|\^\$\\]/; - - my @f = (); - foreach my $x (keys %UPPER) { - my $y = lc $x; - next unless uc $y eq $x; - print "# UPPER $x lc $y ", - $x =~ /$y/i ? 1 : 0, " ", - $y =~ /$x/i ? 1 : 0, "\n" if 0; - # If $x and $y contain regular expression characters - # AND THEY lowercase (/i) to regular expression characters, - # regcomp() will be mightily confused. No, the \Q doesn't - # help here (maybe regex engine internal lowercasing - # is done after the \Q?) An example of this happening is - # the bg_BG (Bulgarian) locale under EBCDIC (OS/390 USS): - # the chr(173) (the "[") is the lowercase of the chr(235). - # Similarly losing EBCDIC locales include cs_cz, cs_CZ, - # el_gr, el_GR, en_us.IBM-037 (!), en_US.IBM-037 (!), - # et_ee, et_EE, hr_hr, hr_HR, hu_hu, hu_HU, lt_LT, - # mk_mk, mk_MK, nl_nl.IBM-037, nl_NL.IBM-037, - # pl_pl, pl_PL, ro_ro, ro_RO, ru_ru, ru_RU, - # sk_sk, sk_SK, sl_si, sl_SI, tr_tr, tr_TR. - if ($x =~ $re || $y =~ $re) { - print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n"; - next; - } - # With utf8 both will fail since the locale concept - # of upper/lower does not work well in Unicode. - push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + use locale; + no utf8; + my $re = qr/[\[\(\{\*\+\?\|\^\$\\]/; + + my @f = (); + foreach my $x (keys %UPPER) { + my $y = lc $x; + next unless uc $y eq $x; + print "# UPPER $x lc $y ", + $x =~ /$y/i ? 1 : 0, " ", + $y =~ /$x/i ? 1 : 0, "\n" if 0; + # If $x and $y contain regular expression characters + # AND THEY lowercase (/i) to regular expression characters, + # regcomp() will be mightily confused. No, the \Q doesn't + # help here (maybe regex engine internal lowercasing + # is done after the \Q?) An example of this happening is + # the bg_BG (Bulgarian) locale under EBCDIC (OS/390 USS): + # the chr(173) (the "[") is the lowercase of the chr(235). + # Similarly losing EBCDIC locales include cs_cz, cs_CZ, + # el_gr, el_GR, en_us.IBM-037 (!), en_US.IBM-037 (!), + # et_ee, et_EE, hr_hr, hr_HR, hu_hu, hu_HU, lt_LT, + # mk_mk, mk_MK, nl_nl.IBM-037, nl_NL.IBM-037, + # pl_pl, pl_PL, ro_ro, ro_RO, ru_ru, ru_RU, + # sk_sk, sk_SK, sl_si, sl_SI, tr_tr, tr_TR. + if ($x =~ $re || $y =~ $re) { + print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n"; + next; } + # With utf8 both will fail since the locale concept + # of upper/lower does not work well in Unicode. + push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + foreach my $x (keys %lower) { my $y = uc $x; next unless lc $y eq $x; print "# lower $x uc $y ", - $x =~ /$y/i ? 1 : 0, " ", - $y =~ /$x/i ? 1 : 0, "\n" if 0; + $x =~ /$y/i ? 1 : 0, " ", + $y =~ /$x/i ? 1 : 0, "\n" if 0; if ($x =~ $re || $y =~ $re) { # See above. print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n"; next;