##
## Get the list of codepoints for the type.
- ## Called from utf8.c
+ ## Called from swash_init (see utf8.c) or SWASHNEW itself.
+ ##
+ ## Callers of swash_init:
+ ## op.c:pmtrans -- for tr/// and y///
+ ## regexec.c:regclass_swash -- for /[]/, \p, and \P
+ ## utf8.c:is_utf8_common -- for common Unicode properties
+ ## utf8.c:to_utf8_case -- for lc, uc, ucfirst, etc. and //i
##
## Given a $type, our goal is to fill $list with the set of codepoint
- ## ranges.
+ ## ranges. If $type is false, $list passed is used.
+ ##
+ ## $minbits:
+ ## For binary properties, $minbits must be 1.
+ ## For character mappings (case and transliteration), $minbits must
+ ## be a number except 1.
+ ##
+ ## $list (or that filled according to $type):
+ ## Refer to perlunicode.pod, "User-Defined Character Properties."
+ ##
+ ## For binary properties, only characters with the property value
+ ## of True should be listed. The 3rd column, if any, will be ignored.
##
## To make the parsing of $type clear, this code takes the a rather
## unorthodox approach of last'ing out of the block once we have the
$type =~ s/^\s+//;
$type =~ s/\s+$//;
- print "type = $type\n" if DEBUG;
+ print STDERR "type = $type\n" if DEBUG;
GETFILE:
{
##
my $canonical = lc $type;
$canonical =~ s/(?<=[a-z\d])(?:\s+|[-_])(?=[a-z\d])//g;
- print "canonical = $canonical\n" if DEBUG;
+ print STDERR "canonical = $canonical\n" if DEBUG;
require "unicore/Canonical.pl";
if (my $base = ($utf8::Canonical{$canonical} || $utf8::Canonical{ lc $utf8::PropertyAlias{$canonical} })) {
## The user-level way to access ToDigit() and ToFold()
## is to use Unicode::UCD.
##
- if ($type =~ /^To(Digit|Fold|Lower|Title|Upper)$/)
- {
+ if ($type =~ /^To(Digit|Fold|Lower|Title|Upper)$/) {
$file = "unicore/To/$1.pl";
## would like to test to see if $file actually exists....
last GETFILE;
}
if (defined $file) {
- print "found it (file='$file')\n" if DEBUG;
+ print STDERR "found it (file='$file')\n" if DEBUG;
##
## If we reach here, it was due to a 'last GETFILE' above
## If we have, return the cached results. The cache key is the
## file to load.
##
- if ($Cache{$file} and ref($Cache{$file}) eq $class)
- {
- print "Returning cached '$file' for \\p{$type}\n" if DEBUG;
+ if ($Cache{$file} and ref($Cache{$file}) eq $class) {
+ print STDERR "Returning cached '$file' for \\p{$type}\n" if DEBUG;
return $Cache{$class, $file};
}
}
my $extras;
- my $bits = 0;
+ my $bits = $minbits;
my $ORIG = $list;
if ($list) {
$list =~ s/\tXXXX$/\t$hextra/mg;
}
- if ($minbits < 32) {
+ if ($minbits != 1 && $minbits < 32) { # not binary property
my $top = 0;
while ($list =~ /^([0-9a-fA-F]+)(?:[\t]([0-9a-fA-F]+)?)(?:[ \t]([0-9a-fA-F]+))?/mg) {
my $min = hex $1;
$val += $max - $min if defined $3;
$top = $val if $val > $top;
}
- $bits =
+ my $topbits =
$top > 0xffff ? 32 :
- $top > 0xff ? 16 :
- $top > 1 ? 8 : 1
+ $top > 0xff ? 16 : 8;
+ $bits = $topbits if $bits < $topbits;
}
- $bits = $minbits if $bits < $minbits;
my @extras;
for my $x ($extras) {
my ($c,$t) = split(/::/, $name, 2); # bogus use of ::, really
my $subobj;
if ($c eq 'utf8') {
- $subobj = utf8->SWASHNEW($t, "", 0, 0, 0);
+ $subobj = utf8->SWASHNEW($t, "", $minbits, 0);
}
elsif (exists &$name) {
- $subobj = utf8->SWASHNEW($name, "", 0, 0, 0);
+ $subobj = utf8->SWASHNEW($name, "", $minbits, 0);
}
elsif ($c =~ /^([0-9a-fA-F]+)/) {
- $subobj = utf8->SWASHNEW("", $c, 0, 0, 0);
+ $subobj = utf8->SWASHNEW("", $c, $minbits, 0);
}
return $subobj unless ref $subobj;
push @extras, $name => $subobj;
SV **const ary = AvARRAY(av);
SV **a, **b;
- /* See the end of regcomp.c:S_reglass() for
+ /* See the end of regcomp.c:S_regclass() for
* documentation of these array elements. */
si = *ary;
$| = 1;
-print "1..1191\n";
+print "1..1195\n";
BEGIN {
chdir 't' if -d 't';
END
}
-print "\x{0712}" =~ /\p{IsSyriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
-print "\x{072F}" =~ /\P{IsSyriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+ok("\x{0712}" =~ /\p{IsSyriac1}/, '\x{0712}, \p{IsSyriac1}');
+ok("\x{072F}" =~ /\P{IsSyriac1}/, '\x{072F}, \P{IsSyriac1}');
sub Syriac1 {
return <<'END';
END
}
-print "\x{0712}" =~ /\p{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
-print "\x{072F}" =~ /\P{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+ok("\x{0712}" =~ /\p{Syriac1}/, '\x{0712}, \p{Syriac1}');
+ok("\x{072F}" =~ /\P{Syriac1}/, '\x{072F}, \p{Syriac1}');
print "# user-defined character properties may lack \\n at the end\n";
sub InGreekSmall { return "03B1\t03C9" }
ok("\x{03A0}" =~ /\p{InGreekCapital}/, "Capital PI");
ok("\x{03A2}" =~ /\P{InGreekCapital}/, "Reserved");
+sub AsciiHexAndDash {
+ return <<'END';
++utf8::ASCII_Hex_Digit
++utf8::Dash
+END
+}
+
+ok("-" =~ /\p{Dash}/, "'-' is Dash");
+ok("A" =~ /\p{ASCII_Hex_Digit}/, "'A' is ASCII_Hex_Digit");
+ok("-" =~ /\p{AsciiHexAndDash}/, "'-' is AsciiHexAndDash");
+ok("A" =~ /\p{AsciiHexAndDash}/, "'A' is AsciiHexAndDash");
+
{
print "# Change #18179\n";
# previously failed with "panic: end_shift
"# TODO assigning to original string should not corrupt match vars");
}
-# last test 1191
+# last test 1195
if (!is_utf8_char(p))
return FALSE;
if (!*swash)
- *swash = swash_init("utf8", swashname, &PL_sv_undef, 0, 0);
+ *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
return swash_fetch(*swash, p, TRUE) != 0;
}
&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
}
-/* a "swash" is a swatch hash */
-
+/* Note:
+ * A "swash" is a swatch hash.
+ * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
+ * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
+ * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
+ */
SV*
Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
{
S_swash_get(pTHX_ SV* swash, UV start, UV span)
{
SV *swatch;
- U8 *l, *lend, *x, *xend, *s, *nl;
+ U8 *l, *lend, *x, *xend, *s;
STRLEN lcur, xcur, scur;
HV* const hv = (HV*)SvRV(swash);
STRLEN numlen;
I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
- nl = (U8*)memchr(l, '\n', lend - l);
+ U8* nl = (U8*)memchr(l, '\n', lend - l);
numlen = lend - l;
min = grok_hex((char *)l, &numlen, &flags, NULL);
}
}
}
+ else
+ val = 0; /* bits == 1, then val should be ignored */
}
else {
max = min;
Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
}
}
+ else
+ val = 0; /* bits == 1, then val should be ignored */
}
if (nl)
++val;
}
}
- else {
+ else { /* bits == 1, then val should be ignored */
if (min < start)
min = start;
for (key = min; key <= max; key++) {
HV* otherhv;
STRLEN otherbits;
SV **otherbitssvp, *other;
- U8 *s, *o;
+ U8 *s, *o, *nl;
STRLEN slen, olen;
U8 opc = *x++;
break;
}
}
- else { /* bits >= 8 */
- /* XXX: but weirdly otherval is treated as boolean */
+ else {
STRLEN otheroctets = otherbits >> 3;
STRLEN offset = 0;
U8* send = s + slen;
}
}
- if (opc == '+' && otherval)
- otherval = 1;
+ if (opc == '+' && otherval)
+ ; /* replace with otherval */
else if (opc == '!' && !otherval)
otherval = 1;
else if (opc == '-' && otherval)
else if (opc == '&' && !otherval)
otherval = 0;
else {
- s += octets; /* not modify orig swatch */
+ s += octets; /* no replacement */
continue;
}