From: Jarkko Hietaniemi Date: Wed, 14 Nov 2001 14:59:32 +0000 (+0000) Subject: The First, Last ranges in the Unicode data weren't X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e904f99525ffc0cd5f09346758a1931019c2f0b0;p=p5sagit%2Fp5-mst-13.2.git The First, Last ranges in the Unicode data weren't getting their general categories added properly; noticed by Jeffrey Friedl. p4raw-id: //depot/perl@12994 --- diff --git a/lib/unicore/Category.pl b/lib/unicore/Category.pl index 6f0979d..e8f676c 100644 --- a/lib/unicore/Category.pl +++ b/lib/unicore/Category.pl @@ -1381,12 +1381,17 @@ return <<'END'; 3300 3376 So 337B 33DD So 33E0 33FE So +3400 4DB5 Lo +4E00 9FA5 Lo A000 A48C Lo A490 A4A1 So A4A4 A4B3 So A4B5 A4C0 So A4C2 A4C4 So A4C6 So +AC00 D7A3 Lo +D800 DFFF Cs +E000 F8FF Co F900 FA2D Lo FB00 FB06 Ll FB13 FB17 Ll @@ -1587,7 +1592,10 @@ FFFC FFFD So 1D7C3 Sm 1D7C4 1D7C9 Ll 1D7CE 1D7FF Nd +20000 2A6D6 Lo 2F800 2FA1D Lo E0001 Cf E0020 E007F Cf +F0000 FFFFD Co +100000 10FFFD Co END diff --git a/lib/unicore/In/0.pl b/lib/unicore/In/0.pl index 6b95de3..db52684f 100644 --- a/lib/unicore/In/0.pl +++ b/lib/unicore/In/0.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -3400 4DB5 CJK Ideograph Extension A +3400 4DB5 Lo END diff --git a/lib/unicore/In/1.pl b/lib/unicore/In/1.pl index 3ef3166..e1894b8 100644 --- a/lib/unicore/In/1.pl +++ b/lib/unicore/In/1.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -4E00 9FA5 CJK Ideograph +4E00 9FA5 Lo END diff --git a/lib/unicore/In/164.pl b/lib/unicore/In/164.pl index afa40c3..6a1e2c3 100644 --- a/lib/unicore/In/164.pl +++ b/lib/unicore/In/164.pl @@ -309,13 +309,10 @@ return <<'END'; 3105 312C 3131 318E 31A0 31B7 -3400 -4DB5 -4E00 -9FA5 +3400 4DB5 +4E00 9FA5 A000 A48C -AC00 -D7A3 +AC00 D7A3 F900 FA2D FB00 FB06 FB13 FB17 @@ -378,7 +375,6 @@ FFDA FFDC 1D78A 1D7A8 1D7AA 1D7C2 1D7C4 1D7C9 -20000 -2A6D6 +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/In/169.pl b/lib/unicore/In/169.pl index 570636e..b41f21d 100644 --- a/lib/unicore/In/169.pl +++ b/lib/unicore/In/169.pl @@ -935,13 +935,10 @@ return <<'END'; 3105 312C 3131 318E 31A0 31B7 -3400 -4DB5 -4E00 -9FA5 +3400 4DB5 +4E00 9FA5 A000 A48C -AC00 -D7A3 +AC00 D7A3 F900 FA2D FB00 FB06 FB13 FB17 @@ -1034,7 +1031,6 @@ FFDA FFDC 1D790 1D7A8 1D7AA 1D7C2 1D7C4 1D7C9 -20000 -2A6D6 +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/In/170.pl b/lib/unicore/In/170.pl index a97c18f..30cbfe9 100644 --- a/lib/unicore/In/170.pl +++ b/lib/unicore/In/170.pl @@ -1099,13 +1099,10 @@ return <<'END'; 3105 312C 3131 318E 31A0 31B7 -3400 -4DB5 -4E00 -9FA5 +3400 4DB5 +4E00 9FA5 A000 A48C -AC00 -D7A3 +AC00 D7A3 F900 FA2D FB00 FB06 FB13 FB17 @@ -1212,7 +1209,6 @@ FFDA FFDC 1D7AA 1D7C2 1D7C4 1D7C9 1D7CE 1D7FF -20000 -2A6D6 +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/In/2.pl b/lib/unicore/In/2.pl index eec928f..c16f7d1 100644 --- a/lib/unicore/In/2.pl +++ b/lib/unicore/In/2.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -AC00 D7A3 Hangul Syllable +AC00 D7A3 Lo END diff --git a/lib/unicore/In/3.pl b/lib/unicore/In/3.pl index 5df4d54..2ca13f4 100644 --- a/lib/unicore/In/3.pl +++ b/lib/unicore/In/3.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -D800 DB7F Non Private Use High Surrogate +D800 DB7F Cs END diff --git a/lib/unicore/In/4.pl b/lib/unicore/In/4.pl index f33e5c3..acf09cc 100644 --- a/lib/unicore/In/4.pl +++ b/lib/unicore/In/4.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -DB80 DBFF Private Use High Surrogate +DB80 DBFF Cs END diff --git a/lib/unicore/In/5.pl b/lib/unicore/In/5.pl index fd896ff..15c3f92 100644 --- a/lib/unicore/In/5.pl +++ b/lib/unicore/In/5.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -DC00 DFFF Low Surrogate +DC00 DFFF Cs END diff --git a/lib/unicore/In/6.pl b/lib/unicore/In/6.pl index 1404dba..fc31fb8 100644 --- a/lib/unicore/In/6.pl +++ b/lib/unicore/In/6.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -E000 F8FF Private Use +E000 F8FF Co END diff --git a/lib/unicore/In/7.pl b/lib/unicore/In/7.pl index f5481cc..8eb0eee 100644 --- a/lib/unicore/In/7.pl +++ b/lib/unicore/In/7.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -20000 2A6D6 CJK Ideograph Extension B +20000 2A6D6 Lo END diff --git a/lib/unicore/In/8.pl b/lib/unicore/In/8.pl index be01ceb..5c82bcd 100644 --- a/lib/unicore/In/8.pl +++ b/lib/unicore/In/8.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -F0000 FFFFD Plane 15 Private Use +F0000 FFFFD Co END diff --git a/lib/unicore/In/9.pl b/lib/unicore/In/9.pl index 8eb12d1..ec7132a 100644 --- a/lib/unicore/In/9.pl +++ b/lib/unicore/In/9.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -100000 10FFFD Plane 16 Private Use +100000 10FFFD Co END diff --git a/lib/unicore/Is/Alnum.pl b/lib/unicore/Is/Alnum.pl index eb97eb8..97858ab 100644 --- a/lib/unicore/Is/Alnum.pl +++ b/lib/unicore/Is/Alnum.pl @@ -325,7 +325,10 @@ return <<'END'; 31A0 31B7 3220 3229 3280 3289 +3400 4DB5 +4E00 9FA5 A000 A48C +AC00 D7A3 F900 FA2D FB00 FB06 FB13 FB17 @@ -394,5 +397,6 @@ FFDA FFDC 1D7AA 1D7C2 1D7C4 1D7C9 1D7CE 1D7FF +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/Is/Alpha.pl b/lib/unicore/Is/Alpha.pl index cbd65d0..b8dc6c4 100644 --- a/lib/unicore/Is/Alpha.pl +++ b/lib/unicore/Is/Alpha.pl @@ -295,7 +295,10 @@ return <<'END'; 3105 312C 3131 318E 31A0 31B7 +3400 4DB5 +4E00 9FA5 A000 A48C +AC00 D7A3 F900 FA2D FB00 FB06 FB13 FB17 @@ -361,5 +364,6 @@ FFDA FFDC 1D78A 1D7A8 1D7AA 1D7C2 1D7C4 1D7C9 +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/Is/C.pl b/lib/unicore/Is/C.pl index 199094f..b58d48d 100644 --- a/lib/unicore/Is/C.pl +++ b/lib/unicore/Is/C.pl @@ -9,18 +9,12 @@ return <<'END'; 200C 200F 202A 202E 206A 206F -D800 -DB7F DB80 -DBFF DC00 -DFFF E000 -F8FF +D800 F8FF FEFF FFF9 FFFB 1D173 1D17A E0001 E0020 E007F -F0000 -FFFFD -100000 -10FFFD +F0000 FFFFD +100000 10FFFD END diff --git a/lib/unicore/Is/Cntrl.pl b/lib/unicore/Is/Cntrl.pl index 818cbc0..b58d48d 100644 --- a/lib/unicore/Is/Cntrl.pl +++ b/lib/unicore/Is/Cntrl.pl @@ -9,9 +9,12 @@ return <<'END'; 200C 200F 202A 202E 206A 206F +D800 F8FF FEFF FFF9 FFFB 1D173 1D17A E0001 E0020 E007F +F0000 FFFFD +100000 10FFFD END diff --git a/lib/unicore/Is/Co.pl b/lib/unicore/Is/Co.pl index b7ee129..04f3129 100644 --- a/lib/unicore/Is/Co.pl +++ b/lib/unicore/Is/Co.pl @@ -2,10 +2,7 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -E000 -F8FF -F0000 -FFFFD -100000 -10FFFD +E000 F8FF +F0000 FFFFD +100000 10FFFD END diff --git a/lib/unicore/Is/Cs.pl b/lib/unicore/Is/Cs.pl index 79facec..bd71bd1 100644 --- a/lib/unicore/Is/Cs.pl +++ b/lib/unicore/Is/Cs.pl @@ -2,8 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -D800 -DB7F DB80 -DBFF DC00 -DFFF +D800 DFFF END diff --git a/lib/unicore/Is/Graph.pl b/lib/unicore/Is/Graph.pl index 15c9f1f..5c13624 100644 --- a/lib/unicore/Is/Graph.pl +++ b/lib/unicore/Is/Graph.pl @@ -319,13 +319,16 @@ return <<'END'; 3300 3376 337B 33DD 33E0 33FE +3400 4DB5 +4E00 9FA5 A000 A48C A490 A4A1 A4A4 A4B3 A4B5 A4C0 A4C2 A4C4 A4C6 -F900 FA2D +AC00 D7A3 +E000 FA2D FB00 FB06 FB13 FB17 FB1D FB36 @@ -386,5 +389,8 @@ FFFC FFFD 1D552 1D6A3 1D6A8 1D7C9 1D7CE 1D7FF +20000 2A6D6 2F800 2FA1D +F0000 FFFFD +100000 10FFFD END diff --git a/lib/unicore/Is/L.pl b/lib/unicore/Is/L.pl index bb34126..811603b 100644 --- a/lib/unicore/Is/L.pl +++ b/lib/unicore/Is/L.pl @@ -228,13 +228,10 @@ return <<'END'; 3105 312C 3131 318E 31A0 31B7 -3400 -4DB5 -4E00 -9FA5 +3400 4DB5 +4E00 9FA5 A000 A48C -AC00 -D7A3 +AC00 D7A3 F900 FA2D FB00 FB06 FB13 FB17 @@ -295,7 +292,6 @@ FFDA FFDC 1D78A 1D7A8 1D7AA 1D7C2 1D7C4 1D7C9 -20000 -2A6D6 +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/Is/Lo.pl b/lib/unicore/Is/Lo.pl index ff84f2b..726bbf7 100644 --- a/lib/unicore/Is/Lo.pl +++ b/lib/unicore/Is/Lo.pl @@ -161,13 +161,10 @@ return <<'END'; 3105 312C 3131 318E 31A0 31B7 -3400 -4DB5 -4E00 -9FA5 +3400 4DB5 +4E00 9FA5 A000 A48C -AC00 -D7A3 +AC00 D7A3 F900 FA2D FB1D FB1F FB28 @@ -193,7 +190,6 @@ FFD2 FFD7 FFDA FFDC 10300 1031E 10330 10349 -20000 -2A6D6 +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/Is/Print.pl b/lib/unicore/Is/Print.pl index 27eb056..0b94722 100644 --- a/lib/unicore/Is/Print.pl +++ b/lib/unicore/Is/Print.pl @@ -320,13 +320,16 @@ return <<'END'; 3300 3376 337B 33DD 33E0 33FE +3400 4DB5 +4E00 9FA5 A000 A48C A490 A4A1 A4A4 A4B3 A4B5 A4C0 A4C2 A4C4 A4C6 -F900 FA2D +AC00 D7A3 +E000 FA2D FB00 FB06 FB13 FB17 FB1D FB36 @@ -387,5 +390,8 @@ FFFC FFFD 1D552 1D6A3 1D6A8 1D7C9 1D7CE 1D7FF +20000 2A6D6 2F800 2FA1D +F0000 FFFFD +100000 10FFFD END diff --git a/lib/unicore/Is/Word.pl b/lib/unicore/Is/Word.pl index 437c067..baba914 100644 --- a/lib/unicore/Is/Word.pl +++ b/lib/unicore/Is/Word.pl @@ -326,7 +326,10 @@ return <<'END'; 31A0 31B7 3220 3229 3280 3289 +3400 4DB5 +4E00 9FA5 A000 A48C +AC00 D7A3 F900 FA2D FB00 FB06 FB13 FB17 @@ -395,5 +398,6 @@ FFDA FFDC 1D7AA 1D7C2 1D7C4 1D7C9 1D7CE 1D7FF +20000 2A6D6 2F800 2FA1D END diff --git a/lib/unicore/Name.pl b/lib/unicore/Name.pl index de76f40..860f087 100644 --- a/lib/unicore/Name.pl +++ b/lib/unicore/Name.pl @@ -7950,6 +7950,8 @@ return <<'END'; 33FC IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY TWENTY-NINE 33FD IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY 33FE IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE +3400 4DB5 CJK Ideograph Extension A +4E00 9FA5 CJK Ideograph A000 YI SYLLABLE IT A001 YI SYLLABLE IX A002 YI SYLLABLE I @@ -9165,6 +9167,11 @@ A4C2 YI RADICAL SHOP A4C3 YI RADICAL CHE A4C4 YI RADICAL ZZIET A4C6 YI RADICAL KE +AC00 D7A3 Hangul Syllable +D800 DB7F Non Private Use High Surrogate +DB80 DBFF Private Use High Surrogate +DC00 DFFF Low Surrogate +E000 F8FF Private Use F900 CJK COMPATIBILITY IDEOGRAPH-F900 F901 CJK COMPATIBILITY IDEOGRAPH-F901 F902 CJK COMPATIBILITY IDEOGRAPH-F902 @@ -12138,6 +12145,7 @@ FFFD REPLACEMENT CHARACTER 1D7FD MATHEMATICAL MONOSPACE DIGIT SEVEN 1D7FE MATHEMATICAL MONOSPACE DIGIT EIGHT 1D7FF MATHEMATICAL MONOSPACE DIGIT NINE +20000 2A6D6 CJK Ideograph Extension B 2F800 CJK COMPATIBILITY IDEOGRAPH-2F800 2F801 CJK COMPATIBILITY IDEOGRAPH-2F801 2F802 CJK COMPATIBILITY IDEOGRAPH-2F802 @@ -12777,4 +12785,6 @@ E007C TAG VERTICAL LINE E007D TAG RIGHT CURLY BRACKET E007E TAG TILDE E007F CANCEL TAG +F0000 FFFFD Plane 15 Private Use +100000 10FFFD Plane 16 Private Use END diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 676e189..5615aee 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -114,6 +114,55 @@ my %Cat; my %General; my @General; +sub gencat { + my ($Name, $GeneralH, $GeneralA, $Cat, + $name, $cat, $code, $op) = @_; + + $op->($Name, $code, $name); + $op->($GeneralA, $code, $cat); + + $op->($GeneralH->{$name} ||= [], $code, $name); + + $op->($Cat->{$cat} ||= [], $code); + $op->($Cat->{substr($cat, 0, 1)} + ||= [], $code); + # 005F: SPACING UNDERSCORE + $op->($Cat->{Word} ||= [], $code) + if $cat =~ /^[LMN]/ or $code eq "005F"; + $op->($Cat->{Alnum} ||= [], $code) + if $cat =~ /^[LMN]/; + $op->($Cat->{Alpha} ||= [], $code) + if $cat =~ /^[LM]/; + # 0009: HORIZONTAL TABULATION + # 000A: LINE FEED + # 000B: VERTICAL TABULATION + # 000C: FORM FEED + # 000D: CARRIAGE RETURN + # 0020: SPACE + $op->($Cat->{Space} ||= [], $code) + if $cat =~ /^Z/ || + $code =~ /^(0009|000A|000B|000C|000D)$/; + $op->($Cat->{SpacePerl} ||= [], $code) + if $cat =~ /^Z/ || + $code =~ /^(0009|000A|000C|000D)$/; + $op->($Cat->{Blank} ||= [], $code) + if $code =~ /^(0020|0009)$/ || + $cat =~ /^Z[^lp]$/; + $op->($Cat->{Digit} ||= [], $code) if $cat eq "Nd"; + $op->($Cat->{Upper} ||= [], $code) if $cat eq "Lu"; + $op->($Cat->{Lower} ||= [], $code) if $cat eq "Ll"; + $op->($Cat->{Title} ||= [], $code) if $cat eq "Lt"; + $op->($Cat->{ASCII} ||= [], $code) if $code le "007F"; + $op->($Cat->{Cntrl} ||= [], $code) if $cat =~ /^C/; + $op->($Cat->{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; + $op->($Cat->{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; + $op->($Cat->{Punct} ||= [], $code) if $cat =~ /^P/; + # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f + $op->($Cat->{XDigit} ||= [], $code) + if $code =~ /^00(3[0-9]|[46][1-6])$/; + +} + if (open(my $Unicode, "Unicode.txt")) { my @Name; my @Bidi; @@ -136,61 +185,18 @@ if (open(my $Unicode, "Unicode.txt")) { if ($name =~ /^<(.+), (First|Last)>$/) { $name = $1; - if ($2 eq 'First') { - append($General{$name} ||= [], $code, $name); - } else { - extend($General{$name} , $code); - } + gencat(\@Name, \%General, \@General, \%Cat, + $name, $cat, $code, + $2 eq 'First' ? \&append : \&extend); unless (defined $In{$name}) { $In{$name} = $InId++; $InIn{$name} = $General{$name}; } - append($Cat{$cat} ||= [], $code); - append($Cat{substr($cat, 0, 1)} - ||= [], $code); } else { - append(\@Name, $code, $name); - - append(\@General, $code, $cat); - - append($Cat{$cat} ||= [], $code); - append($Cat{substr($cat, 0, 1)} - ||= [], $code); - # 005F: SPACING UNDERSCORE - append($Cat{Word} ||= [], $code) - if $cat =~ /^[LMN]/ or $code eq "005F"; - append($Cat{Alnum} ||= [], $code) - if $cat =~ /^[LMN]/; - append($Cat{Alpha} ||= [], $code) - if $cat =~ /^[LM]/; - # 0009: HORIZONTAL TABULATION - # 000A: LINE FEED - # 000B: VERTICAL TABULATION - # 000C: FORM FEED - # 000D: CARRIAGE RETURN - # 0020: SPACE - append($Cat{Space} ||= [], $code) - if $cat =~ /^Z/ || - $code =~ /^(0009|000A|000B|000C|000D)$/; - append($Cat{SpacePerl} ||= [], $code) - if $cat =~ /^Z/ || - $code =~ /^(0009|000A|000C|000D)$/; - append($Cat{Blank} ||= [], $code) - if $code =~ /^(0020|0009)$/ || - $cat =~ /^Z[^lp]$/; - append($Cat{Digit} ||= [], $code) if $cat eq "Nd"; - append($Cat{Upper} ||= [], $code) if $cat eq "Lu"; - append($Cat{Lower} ||= [], $code) if $cat eq "Ll"; - append($Cat{Title} ||= [], $code) if $cat eq "Lt"; - append($Cat{ASCII} ||= [], $code) if $code le "007F"; - append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/; - append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; - append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; - append($Cat{Punct} ||= [], $code) if $cat =~ /^P/; - # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f - append($Cat{XDigit} ||= [], $code) - if $code =~ /^00(3[0-9]|[46][1-6])$/; - + + gencat(\@Name, \%General, \@General, \%Cat, + $name, $cat, $code, \&append); + append($To{Upper} ||= [], $code, $upper) if $upper; append($To{Lower} ||= [], $code, $lower) if $lower; append($To{Title} ||= [], $code, $title) if $title; @@ -653,7 +659,7 @@ foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) { # # The mapping from General Category long forms to short forms is # currently hardwired here since no simple data file in the UCD -# seems to do that. +# seems to do that. Unicode 3.2 will assumedly correct this. # my %Is = (