From: Jarkko Hietaniemi Date: Fri, 19 Oct 2001 03:25:44 +0000 (+0000) Subject: Unicode categories continue: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=71d929cb3e130a6486f59d4312ce76d7d6eea647;p=p5sagit%2Fp5-mst-13.2.git Unicode categories continue: implement Category=, Script=, Block= (these are based on an upcoming update of TR#18) Fix a bug where we got two In categories named "old italic", and another where shortcut for the Is categories wasn't taken. p4raw-id: //depot/perl@12500 --- diff --git a/lib/unicore/Blocks.pl b/lib/unicore/Blocks.pl index b58ff0d..c1b2617 100644 --- a/lib/unicore/Blocks.pl +++ b/lib/unicore/Blocks.pl @@ -90,7 +90,7 @@ FE70 FEFE Arabic Presentation Forms-B FEFF Specials FF00 FFEF Halfwidth and Fullwidth Forms FFF0 FFFD Specials -10300 1032F Old Italic +10300 1032F Old Italic Block 10330 1034F Gothic Block 10400 1044F Deseret Block 1D000 1D0FF Byzantine Musical Symbols diff --git a/lib/unicore/In.pl b/lib/unicore/In.pl index 8d59516..8e3cdf5 100644 --- a/lib/unicore/In.pl +++ b/lib/unicore/In.pl @@ -133,7 +133,7 @@ 'Number Forms' => '95', 'OGHAM' => '38', 'Ogham Block' => '84', -'Old Italic' => '137', +'Old Italic Block' => '137', 'OLD-ITALIC' => '47', 'Optical Character Recognition' => '100', 'ORIYA' => '23', @@ -405,7 +405,7 @@ 'Ogham(?:[-_]|\s+)?Block' => '84', }, 'ol' => { - 'Old(?:[-_]|\s+)?Italic' => '137', + 'Old(?:[-_]|\s+)?Italic(?:[-_]|\s+)?Block' => '137', 'OLD(?:[-_]|\s+)?ITALIC' => '47', }, 'op' => { @@ -489,3 +489,146 @@ 'Yi(?:[-_]|\s+)?Syllables' => '122', }, ); + +%utf8::InScript = +( + 10 => 'LATIN', + 11 => 'GREEK', + 12 => 'INHERITED', + 13 => 'CYRILLIC', + 14 => 'ARMENIAN', + 15 => 'HEBREW', + 16 => 'ARABIC', + 17 => 'SYRIAC', + 18 => 'THAANA', + 19 => 'DEVANAGARI', + 20 => 'BENGALI', + 21 => 'GURMUKHI', + 22 => 'GUJARATI', + 23 => 'ORIYA', + 24 => 'TAMIL', + 25 => 'TELUGU', + 26 => 'KANNADA', + 27 => 'MALAYALAM', + 28 => 'SINHALA', + 29 => 'THAI', + 30 => 'LAO', + 31 => 'TIBETAN', + 
32 => 'MYANMAR', + 33 => 'GEORGIAN', + 34 => 'HANGUL', + 35 => 'ETHIOPIC', + 36 => 'CHEROKEE', + 37 => 'CANADIAN-ABORIGINAL', + 38 => 'OGHAM', + 39 => 'RUNIC', + 40 => 'KHMER', + 41 => 'MONGOLIAN', + 42 => 'HAN', + 43 => 'HIRAGANA', + 44 => 'KATAKANA', + 45 => 'BOPOMOFO', + 46 => 'YI', + 47 => 'OLD-ITALIC', + 48 => 'GOTHIC', + 49 => 'DESERET', +); + +%utf8::InBlock = +( + 51 => 'Basic Latin', + 52 => 'Latin-1 Supplement', + 53 => 'Latin Extended-A', + 54 => 'Latin Extended-B', + 55 => 'IPA Extensions', + 56 => 'Spacing Modifier Letters', + 57 => 'Combining Diacritical Marks', + 58 => 'Greek', + 59 => 'Cyrillic', + 60 => 'Armenian', + 61 => 'Hebrew', + 62 => 'Arabic', + 63 => 'Syriac', + 64 => 'Thaana', + 65 => 'Devanagari', + 66 => 'Bengali', + 67 => 'Gurmukhi', + 68 => 'Gujarati', + 69 => 'Oriya', + 70 => 'Tamil', + 71 => 'Telugu', + 72 => 'Kannada', + 73 => 'Malayalam', + 74 => 'Sinhala', + 75 => 'Thai', + 76 => 'Lao', + 77 => 'Tibetan', + 78 => 'Myanmar', + 79 => 'Georgian', + 80 => 'Hangul Jamo', + 81 => 'Ethiopic', + 82 => 'Cherokee', + 83 => 'Unified Canadian Aboriginal Syllabics', + 84 => 'Ogham', + 85 => 'Runic', + 86 => 'Khmer', + 87 => 'Mongolian', + 88 => 'Latin Extended Additional', + 89 => 'Greek Extended', + 90 => 'General Punctuation', + 91 => 'Superscripts and Subscripts', + 92 => 'Currency Symbols', + 93 => 'Combining Marks for Symbols', + 94 => 'Letterlike Symbols', + 95 => 'Number Forms', + 96 => 'Arrows', + 97 => 'Mathematical Operators', + 98 => 'Miscellaneous Technical', + 99 => 'Control Pictures', + 100 => 'Optical Character Recognition', + 101 => 'Enclosed Alphanumerics', + 102 => 'Box Drawing', + 103 => 'Block Elements', + 104 => 'Geometric Shapes', + 105 => 'Miscellaneous Symbols', + 106 => 'Dingbats', + 107 => 'Braille Patterns', + 108 => 'CJK Radicals Supplement', + 109 => 'Kangxi Radicals', + 110 => 'Ideographic Description Characters', + 111 => 'CJK Symbols and Punctuation', + 112 => 'Hiragana', + 113 => 'Katakana', + 114 => 
'Bopomofo', + 115 => 'Hangul Compatibility Jamo', + 116 => 'Kanbun', + 117 => 'Bopomofo Extended', + 118 => 'Enclosed CJK Letters and Months', + 119 => 'CJK Compatibility', + 120 => 'CJK Unified Ideographs Extension A', + 121 => 'CJK Unified Ideographs', + 122 => 'Yi Syllables', + 123 => 'Yi Radicals', + 124 => 'Hangul Syllables', + 125 => 'High Surrogates', + 126 => 'High Private Use Surrogates', + 127 => 'Low Surrogates', + 128 => 'CJK Compatibility Ideographs', + 129 => 'Alphabetic Presentation Forms', + 130 => 'Arabic Presentation Forms-A', + 131 => 'Combining Half Marks', + 132 => 'CJK Compatibility Forms', + 133 => 'Small Form Variants', + 134 => 'Arabic Presentation Forms-B', + 135 => 'Specials', + 136 => 'Halfwidth and Fullwidth Forms', + 137 => 'Old Italic', + 138 => 'Gothic', + 139 => 'Deseret', + 140 => 'Byzantine Musical Symbols', + 141 => 'Musical Symbols', + 142 => 'Mathematical Alphanumeric Symbols', + 143 => 'CJK Unified Ideographs Extension B', + 144 => 'CJK Compatibility Ideographs Supplement', + 145 => 'Tags', +); diff --git a/lib/unicore/In/137.pl b/lib/unicore/In/137.pl index 7161573..6be2e0d 100644 --- a/lib/unicore/In/137.pl +++ b/lib/unicore/In/137.pl @@ -2,5 +2,5 @@ # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -10300 1032F Old Italic +10300 1032F Old Italic Block END diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 060a0e6..3328f69 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -103,6 +103,9 @@ my %In; my $InId = 0; my %InIn; +my %InScript; +my %InBlock; + # # Read in the Unicode.txt, the main Unicode database. 
# @@ -355,8 +358,9 @@ for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) { extend($Script{$name}, $last); } unless (defined $In{$name}) { - $In{$name} = $InId++; - $InIn{$name} = $Script{$name}; + $InScript{$InId} = $name; + $In{$name} = $InId++; + $InIn{$name} = $Script{$name}; } } @@ -382,11 +386,19 @@ if (open(my $Blocks, "Blocks.txt")) { next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; my ($first, $last, $name) = ($1, $2, $3); + my $origname = $name; # If there's a naming conflict (the script names are # in uppercase), the name of the block has " Block" # appended to it. - $name = "$name Block" if defined $In{"\U$name"}; + my $pat = $name; + $pat =~ s/([- _])/(?:[-_]|\\s+)?/g; + for my $i (values %InScript) { + if ($i =~ /^$pat$/i) { + $name .= " Block"; + last; + } + } append(\@Blocks, $first, $name); append($Blocks{$name} ||= [], $first, $name); @@ -395,8 +407,9 @@ if (open(my $Blocks, "Blocks.txt")) { extend($Blocks{$name}, $last); } unless (defined $In{$name}) { - $In{$name} = $InId++; - $InIn{$name} = $Blocks{$name}; + $InBlock{$InId} = $origname; + $In{$name} = $InId++; + $InIn{$name} = $Blocks{$name}; } } } else { @@ -591,6 +604,39 @@ EOT mapping(\%In, "In"); +# +# Append the InScript and InBlock mappings. +# These are needed only if Script= and Block= syntaxes are used. 
+# + +if (open(my $In, ">>In.pl")) { + print $In <<EOT; + +%utf8::InScript = +( +EOT + for my $i (sort { $a <=> $b } keys %InScript) { + printf $In "%4d => '$InScript{$i}',\n", $i; + } + print $In <<EOT; +); + +%utf8::InBlock = +( +EOT + for my $i (sort { $a <=> $b } keys %InBlock) { + printf $In "%4d => '$InBlock{$i}',\n", $i; + } + print $In <{$k}; + print "inprefix = $inprefix, In = $In, k = $k, i = $i\n" if DEBUG; + next if $incat =~ /^S/ && + !exists $utf8::InScript{$i}; + next if $incat =~ /^B/ && + !exists $utf8::InBlock{$i}; + $file = "unicore/In/$i"; print "inprefix = $inprefix, In = $In, k = $k, file = $file\n" if DEBUG; last; } diff --git a/pod/perltodo.pod b/pod/perltodo.pod index b903593..5fae97a 100644 --- a/pod/perltodo.pod +++ b/pod/perltodo.pod @@ -77,12 +77,8 @@ Allow for the metaproperties: C, C, C, C (require the DerivedCoreProperties and DerviceNormalizationProperties files). -There are also enumerated properties: C, -C, C, C. These -properties have multiple values: for uniqueness the property -value should be appended. For example, C<\p{IsAlphabetic}> -wouldbe the binary property, while C<\p{AlphabeticLineBreak}> -would mean the enumerated property. +There are also multiple value properties still unimplemented: +C, C. =item * diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 9e3ca75..6bd0423 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -311,7 +311,7 @@ The scripts available for C<\p{In...}> and C<\P{In...}>, for example Hangul Ethiopic Cherokee - CanadianAboriginal + Canadian Aboriginal Ogham Runic Khmer @@ -321,7 +321,7 @@ The scripts available for C<\p{In...}> and C<\P{In...}>, for example Bopomofo Han Yi - OldItalic + Old Italic Gothic Deseret Inherited @@ -386,101 +386,101 @@ preferential Unicode character class definition; this meant that the definitions of some character classes changed (the ones in the below list that have the C<Block> appended). 
- BasicLatin - Latin1Supplement - LatinExtendedA - LatinExtendedB - IPAExtensions - SpacingModifierLetters - CombiningDiacriticalMarks - GreekBlock - CyrillicBlock - ArmenianBlock - HebrewBlock - ArabicBlock - SyriacBlock - ThaanaBlock - DevanagariBlock - BengaliBlock - GurmukhiBlock - GujaratiBlock - OriyaBlock - TamilBlock - TeluguBlock - KannadaBlock - MalayalamBlock - SinhalaBlock - ThaiBlock - LaoBlock - TibetanBlock - MyanmarBlock - GeorgianBlock - HangulJamo - EthiopicBlock - CherokeeBlock - UnifiedCanadianAboriginalSyllabics - OghamBlock - RunicBlock - KhmerBlock - MongolianBlock - LatinExtendedAdditional - GreekExtended - GeneralPunctuation - SuperscriptsandSubscripts - CurrencySymbols - CombiningMarksforSymbols - LetterlikeSymbols - NumberForms + Basic Latin + Latin 1 Supplement + Latin Extended-A + Latin Extended-B + IPA Extensions + Spacing Modifier Letters + Combining Diacritical Marks + Greek Block + Cyrillic Block + Armenian Block + Hebrew Block + Arabic Block + Syriac Block + Thaana Block + Devanagari Block + Bengali Block + Gurmukhi Block + Gujarati Block + Oriya Block + Tamil Block + Telugu Block + Kannada Block + Malayalam Block + Sinhala Block + Thai Block + Lao Block + Tibetan Block + Myanmar Block + Georgian Block + Hangul Jamo + Ethiopic Block + Cherokee Block + Unified Canadian Aboriginal Syllabics + Ogham Block + Runic Block + Khmer Block + Mongolian Block + Latin Extended Additional + Greek Extended + General Punctuation + Superscripts and Subscripts + Currency Symbols + Combining Marks for Symbols + Letterlike Symbols + Number Forms Arrows - MathematicalOperators - MiscellaneousTechnical - ControlPictures - OpticalCharacterRecognition - EnclosedAlphanumerics - BoxDrawing - BlockElements - GeometricShapes - MiscellaneousSymbols + Mathematical Operators + Miscellaneous Technical + Control Pictures + Optical Character Recognition + Enclosed Alphanumerics + Box Drawing + Block Elements + Geometric Shapes + Miscellaneous Symbols Dingbats - 
BraillePatterns - CJKRadicalsSupplement - KangxiRadicals - IdeographicDescriptionCharacters - CJKSymbolsandPunctuation - HiraganaBlock - KatakanaBlock - BopomofoBlock - HangulCompatibilityJamo + Braille Patterns + CJK Radicals Supplement + Kangxi Radicals + Ideographic Description Characters + CJK Symbols and Punctuation + Hiragana Block + Katakana Block + Bopomofo Block + Hangul Compatibility Jamo Kanbun - BopomofoExtended - EnclosedCJKLettersandMonths - CJKCompatibility - CJKUnifiedIdeographsExtensionA - CJKUnifiedIdeographs - YiSyllables - YiRadicals - HangulSyllables - HighSurrogates - HighPrivateUseSurrogates - LowSurrogates - PrivateUse - CJKCompatibilityIdeographs - AlphabeticPresentationForms - ArabicPresentationFormsA - CombiningHalfMarks - CJKCompatibilityForms - SmallFormVariants - ArabicPresentationFormsB + Bopomofo Extended + Enclosed CJK Letters and Months + CJK Compatibility + CJK Unified Ideographs Extension A + CJK Unified Ideographs + Yi Syllables + Yi Radicals + Hangul Syllables + High Surrogates + High Private Use Surrogates + Low Surrogates + Private Use + CJK Compatibility Ideographs + Alphabetic Presentation Forms + Arabic Presentation Forms-A + Combining Half Marks + CJK Compatibility Forms + Small Form Variants + Arabic Presentation Forms-B Specials - HalfwidthandFullwidthForms - OldItalicBlock - GothicBlock - DeseretBlock - ByzantineMusicalSymbols - MusicalSymbols - MathematicalAlphanumericSymbols - CJKUnifiedIdeographsExtensionB - CJKCompatibilityIdeographsSupplement + Halfwidth and Fullwidth Forms + Old Italic Block + Gothic Block + Deseret Block + Byzantine Musical Symbols + Musical Symbols + Mathematical Alphanumeric Symbols + CJK Unified Ideographs Extension B + CJK Compatibility Ideographs Supplement Tags =item * diff --git a/t/op/pat.t b/t/op/pat.t index 0f978d1..6617921 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..747\n"; +print "1..750\n"; BEGIN { chdir 't' if -d 't'; @@ -2243,3 +2243,15 @@ print "# 
some Unicode properties\n"; print "not " unless "\x{AC00}" =~ /\p{HangulSyllable}/; print "ok 747\n"; } + +{ + print "not " unless "\x{0100}" =~ /\p{Script=Latin}/; + print "ok 748\n"; + + print "not " unless "\x{0100}" =~ /\p{Block=LatinExtendedA}/; + print "ok 749\n"; + + print "not " unless "\x{0100}" =~ /\p{Category=UppercaseLetter}/; + print "ok 750\n"; +} +