FEFF Specials
FF00 FFEF Halfwidth and Fullwidth Forms
FFF0 FFFD Specials
-10300 1032F Old Italic
+10300 1032F Old Italic Block
10330 1034F Gothic Block
10400 1044F Deseret Block
1D000 1D0FF Byzantine Musical Symbols
'Number Forms' => '95',
'OGHAM' => '38',
'Ogham Block' => '84',
-'Old Italic' => '137',
+'Old Italic Block' => '137',
'OLD-ITALIC' => '47',
'Optical Character Recognition' => '100',
'ORIYA' => '23',
'Ogham(?:[-_]|\s+)?Block' => '84',
},
'ol' => {
- 'Old(?:[-_]|\s+)?Italic' => '137',
+ 'Old(?:[-_]|\s+)?Italic(?:[-_]|\s+)?Block' => '137',
'OLD(?:[-_]|\s+)?ITALIC' => '47',
},
'op' => {
'Yi(?:[-_]|\s+)?Syllables' => '122',
},
);
+
+%utf8::InScript =
+(
+ 10 => 'LATIN',
+ 11 => 'GREEK',
+ 12 => 'INHERITED',
+ 13 => 'CYRILLIC',
+ 14 => 'ARMENIAN',
+ 15 => 'HEBREW',
+ 16 => 'ARABIC',
+ 17 => 'SYRIAC',
+ 18 => 'THAANA',
+ 19 => 'DEVANAGARI',
+ 20 => 'BENGALI',
+ 21 => 'GURMUKHI',
+ 22 => 'GUJARATI',
+ 23 => 'ORIYA',
+ 24 => 'TAMIL',
+ 25 => 'TELUGU',
+ 26 => 'KANNADA',
+ 27 => 'MALAYALAM',
+ 28 => 'SINHALA',
+ 29 => 'THAI',
+ 30 => 'LAO',
+ 31 => 'TIBETAN',
+ 32 => 'MYANMAR',
+ 33 => 'GEORGIAN',
+ 34 => 'HANGUL',
+ 35 => 'ETHIOPIC',
+ 36 => 'CHEROKEE',
+ 37 => 'CANADIAN-ABORIGINAL',
+ 38 => 'OGHAM',
+ 39 => 'RUNIC',
+ 40 => 'KHMER',
+ 41 => 'MONGOLIAN',
+ 42 => 'HAN',
+ 43 => 'HIRAGANA',
+ 44 => 'KATAKANA',
+ 45 => 'BOPOMOFO',
+ 46 => 'YI',
+ 47 => 'OLD-ITALIC',
+ 48 => 'GOTHIC',
+ 49 => 'DESERET',
+);
+
+%utf8::InBlock =
+(
+ 51 => 'Basic Latin',
+ 52 => 'Latin-1 Supplement',
+ 53 => 'Latin Extended-A',
+ 54 => 'Latin Extended-B',
+ 55 => 'IPA Extensions',
+ 56 => 'Spacing Modifier Letters',
+ 57 => 'Combining Diacritical Marks',
+ 58 => 'Greek',
+ 59 => 'Cyrillic',
+ 60 => 'Armenian',
+ 61 => 'Hebrew',
+ 62 => 'Arabic',
+ 63 => 'Syriac',
+ 64 => 'Thaana',
+ 65 => 'Devanagari',
+ 66 => 'Bengali',
+ 67 => 'Gurmukhi',
+ 68 => 'Gujarati',
+ 69 => 'Oriya',
+ 70 => 'Tamil',
+ 71 => 'Telugu',
+ 72 => 'Kannada',
+ 73 => 'Malayalam',
+ 74 => 'Sinhala',
+ 75 => 'Thai',
+ 76 => 'Lao',
+ 77 => 'Tibetan',
+ 78 => 'Myanmar',
+ 79 => 'Georgian',
+ 80 => 'Hangul Jamo',
+ 81 => 'Ethiopic',
+ 82 => 'Cherokee',
+ 83 => 'Unified Canadian Aboriginal Syllabics',
+ 84 => 'Ogham',
+ 85 => 'Runic',
+ 86 => 'Khmer',
+ 87 => 'Mongolian',
+ 88 => 'Latin Extended Additional',
+ 89 => 'Greek Extended',
+ 90 => 'General Punctuation',
+ 91 => 'Superscripts and Subscripts',
+ 92 => 'Currency Symbols',
+ 93 => 'Combining Marks for Symbols',
+ 94 => 'Letterlike Symbols',
+ 95 => 'Number Forms',
+ 96 => 'Arrows',
+ 97 => 'Mathematical Operators',
+ 98 => 'Miscellaneous Technical',
+ 99 => 'Control Pictures',
+ 100 => 'Optical Character Recognition',
+ 101 => 'Enclosed Alphanumerics',
+ 102 => 'Box Drawing',
+ 103 => 'Block Elements',
+ 104 => 'Geometric Shapes',
+ 105 => 'Miscellaneous Symbols',
+ 106 => 'Dingbats',
+ 107 => 'Braille Patterns',
+ 108 => 'CJK Radicals Supplement',
+ 109 => 'Kangxi Radicals',
+ 110 => 'Ideographic Description Characters',
+ 111 => 'CJK Symbols and Punctuation',
+ 112 => 'Hiragana',
+ 113 => 'Katakana',
+ 114 => 'Bopomofo',
+ 115 => 'Hangul Compatibility Jamo',
+ 116 => 'Kanbun',
+ 117 => 'Bopomofo Extended',
+ 118 => 'Enclosed CJK Letters and Months',
+ 119 => 'CJK Compatibility',
+ 120 => 'CJK Unified Ideographs Extension A',
+ 121 => 'CJK Unified Ideographs',
+ 122 => 'Yi Syllables',
+ 123 => 'Yi Radicals',
+ 124 => 'Hangul Syllables',
+ 125 => 'High Surrogates',
+ 126 => 'High Private Use Surrogates',
+ 127 => 'Low Surrogates',
+ 128 => 'CJK Compatibility Ideographs',
+ 129 => 'Alphabetic Presentation Forms',
+ 130 => 'Arabic Presentation Forms-A',
+ 131 => 'Combining Half Marks',
+ 132 => 'CJK Compatibility Forms',
+ 133 => 'Small Form Variants',
+ 134 => 'Arabic Presentation Forms-B',
+ 135 => 'Specials',
+ 136 => 'Halfwidth and Fullwidth Forms',
+ 137 => 'Old Italic',
+ 138 => 'Gothic',
+ 139 => 'Deseret',
+ 140 => 'Byzantine Musical Symbols',
+ 141 => 'Musical Symbols',
+ 142 => 'Mathematical Alphanumeric Symbols',
+ 143 => 'CJK Unified Ideographs Extension B',
+ 144 => 'CJK Compatibility Ideographs Supplement',
+ 145 => 'Tags',
+);
# This file is built by mktables from e.g. Unicode.txt.
# Any changes made here will be lost!
return <<'END';
-10300 1032F Old Italic
+10300 1032F Old Italic Block
END
my $InId = 0;
my %InIn;
+my %InScript;
+my %InBlock;
+
#
# Read in the Unicode.txt, the main Unicode database.
#
extend($Script{$name}, $last);
}
unless (defined $In{$name}) {
- $In{$name} = $InId++;
- $InIn{$name} = $Script{$name};
+ $InScript{$InId} = $name;
+ $In{$name} = $InId++;
+ $InIn{$name} = $Script{$name};
}
}
next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
my ($first, $last, $name) = ($1, $2, $3);
+ my $origname = $name;
# If there's a naming conflict (the script names are
# in uppercase), the name of the block has " Block"
# appended to it.
- $name = "$name Block" if defined $In{"\U$name"};
+ my $pat = $name;
+ $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
+ for my $i (values %InScript) {
+ if ($i =~ /^$pat$/i) {
+ $name .= " Block";
+ last;
+ }
+ }
append(\@Blocks, $first, $name);
append($Blocks{$name} ||= [], $first, $name);
extend($Blocks{$name}, $last);
}
unless (defined $In{$name}) {
- $In{$name} = $InId++;
- $InIn{$name} = $Blocks{$name};
+ $InBlock{$InId} = $origname;
+ $In{$name} = $InId++;
+ $InIn{$name} = $Blocks{$name};
}
}
} else {
mapping(\%In, "In");
+#
+# Append the InScript and InBlock mappings.
+# These are needed only if Script= and Block= syntaxes are used.
+#
+
+# Write the %utf8::InScript and %utf8::InBlock tables (id => name) to the
+# end of In.pl.  The bare block keeps the file handle scoped to this step.
+{
+    # Three-arg open keeps the append mode separate from the file name.
+    open(my $in_fh, '>>', 'In.pl')
+        or die "$0: In.pl: $!\n";
+
+    # Both tables share one layout, so emit them with the same loop.
+    for my $table ([ 'InScript', \%InScript ], [ 'InBlock', \%InBlock ]) {
+        my ($table_name, $name_of) = @$table;
+
+        print $in_fh "\n%utf8::$table_name =\n(\n";
+
+        # Ids are sorted numerically for stable output.  The name is passed
+        # as a printf argument so it can never be read as format directives.
+        for my $id (sort { $a <=> $b } keys %$name_of) {
+            printf $in_fh "%4d => '%s',\n", $id, $name_of->{$id};
+        }
+
+        print $in_fh ");\n";
+    }
+
+    # Buffered write errors only surface at close, so check it.
+    close($in_fh)
+        or die "$0: In.pl: $!\n";
+}
+
# Easy low-calorie cheat.
use File::Copy;
copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");
unless (defined $file) {
defined %utf8::Is || do "unicore/Is.pl";
- if ($type =~ /^(?:Is)?[- _]?([A-Z].*)$/i) {
+ if ($type =~ /^(?:Is|Category\s*=\s*)?[- _]?([A-Z].*)$/i) {
my $istype = $1;
print "istype = $istype\n" if DEBUG;
unless ($list = do "unicore/Is/$istype.pl") {
unless (defined $file) {
defined %utf8::In || do "unicore/In.pl";
$type = 'Lampersand' if $type =~ /^(?:Is)?L&$/;
- if ($type =~ /^(?:In)?[- _]?(?!herited$)(.+)/i) {
- my $intype = $1;
- print "intype = $intype\n" if DEBUG;
- if (exists $utf8::Is{$istype}) {
+ if ($type =~ /^(In|(?:Script|Block)\s*=\s*)?[- _]?(?!herited$)(.+)/i) {
+ my $incat = $1;
+ my $intype = $2;
+ print "incat = $incat, intype = $intype\n" if DEBUG;
+ if (exists $utf8::In{$intype}) {
$file = "unicore/In/$utf8::In{$intype}";
} else {
my $inprefix = substr(lc($intype), 0, 2);
for my $k (keys %{$utf8::InPat{$inprefix}}) {
print "inprefix = $inprefix, In = $In, k = $k\n" if DEBUG;
if ($In =~ /^$k$/i) {
- $file = "unicore/In/$utf8::InPat{$inprefix}->{$k}";
+ my $i = $utf8::InPat{$inprefix}->{$k};
+ print "inprefix = $inprefix, In = $In, k = $k, i = $i\n" if DEBUG;
+ next if $incat =~ /^S/ &&
+ !exists $utf8::InScript{$i};
+ next if $incat =~ /^B/ &&
+ !exists $utf8::InBlock{$i};
+ $file = "unicore/In/$i";
print "inprefix = $inprefix, In = $In, k = $k, file = $file\n" if DEBUG;
last;
}
C<NF*_NO>, C<NF*_MAYBE> (require the DerivedCoreProperties and
DerivedNormalizationProperties files).
-There are also enumerated properties: C<Decomposition Type>,
-C<Numeric Type>, C<East Asian Width>, C<Line Break>. These
-properties have multiple values: for uniqueness the property
-value should be appended. For example, C<\p{IsAlphabetic}>
-wouldbe the binary property, while C<\p{AlphabeticLineBreak}>
-would mean the enumerated property.
+There are also multi-valued properties that are still unimplemented:
+C<Numeric Type>, C<East Asian Width>.
=item *
Hangul
Ethiopic
Cherokee
- CanadianAboriginal
+ Canadian Aboriginal
Ogham
Runic
Khmer
Bopomofo
Han
Yi
- OldItalic
+ Old Italic
Gothic
Deseret
Inherited
the definitions of some character classes changed (the ones in the
below list that have the C<Block> appended).
- BasicLatin
- Latin1Supplement
- LatinExtendedA
- LatinExtendedB
- IPAExtensions
- SpacingModifierLetters
- CombiningDiacriticalMarks
- GreekBlock
- CyrillicBlock
- ArmenianBlock
- HebrewBlock
- ArabicBlock
- SyriacBlock
- ThaanaBlock
- DevanagariBlock
- BengaliBlock
- GurmukhiBlock
- GujaratiBlock
- OriyaBlock
- TamilBlock
- TeluguBlock
- KannadaBlock
- MalayalamBlock
- SinhalaBlock
- ThaiBlock
- LaoBlock
- TibetanBlock
- MyanmarBlock
- GeorgianBlock
- HangulJamo
- EthiopicBlock
- CherokeeBlock
- UnifiedCanadianAboriginalSyllabics
- OghamBlock
- RunicBlock
- KhmerBlock
- MongolianBlock
- LatinExtendedAdditional
- GreekExtended
- GeneralPunctuation
- SuperscriptsandSubscripts
- CurrencySymbols
- CombiningMarksforSymbols
- LetterlikeSymbols
- NumberForms
+ Basic Latin
+ Latin-1 Supplement
+ Latin Extended-A
+ Latin Extended-B
+ IPA Extensions
+ Spacing Modifier Letters
+ Combining Diacritical Marks
+ Greek Block
+ Cyrillic Block
+ Armenian Block
+ Hebrew Block
+ Arabic Block
+ Syriac Block
+ Thaana Block
+ Devanagari Block
+ Bengali Block
+ Gurmukhi Block
+ Gujarati Block
+ Oriya Block
+ Tamil Block
+ Telugu Block
+ Kannada Block
+ Malayalam Block
+ Sinhala Block
+ Thai Block
+ Lao Block
+ Tibetan Block
+ Myanmar Block
+ Georgian Block
+ Hangul Jamo
+ Ethiopic Block
+ Cherokee Block
+ Unified Canadian Aboriginal Syllabics
+ Ogham Block
+ Runic Block
+ Khmer Block
+ Mongolian Block
+ Latin Extended Additional
+ Greek Extended
+ General Punctuation
+ Superscripts and Subscripts
+ Currency Symbols
+ Combining Marks for Symbols
+ Letterlike Symbols
+ Number Forms
Arrows
- MathematicalOperators
- MiscellaneousTechnical
- ControlPictures
- OpticalCharacterRecognition
- EnclosedAlphanumerics
- BoxDrawing
- BlockElements
- GeometricShapes
- MiscellaneousSymbols
+ Mathematical Operators
+ Miscellaneous Technical
+ Control Pictures
+ Optical Character Recognition
+ Enclosed Alphanumerics
+ Box Drawing
+ Block Elements
+ Geometric Shapes
+ Miscellaneous Symbols
Dingbats
- BraillePatterns
- CJKRadicalsSupplement
- KangxiRadicals
- IdeographicDescriptionCharacters
- CJKSymbolsandPunctuation
- HiraganaBlock
- KatakanaBlock
- BopomofoBlock
- HangulCompatibilityJamo
+ Braille Patterns
+ CJK Radicals Supplement
+ Kangxi Radicals
+ Ideographic Description Characters
+ CJK Symbols and Punctuation
+ Hiragana Block
+ Katakana Block
+ Bopomofo Block
+ Hangul Compatibility Jamo
Kanbun
- BopomofoExtended
- EnclosedCJKLettersandMonths
- CJKCompatibility
- CJKUnifiedIdeographsExtensionA
- CJKUnifiedIdeographs
- YiSyllables
- YiRadicals
- HangulSyllables
- HighSurrogates
- HighPrivateUseSurrogates
- LowSurrogates
- PrivateUse
- CJKCompatibilityIdeographs
- AlphabeticPresentationForms
- ArabicPresentationFormsA
- CombiningHalfMarks
- CJKCompatibilityForms
- SmallFormVariants
- ArabicPresentationFormsB
+ Bopomofo Extended
+ Enclosed CJK Letters and Months
+ CJK Compatibility
+ CJK Unified Ideographs Extension A
+ CJK Unified Ideographs
+ Yi Syllables
+ Yi Radicals
+ Hangul Syllables
+ High Surrogates
+ High Private Use Surrogates
+ Low Surrogates
+ Private Use
+ CJK Compatibility Ideographs
+ Alphabetic Presentation Forms
+ Arabic Presentation Forms-A
+ Combining Half Marks
+ CJK Compatibility Forms
+ Small Form Variants
+ Arabic Presentation Forms-B
Specials
- HalfwidthandFullwidthForms
- OldItalicBlock
- GothicBlock
- DeseretBlock
- ByzantineMusicalSymbols
- MusicalSymbols
- MathematicalAlphanumericSymbols
- CJKUnifiedIdeographsExtensionB
- CJKCompatibilityIdeographsSupplement
+ Halfwidth and Fullwidth Forms
+ Old Italic Block
+ Gothic Block
+ Deseret Block
+ Byzantine Musical Symbols
+ Musical Symbols
+ Mathematical Alphanumeric Symbols
+ CJK Unified Ideographs Extension B
+ CJK Compatibility Ideographs Supplement
Tags
=item *
$| = 1;
-print "1..747\n";
+print "1..750\n";
BEGIN {
chdir 't' if -d 't';
print "not " unless "\x{AC00}" =~ /\p{HangulSyllable}/;
print "ok 747\n";
}
+
+{
+    # Tests 748-750: U+0100 must match when the property is written with
+    # an explicit Script=, Block= or Category= prefix.
+
+    # Script= prefix.
+    if ("\x{0100}" !~ /\p{Script=Latin}/) {
+        print "not ";
+    }
+    print "ok 748\n";
+
+    # Block= prefix.
+    if ("\x{0100}" !~ /\p{Block=LatinExtendedA}/) {
+        print "not ";
+    }
+    print "ok 749\n";
+
+    # Category= prefix.
+    if ("\x{0100}" !~ /\p{Category=UppercaseLetter}/) {
+        print "not ";
+    }
+    print "ok 750\n";
+}
+