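Prettyprinting: mktables now sorts the generated property tables case-insensitively and pads the keys so the '=>' column lines up.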
Jarkko Hietaniemi [Thu, 18 Oct 2001 02:24:52 +0000 (02:24 +0000)]
p4raw-id: //depot/perl@12489

lib/unicore/In.pl
lib/unicore/Is.pl
lib/unicore/mktables

index cd872fa..8d59516 100644 (file)
 # Any changes made here will be lost!
 %utf8::In =
 (
-'ARABIC' => '16',
-'ARMENIAN' => '14',
-'ASCII_Hex_Digit' => '152',
-'Alphabetic' => '164',
-'Alphabetic Presentation Forms' => '129',
-'Any' => '171',
-'Arabic Block' => '62',
-'Arabic Presentation Forms-A' => '130',
-'Arabic Presentation Forms-B' => '134',
-'Armenian Block' => '60',
-'Arrows' => '96',
-'Assigned' => '163',
-'BENGALI' => '20',
-'BOPOMOFO' => '45',
-'Basic Latin' => '51',
-'Bengali Block' => '66',
-'Bidi_Control' => '159',
-'Block Elements' => '103',
-'Bopomofo Block' => '114',
-'Bopomofo Extended' => '117',
-'Box Drawing' => '102',
-'Braille Patterns' => '107',
-'Byzantine Musical Symbols' => '140',
-'CANADIAN-ABORIGINAL' => '37',
-'CHEROKEE' => '36',
-'CJK Compatibility' => '119',
-'CJK Compatibility Forms' => '132',
-'CJK Compatibility Ideographs' => '128',
-'CJK Compatibility Ideographs Supplement' => '144',
-'CJK Ideograph' => '1',
-'CJK Ideograph Extension A' => '0',
-'CJK Ideograph Extension B' => '7',
-'CJK Radicals Supplement' => '108',
-'CJK Symbols and Punctuation' => '111',
-'CJK Unified Ideographs' => '121',
-'CJK Unified Ideographs Extension A' => '120',
-'CJK Unified Ideographs Extension B' => '143',
-'CYRILLIC' => '13',
-'Cherokee Block' => '82',
-'Combining Diacritical Marks' => '57',
-'Combining Half Marks' => '131',
-'Combining Marks for Symbols' => '93',
-'Common' => '50',
-'Control Pictures' => '99',
-'Currency Symbols' => '92',
-'Cyrillic Block' => '59',
-'DESERET' => '49',
-'DEVANAGARI' => '19',
-'Dash' => '151',
-'Deseret Block' => '139',
-'Devanagari Block' => '65',
-'Diacritic' => '154',
-'Dingbats' => '106',
-'ETHIOPIC' => '35',
-'Enclosed Alphanumerics' => '101',
-'Enclosed CJK Letters and Months' => '118',
-'Ethiopic Block' => '81',
-'Extender' => '155',
-'GEORGIAN' => '33',
-'GOTHIC' => '48',
-'GREEK' => '11',
-'GUJARATI' => '22',
-'GURMUKHI' => '21',
-'General Punctuation' => '90',
-'Geometric Shapes' => '104',
-'Georgian Block' => '79',
-'Gothic Block' => '138',
-'Greek Block' => '58',
-'Greek Extended' => '89',
-'Gujarati Block' => '68',
-'Gurmukhi Block' => '67',
-'HAN' => '42',
-'HANGUL' => '34',
-'HEBREW' => '15',
-'HIRAGANA' => '43',
-'Halfwidth and Fullwidth Forms' => '136',
-'Hangul Compatibility Jamo' => '115',
-'Hangul Jamo' => '80',
-'Hangul Syllable' => '2',
-'Hangul Syllables' => '124',
-'Hebrew Block' => '61',
-'Hex_Digit' => '153',
-'High Private Use Surrogates' => '126',
-'High Surrogates' => '125',
-'Hiragana Block' => '112',
-'Hyphen' => '150',
-'ID_Continue' => '170',
-'ID_Start' => '169',
-'INHERITED' => '12',
-'IPA Extensions' => '55',
-'Ideographic' => '161',
-'Ideographic Description Characters' => '110',
-'Join_Control' => '158',
-'KANNADA' => '26',
-'KATAKANA' => '44',
-'KHMER' => '40',
-'Kanbun' => '116',
-'Kangxi Radicals' => '109',
-'Kannada Block' => '72',
-'Katakana Block' => '113',
-'Khmer Block' => '86',
-'LAO' => '30',
-'LATIN' => '10',
-'Lampersand' => '168',
-'Lao Block' => '76',
-'Latin Extended Additional' => '88',
-'Latin Extended-A' => '53',
-'Latin Extended-B' => '54',
-'Latin-1 Supplement' => '52',
-'Letterlike Symbols' => '94',
-'Low Surrogate' => '5',
-'Low Surrogates' => '127',
-'Lowercase' => '165',
-'MALAYALAM' => '27',
-'MONGOLIAN' => '41',
-'MYANMAR' => '32',
-'Malayalam Block' => '73',
-'Math' => '167',
-'Mathematical Alphanumeric Symbols' => '142',
-'Mathematical Operators' => '97',
-'Miscellaneous Symbols' => '105',
-'Miscellaneous Technical' => '98',
-'Mongolian Block' => '87',
-'Musical Symbols' => '141',
-'Myanmar Block' => '78',
-'Non Private Use High Surrogate' => '3',
-'Noncharacter_Code_Point' => '162',
-'Number Forms' => '95',
-'OGHAM' => '38',
-'OLD-ITALIC' => '47',
-'ORIYA' => '23',
-'Ogham Block' => '84',
-'Old Italic' => '137',
-'Optical Character Recognition' => '100',
-'Oriya Block' => '69',
-'Other_Alphabetic' => '157',
-'Other_Lowercase' => '156',
-'Other_Math' => '149',
-'Other_Uppercase' => '160',
-'Plane 15 Private Use' => '8',
-'Plane 16 Private Use' => '9',
-'Private Use' => '6',
-'Private Use High Surrogate' => '4',
-'Quotation_Mark' => '148',
-'RUNIC' => '39',
-'Runic Block' => '85',
-'SINHALA' => '28',
-'SYRIAC' => '17',
-'Sinhala Block' => '74',
-'Small Form Variants' => '133',
-'Spacing Modifier Letters' => '56',
-'Specials' => '135',
-'Superscripts and Subscripts' => '91',
-'Syriac Block' => '63',
-'TAMIL' => '24',
-'TELUGU' => '25',
-'THAANA' => '18',
-'THAI' => '29',
-'TIBETAN' => '31',
-'Tags' => '145',
-'Tamil Block' => '70',
-'Telugu Block' => '71',
-'Terminal_Punctuation' => '147',
-'Thaana Block' => '64',
-'Thai Block' => '75',
-'Tibetan Block' => '77',
-'Unified Canadian Aboriginal Syllabics' => '83',
-'Uppercase' => '166',
-'White_space' => '146',
-'YI' => '46',
-'Yi Radicals' => '123',
-'Yi Syllables' => '122',
+'Alphabetic'                                  => '164',
+'Alphabetic Presentation Forms'               => '129',
+'Any'                                         => '171',
+'ARABIC'                                      => '16',
+'Arabic Block'                                => '62',
+'Arabic Presentation Forms-A'                 => '130',
+'Arabic Presentation Forms-B'                 => '134',
+'ARMENIAN'                                    => '14',
+'Armenian Block'                              => '60',
+'Arrows'                                      => '96',
+'ASCII_Hex_Digit'                             => '152',
+'Assigned'                                    => '163',
+'Basic Latin'                                 => '51',
+'BENGALI'                                     => '20',
+'Bengali Block'                               => '66',
+'Bidi_Control'                                => '159',
+'Block Elements'                              => '103',
+'BOPOMOFO'                                    => '45',
+'Bopomofo Block'                              => '114',
+'Bopomofo Extended'                           => '117',
+'Box Drawing'                                 => '102',
+'Braille Patterns'                            => '107',
+'Byzantine Musical Symbols'                   => '140',
+'CANADIAN-ABORIGINAL'                         => '37',
+'CHEROKEE'                                    => '36',
+'Cherokee Block'                              => '82',
+'CJK Compatibility'                           => '119',
+'CJK Compatibility Forms'                     => '132',
+'CJK Compatibility Ideographs'                => '128',
+'CJK Compatibility Ideographs Supplement'     => '144',
+'CJK Ideograph'                               => '1',
+'CJK Ideograph Extension A'                   => '0',
+'CJK Ideograph Extension B'                   => '7',
+'CJK Radicals Supplement'                     => '108',
+'CJK Symbols and Punctuation'                 => '111',
+'CJK Unified Ideographs'                      => '121',
+'CJK Unified Ideographs Extension A'          => '120',
+'CJK Unified Ideographs Extension B'          => '143',
+'Combining Diacritical Marks'                 => '57',
+'Combining Half Marks'                        => '131',
+'Combining Marks for Symbols'                 => '93',
+'Common'                                      => '50',
+'Control Pictures'                            => '99',
+'Currency Symbols'                            => '92',
+'CYRILLIC'                                    => '13',
+'Cyrillic Block'                              => '59',
+'Dash'                                        => '151',
+'DESERET'                                     => '49',
+'Deseret Block'                               => '139',
+'DEVANAGARI'                                  => '19',
+'Devanagari Block'                            => '65',
+'Diacritic'                                   => '154',
+'Dingbats'                                    => '106',
+'Enclosed Alphanumerics'                      => '101',
+'Enclosed CJK Letters and Months'             => '118',
+'ETHIOPIC'                                    => '35',
+'Ethiopic Block'                              => '81',
+'Extender'                                    => '155',
+'General Punctuation'                         => '90',
+'Geometric Shapes'                            => '104',
+'GEORGIAN'                                    => '33',
+'Georgian Block'                              => '79',
+'GOTHIC'                                      => '48',
+'Gothic Block'                                => '138',
+'GREEK'                                       => '11',
+'Greek Block'                                 => '58',
+'Greek Extended'                              => '89',
+'GUJARATI'                                    => '22',
+'Gujarati Block'                              => '68',
+'GURMUKHI'                                    => '21',
+'Gurmukhi Block'                              => '67',
+'Halfwidth and Fullwidth Forms'               => '136',
+'HAN'                                         => '42',
+'HANGUL'                                      => '34',
+'Hangul Compatibility Jamo'                   => '115',
+'Hangul Jamo'                                 => '80',
+'Hangul Syllable'                             => '2',
+'Hangul Syllables'                            => '124',
+'HEBREW'                                      => '15',
+'Hebrew Block'                                => '61',
+'Hex_Digit'                                   => '153',
+'High Private Use Surrogates'                 => '126',
+'High Surrogates'                             => '125',
+'HIRAGANA'                                    => '43',
+'Hiragana Block'                              => '112',
+'Hyphen'                                      => '150',
+'ID_Continue'                                 => '170',
+'ID_Start'                                    => '169',
+'Ideographic'                                 => '161',
+'Ideographic Description Characters'          => '110',
+'INHERITED'                                   => '12',
+'IPA Extensions'                              => '55',
+'Join_Control'                                => '158',
+'Kanbun'                                      => '116',
+'Kangxi Radicals'                             => '109',
+'KANNADA'                                     => '26',
+'Kannada Block'                               => '72',
+'KATAKANA'                                    => '44',
+'Katakana Block'                              => '113',
+'KHMER'                                       => '40',
+'Khmer Block'                                 => '86',
+'Lampersand'                                  => '168',
+'LAO'                                         => '30',
+'Lao Block'                                   => '76',
+'LATIN'                                       => '10',
+'Latin Extended Additional'                   => '88',
+'Latin Extended-A'                            => '53',
+'Latin Extended-B'                            => '54',
+'Latin-1 Supplement'                          => '52',
+'Letterlike Symbols'                          => '94',
+'Low Surrogate'                               => '5',
+'Low Surrogates'                              => '127',
+'Lowercase'                                   => '165',
+'MALAYALAM'                                   => '27',
+'Malayalam Block'                             => '73',
+'Math'                                        => '167',
+'Mathematical Alphanumeric Symbols'           => '142',
+'Mathematical Operators'                      => '97',
+'Miscellaneous Symbols'                       => '105',
+'Miscellaneous Technical'                     => '98',
+'MONGOLIAN'                                   => '41',
+'Mongolian Block'                             => '87',
+'Musical Symbols'                             => '141',
+'MYANMAR'                                     => '32',
+'Myanmar Block'                               => '78',
+'Non Private Use High Surrogate'              => '3',
+'Noncharacter_Code_Point'                     => '162',
+'Number Forms'                                => '95',
+'OGHAM'                                       => '38',
+'Ogham Block'                                 => '84',
+'Old Italic'                                  => '137',
+'OLD-ITALIC'                                  => '47',
+'Optical Character Recognition'               => '100',
+'ORIYA'                                       => '23',
+'Oriya Block'                                 => '69',
+'Other_Alphabetic'                            => '157',
+'Other_Lowercase'                             => '156',
+'Other_Math'                                  => '149',
+'Other_Uppercase'                             => '160',
+'Plane 15 Private Use'                        => '8',
+'Plane 16 Private Use'                        => '9',
+'Private Use'                                 => '6',
+'Private Use High Surrogate'                  => '4',
+'Quotation_Mark'                              => '148',
+'RUNIC'                                       => '39',
+'Runic Block'                                 => '85',
+'SINHALA'                                     => '28',
+'Sinhala Block'                               => '74',
+'Small Form Variants'                         => '133',
+'Spacing Modifier Letters'                    => '56',
+'Specials'                                    => '135',
+'Superscripts and Subscripts'                 => '91',
+'SYRIAC'                                      => '17',
+'Syriac Block'                                => '63',
+'Tags'                                        => '145',
+'TAMIL'                                       => '24',
+'Tamil Block'                                 => '70',
+'TELUGU'                                      => '25',
+'Telugu Block'                                => '71',
+'Terminal_Punctuation'                        => '147',
+'THAANA'                                      => '18',
+'Thaana Block'                                => '64',
+'THAI'                                        => '29',
+'Thai Block'                                  => '75',
+'TIBETAN'                                     => '31',
+'Tibetan Block'                               => '77',
+'Unified Canadian Aboriginal Syllabics'       => '83',
+'Uppercase'                                   => '166',
+'White_space'                                 => '146',
+'YI'                                          => '46',
+'Yi Radicals'                                 => '123',
+'Yi Syllables'                                => '122',
 );
 %utf8::InPat =
 (
 },
 'ar' => {
        'ARABIC' => '16',
-       'ARMENIAN' => '14',
        'Arabic(?:[-_]|\s+)?Block' => '62',
        'Arabic(?:[-_]|\s+)?Presentation(?:[-_]|\s+)?Forms(?:[-_]|\s+)?A' => '130',
        'Arabic(?:[-_]|\s+)?Presentation(?:[-_]|\s+)?Forms(?:[-_]|\s+)?B' => '134',
+       'ARMENIAN' => '14',
        'Armenian(?:[-_]|\s+)?Block' => '60',
        'Arrows' => '96',
 },
 },
 'de' => {
        'DESERET' => '49',
-       'DEVANAGARI' => '19',
        'Deseret(?:[-_]|\s+)?Block' => '139',
+       'DEVANAGARI' => '19',
        'Devanagari(?:[-_]|\s+)?Block' => '65',
 },
 'di' => {
        'Extender' => '155',
 },
 'ge' => {
-       'GEORGIAN' => '33',
        'General(?:[-_]|\s+)?Punctuation' => '90',
        'Geometric(?:[-_]|\s+)?Shapes' => '104',
+       'GEORGIAN' => '33',
        'Georgian(?:[-_]|\s+)?Block' => '79',
 },
 'go' => {
 },
 'gu' => {
        'GUJARATI' => '22',
-       'GURMUKHI' => '21',
        'Gujarati(?:[-_]|\s+)?Block' => '68',
+       'GURMUKHI' => '21',
        'Gurmukhi(?:[-_]|\s+)?Block' => '67',
 },
 'ha' => {
+       'Halfwidth(?:[-_]|\s+)?and(?:[-_]|\s+)?Fullwidth(?:[-_]|\s+)?Forms' => '136',
        'HAN' => '42',
        'HANGUL' => '34',
-       'Halfwidth(?:[-_]|\s+)?and(?:[-_]|\s+)?Fullwidth(?:[-_]|\s+)?Forms' => '136',
        'Hangul(?:[-_]|\s+)?Compatibility(?:[-_]|\s+)?Jamo' => '115',
        'Hangul(?:[-_]|\s+)?Jamo' => '80',
        'Hangul(?:[-_]|\s+)?Syllable' => '2',
        'Hex(?:[-_]|\s+)?Digit' => '153',
 },
 'hi' => {
-       'HIRAGANA' => '43',
        'High(?:[-_]|\s+)?Private(?:[-_]|\s+)?Use(?:[-_]|\s+)?Surrogates' => '126',
        'High(?:[-_]|\s+)?Surrogates' => '125',
+       'HIRAGANA' => '43',
        'Hiragana(?:[-_]|\s+)?Block' => '112',
 },
 'hy' => {
        'Join(?:[-_]|\s+)?Control' => '158',
 },
 'ka' => {
-       'KANNADA' => '26',
-       'KATAKANA' => '44',
        'Kanbun' => '116',
        'Kangxi(?:[-_]|\s+)?Radicals' => '109',
+       'KANNADA' => '26',
        'Kannada(?:[-_]|\s+)?Block' => '72',
+       'KATAKANA' => '44',
        'Katakana(?:[-_]|\s+)?Block' => '113',
 },
 'kh' => {
        'Khmer(?:[-_]|\s+)?Block' => '86',
 },
 'la' => {
-       'LAO' => '30',
-       'LATIN' => '10',
        'Lampersand' => '168',
+       'LAO' => '30',
        'Lao(?:[-_]|\s+)?Block' => '76',
+       'LATIN' => '10',
        'Latin(?:[-_]|\s+)?Extended(?:[-_]|\s+)?Additional' => '88',
        'Latin(?:[-_]|\s+)?Extended(?:[-_]|\s+)?A' => '53',
        'Latin(?:[-_]|\s+)?Extended(?:[-_]|\s+)?B' => '54',
        'Ogham(?:[-_]|\s+)?Block' => '84',
 },
 'ol' => {
-       'OLD(?:[-_]|\s+)?ITALIC' => '47',
        'Old(?:[-_]|\s+)?Italic' => '137',
+       'OLD(?:[-_]|\s+)?ITALIC' => '47',
 },
 'op' => {
        'Optical(?:[-_]|\s+)?Character(?:[-_]|\s+)?Recognition' => '100',
        'Syriac(?:[-_]|\s+)?Block' => '63',
 },
 'ta' => {
-       'TAMIL' => '24',
        'Tags' => '145',
+       'TAMIL' => '24',
        'Tamil(?:[-_]|\s+)?Block' => '70',
 },
 'te' => {
 },
 'th' => {
        'THAANA' => '18',
-       'THAI' => '29',
        'Thaana(?:[-_]|\s+)?Block' => '64',
+       'THAI' => '29',
        'Thai(?:[-_]|\s+)?Block' => '75',
 },
 'ti' => {
index ef99c29..6ee87e3 100644 (file)
@@ -3,48 +3,48 @@
 # Any changes made here will be lost!
 %utf8::Is =
 (
-'Close Punctuation' => 'Pe',
-'Connector Punctuation' => 'Pc',
-'Control' => 'Cc',
-'Currency Symbol' => 'Sc',
-'Dash Punctuation' => 'Pd',
-'Decimal Digit Number' => 'Nd',
-'Enclosing Mark' => 'Me',
-'Final Punctuation' => 'Pf',
-'Format' => 'Cf',
-'Initial Punctuation' => 'Pi',
-'Letter' => 'L',
-'Letter Number' => 'Nl',
-'Line Separator' => 'Zl',
-'Lowercase Letter' => 'Ll',
-'Mark' => 'M',
-'Math Symbol' => 'Sm',
-'Modifier Letter' => 'Lm',
-'Modifier Symbol' => 'Sk',
-'Non-Spacing Mark' => 'Mn',
-'Not Assigned' => 'Cn',
-'Number' => 'N',
-'Open Punctuation' => 'Ps',
-'Other' => 'C',
-'Other Control' => 'Cc',
-'Other Format' => 'Cf',
-'Other Letter' => 'Lo',
-'Other Not Assigned' => 'Cn',
-'Other Number' => 'No',
-'Other Private Use' => 'Co',
-'Other Punctuation' => 'Po',
-'Other Surrogate' => 'Cs',
-'Other Symbol' => 'So',
-'Paragraph Separator' => 'Zp',
-'Private Use' => 'Co',
-'Punctuation' => 'P',
-'Separator' => 'Z',
-'Space Separator' => 'Zs',
-'Spacing Combining Mark' => 'Mc',
-'Surrogate' => 'Cs',
-'Symbol' => 'S',
-'Titlecase Letter' => 'Lt',
-'Uppercase Letter' => 'Lu',
+'Close Punctuation'                           => 'Pe',
+'Connector Punctuation'                       => 'Pc',
+'Control'                                     => 'Cc',
+'Currency Symbol'                             => 'Sc',
+'Dash Punctuation'                            => 'Pd',
+'Decimal Digit Number'                        => 'Nd',
+'Enclosing Mark'                              => 'Me',
+'Final Punctuation'                           => 'Pf',
+'Format'                                      => 'Cf',
+'Initial Punctuation'                         => 'Pi',
+'Letter'                                      => 'L',
+'Letter Number'                               => 'Nl',
+'Line Separator'                              => 'Zl',
+'Lowercase Letter'                            => 'Ll',
+'Mark'                                        => 'M',
+'Math Symbol'                                 => 'Sm',
+'Modifier Letter'                             => 'Lm',
+'Modifier Symbol'                             => 'Sk',
+'Non-Spacing Mark'                            => 'Mn',
+'Not Assigned'                                => 'Cn',
+'Number'                                      => 'N',
+'Open Punctuation'                            => 'Ps',
+'Other'                                       => 'C',
+'Other Control'                               => 'Cc',
+'Other Format'                                => 'Cf',
+'Other Letter'                                => 'Lo',
+'Other Not Assigned'                          => 'Cn',
+'Other Number'                                => 'No',
+'Other Private Use'                           => 'Co',
+'Other Punctuation'                           => 'Po',
+'Other Surrogate'                             => 'Cs',
+'Other Symbol'                                => 'So',
+'Paragraph Separator'                         => 'Zp',
+'Private Use'                                 => 'Co',
+'Punctuation'                                 => 'P',
+'Separator'                                   => 'Z',
+'Space Separator'                             => 'Zs',
+'Spacing Combining Mark'                      => 'Mc',
+'Surrogate'                                   => 'Cs',
+'Symbol'                                      => 'S',
+'Titlecase Letter'                            => 'Lt',
+'Uppercase Letter'                            => 'Lu',
 );
 %utf8::IsPat =
 (
index f851302..060a0e6 100644 (file)
@@ -545,7 +545,7 @@ sub mapping {
 %utf8::${name} =
 (
 EOT
-        for my $i (sort keys %$map) {
+        for my $i (sort { lc $a cmp lc $b } keys %$map) {
            my $pat = $i;
            # Here is the 'fuzzification': accept any space,
            # dash, or underbar where in the official name
@@ -555,7 +555,7 @@ EOT
            # The prefix length of 2 is enough spread,
            # and besides, we have 'Yi' as an In category.
            push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
-           print $fh "'$i' => '$map->{$i}',\n";
+           printf $fh "%-45s => '$map->{$i}',\n", "'$i'";
        }
         print $fh <<EOT;
 );