From: Jarkko Hietaniemi Date: Sat, 27 Oct 2001 16:47:07 +0000 (+0000) Subject: Unicode: property alias naming cleanup. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e150c829b97ab41f324abbc8734f50bd5f4a4838;p=p5sagit%2Fp5-mst-13.2.git Unicode: property alias naming cleanup. p4raw-id: //depot/perl@12707 --- diff --git a/lib/unicore/Is.pl b/lib/unicore/Is.pl index 6ee87e3..86e5926 100644 --- a/lib/unicore/Is.pl +++ b/lib/unicore/Is.pl @@ -3,48 +3,43 @@ # Any changes made here will be lost! %utf8::Is = ( -'Close Punctuation' => 'Pe', -'Connector Punctuation' => 'Pc', +'Close_Punctuation' => 'Pe', +'Connector_Punctuation' => 'Pc', 'Control' => 'Cc', -'Currency Symbol' => 'Sc', -'Dash Punctuation' => 'Pd', -'Decimal Digit Number' => 'Nd', -'Enclosing Mark' => 'Me', -'Final Punctuation' => 'Pf', +'Currency_Symbol' => 'Sc', +'Dash_Punctuation' => 'Pd', +'Decimal_Number' => 'Nd', +'Enclosing_Mark' => 'Me', +'Final_Punctuation' => 'Pf', 'Format' => 'Cf', -'Initial Punctuation' => 'Pi', +'Initial_Punctuation' => 'Pi', 'Letter' => 'L', -'Letter Number' => 'Nl', -'Line Separator' => 'Zl', -'Lowercase Letter' => 'Ll', +'Letter_Number' => 'Nl', +'Line_Separator' => 'Zl', +'Lowercase_Letter' => 'Ll', 'Mark' => 'M', -'Math Symbol' => 'Sm', -'Modifier Letter' => 'Lm', -'Modifier Symbol' => 'Sk', -'Non-Spacing Mark' => 'Mn', -'Not Assigned' => 'Cn', +'Math_Symbol' => 'Sm', +'Modifier_Letter' => 'Lm', +'Modifier_Symbol' => 'Sk', +'Non_Spacing_Mark' => 'Mn', 'Number' => 'N', -'Open Punctuation' => 'Ps', +'Open_Punctuation' => 'Ps', 'Other' => 'C', -'Other Control' => 'Cc', -'Other Format' => 'Cf', -'Other Letter' => 'Lo', -'Other Not Assigned' => 'Cn', -'Other Number' => 'No', -'Other Private Use' => 'Co', -'Other Punctuation' => 'Po', -'Other Surrogate' => 'Cs', -'Other Symbol' => 'So', -'Paragraph Separator' => 'Zp', +'Other_Letter' => 'Lo', +'Other_Number' => 'No', +'Other_Punctuation' => 'Po', +'Other_Symbol' => 'So', +'Paragraph_Separator' => 'Zp', 
'Private Use' => 'Co', 'Punctuation' => 'P', 'Separator' => 'Z', -'Space Separator' => 'Zs', -'Spacing Combining Mark' => 'Mc', +'Space_Separator' => 'Zs', +'Spacing_Mark' => 'Mc', 'Surrogate' => 'Cs', 'Symbol' => 'S', -'Titlecase Letter' => 'Lt', -'Uppercase Letter' => 'Lu', +'Titlecase_Letter' => 'Lt', +'Unassigned' => 'Cn', +'Uppercase_Letter' => 'Lu', ); %utf8::IsPat = ( @@ -62,7 +57,7 @@ 'Dash(?:[-_]|\s+)?Punctuation' => 'Pd', }, 'de' => { - 'Decimal(?:[-_]|\s+)?Digit(?:[-_]|\s+)?Number' => 'Nd', + 'Decimal(?:[-_]|\s+)?Number' => 'Nd', }, 'en' => { 'Enclosing(?:[-_]|\s+)?Mark' => 'Me', @@ -96,7 +91,6 @@ }, 'no' => { 'Non(?:[-_]|\s+)?Spacing(?:[-_]|\s+)?Mark' => 'Mn', - 'Not(?:[-_]|\s+)?Assigned' => 'Cn', }, 'nu' => { 'Number' => 'N', @@ -106,14 +100,9 @@ }, 'ot' => { 'Other' => 'C', - 'Other(?:[-_]|\s+)?Control' => 'Cc', - 'Other(?:[-_]|\s+)?Format' => 'Cf', 'Other(?:[-_]|\s+)?Letter' => 'Lo', - 'Other(?:[-_]|\s+)?Not(?:[-_]|\s+)?Assigned' => 'Cn', 'Other(?:[-_]|\s+)?Number' => 'No', - 'Other(?:[-_]|\s+)?Private(?:[-_]|\s+)?Use' => 'Co', 'Other(?:[-_]|\s+)?Punctuation' => 'Po', - 'Other(?:[-_]|\s+)?Surrogate' => 'Cs', 'Other(?:[-_]|\s+)?Symbol' => 'So', }, 'pa' => { @@ -130,7 +119,7 @@ }, 'sp' => { 'Space(?:[-_]|\s+)?Separator' => 'Zs', - 'Spacing(?:[-_]|\s+)?Combining(?:[-_]|\s+)?Mark' => 'Mc', + 'Spacing(?:[-_]|\s+)?Mark' => 'Mc', }, 'su' => { 'Surrogate' => 'Cs', @@ -141,6 +130,9 @@ 'ti' => { 'Titlecase(?:[-_]|\s+)?Letter' => 'Lt', }, +'un' => { + 'Unassigned' => 'Cn', +}, 'up' => { 'Uppercase(?:[-_]|\s+)?Letter' => 'Lu', }, diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 66027a5..676e189 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -658,54 +658,48 @@ foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) { my %Is = ( 'Letter' => 'L', - 'Uppercase Letter' => 'Lu', - 'Lowercase Letter' => 'Ll', - 'Titlecase Letter' => 'Lt', - 'Modifier Letter' => 'Lm', - 'Other Letter' => 'Lo', + 'Uppercase_Letter' => 'Lu', + 
'Lowercase_Letter' => 'Ll', + 'Titlecase_Letter' => 'Lt', + 'Modifier_Letter' => 'Lm', + 'Other_Letter' => 'Lo', 'Mark' => 'M', - 'Non-Spacing Mark' => 'Mn', - 'Spacing Combining Mark' => 'Mc', - 'Enclosing Mark' => 'Me', + 'Non_Spacing_Mark' => 'Mn', + 'Spacing_Mark' => 'Mc', + 'Enclosing_Mark' => 'Me', 'Separator' => 'Z', - 'Space Separator' => 'Zs', - 'Line Separator' => 'Zl', - 'Paragraph Separator' => 'Zp', + 'Space_Separator' => 'Zs', + 'Line_Separator' => 'Zl', + 'Paragraph_Separator' => 'Zp', 'Number' => 'N', - 'Decimal Digit Number' => 'Nd', - 'Letter Number' => 'Nl', - 'Other Number' => 'No', + 'Decimal_Number' => 'Nd', + 'Letter_Number' => 'Nl', + 'Other_Number' => 'No', 'Punctuation' => 'P', - 'Connector Punctuation' => 'Pc', - 'Dash Punctuation' => 'Pd', - 'Open Punctuation' => 'Ps', - 'Close Punctuation' => 'Pe', - 'Initial Punctuation' => 'Pi', - 'Final Punctuation' => 'Pf', - 'Other Punctuation' => 'Po', + 'Connector_Punctuation' => 'Pc', + 'Dash_Punctuation' => 'Pd', + 'Open_Punctuation' => 'Ps', + 'Close_Punctuation' => 'Pe', + 'Initial_Punctuation' => 'Pi', + 'Final_Punctuation' => 'Pf', + 'Other_Punctuation' => 'Po', 'Symbol' => 'S', - 'Math Symbol' => 'Sm', - 'Currency Symbol' => 'Sc', - 'Modifier Symbol' => 'Sk', - 'Other Symbol' => 'So', + 'Math_Symbol' => 'Sm', + 'Currency_Symbol' => 'Sc', + 'Modifier_Symbol' => 'Sk', + 'Other_Symbol' => 'So', 'Other' => 'C', 'Control' => 'Cc', 'Format' => 'Cf', 'Surrogate' => 'Cs', 'Private Use' => 'Co', - 'Not Assigned' => 'Cn', - # 'Other' aliases - 'Other Control' => 'Cc', - 'Other Format' => 'Cf', - 'Other Surrogate' => 'Cs', - 'Other Private Use' => 'Co', - 'Other Not Assigned' => 'Cn', + 'Unassigned' => 'Cn', ); # diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 37e2f22..0b52afa 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -173,85 +173,85 @@ are available, such as C<\p{IsMirrored}> and C<\p{InTibetan}>. 
The C<\p{Is...}> test for "general properties" such as "letter", "digit", while the C<\p{In...}> test for Unicode scripts and blocks. -The official Unicode script and block names have spaces and -dashes and separators, but for convenience you can have -dashes, spaces, and underbars at every word division, and -you need not care about correct casing. It is recommended, -however, that for consistency you use the following naming: -the official Unicode script or block name (see below for -the additional rules that apply to block names), with the whitespace -and dashes removed, and the words "uppercase-first-lowercase-otherwise". -That is, "Latin-1 Supplement" becomes "Latin1Supplement". +The official Unicode script and block names have spaces and dashes and +separators, but for convenience you can have dashes, spaces, and +underbars at every word division, and you need not care about correct +casing. It is recommended, however, that for consistency you use the +following naming: the official Unicode script, block, or property name +(see below for the additional rules that apply to block names), +with whitespace and dashes replaced with underbar, and the words +"uppercase-first-lowercase-rest". That is, "Latin-1 Supplement" +becomes "Latin_1_Supplement". You can also negate both C<\p{}> and C<\P{}> by introducing a caret -(^) between the first curly and the property name: C<\p{^InTamil}> is -equal to C<\P{InTamil}>. +(^) between the first curly and the property name: C<\p{^In_Tamil}> is +equal to C<\P{In_Tamil}>. The C<In> and C<Is> can be left out: C<\p{Greek}> is equal to -C<\p{InGreek}>, C<\P{Pd}> is equal to C<\P{Pd}>. +C<\p{In_Greek}>, C<\P{Pd}> is equal to C<\P{Pd}>. 
Short Long L Letter - Lu Uppercase Letter - Ll Lowercase Letter - Lt Titlecase Letter - Lm Modifier Letter - Lo Other Letter + Lu Uppercase_Letter + Ll Lowercase_Letter + Lt Titlecase_Letter + Lm Modifier_Letter + Lo Other_Letter M Mark - Mn Non-Spacing Mark - Mc Spacing Combining Mark - Me Enclosing Mark + Mn Nonspacing_Mark + Mc Spacing_Mark + Me Enclosing_Mark N Number - Nd Decimal Digit Number - Nl Letter Number - No Other Number + Nd Decimal_Number + Nl Letter_Number + No Other_Number P Punctuation - Pc Connector Punctuation - Pd Dash Punctuation - Ps Open Punctuation - Pe Close Punctuation - Pi Initial Punctuation + Pc Connector_Punctuation + Pd Dash_Punctuation + Ps Open_Punctuation + Pe Close_Punctuation + Pi Initial_Punctuation (may behave like Ps or Pe depending on usage) - Pf Final Punctuation + Pf Final_Punctuation (may behave like Ps or Pe depending on usage) - Po Other Punctuation + Po Other_Punctuation S Symbol - Sm Math Symbol - Sc Currency Symbol - Sk Modifier Symbol - So Other Symbol + Sm Math_Symbol + Sc Currency_Symbol + Sk Modifier_Symbol + So Other_Symbol Z Separator - Zs Space Separator - Zl Line Separator - Zp Paragraph Separator + Zs Space_Separator + Zl Line_Separator + Zp Paragraph_Separator C Other - Cc (Other) Control - Cf (Other) Format - Cs (Other) Surrogate - Co (Other) Private Use - Cn (Other) Not Assigned + Cc Control + Cf Format + Cs Surrogate + Co Private_Use + Cn Unassigned There's also C<L&> which is an alias for C<Ll>, C<Lu>, and C<Lt>. 
The following reserved ranges have C<In> tests: - CJK Ideograph Extension A - CJK Ideograph - Hangul Syllable - Non Private Use High Surrogate - Private Use High Surrogate - Low Surrogate - Private Surrogate - CJK Ideograph Extension B - Plane 15 Private Use - Plane 16 Private Use + CJK_Ideograph_Extension_A + CJK_Ideograph + Hangul_Syllable + Non_Private_Use_High_Surrogate + Private_Use_High_Surrogate + Low_Surrogate + Private_Surrogate + CJK_Ideograph_Extension_B + Plane_15_Private_Use + Plane_16_Private_Use For example C<"\x{AC00}" =~ \p{HangulSyllable}> will test true. (Handling of surrogates is not implemented yet, because Perl @@ -345,7 +345,7 @@ properties, defined by the F<PropList> Unicode database: Other_Math Other_Uppercase Quotation_Mark - White_space + White_Space and further derived properties: @@ -360,7 +360,7 @@ and further derived properties: Any Any character Assigned Any non-Cn character Common Any character (or unassigned code point) - not explicitly assigned to a script. + not explicitly assigned to a script =head2 Blocks @@ -385,7 +385,7 @@ a script called C<Katakana> and a block called C<Katakana>, the block version has C<Block> appended to its name, C<\p{InKatakanaBlock}>. Notice that this definition was introduced in Perl 5.8.0: in Perl -5.6.0 only the blocks were used; in Perl 5.8.0 scripts became the +5.6 only the blocks were used; in Perl 5.8.0 scripts became the preferential Unicode character class definition; this meant that the definitions of some character classes changed (the ones in the below list that have the C<Block> appended).