# Any changes made here will be lost!
%utf8::Is =
(
-'Close Punctuation' => 'Pe',
-'Connector Punctuation' => 'Pc',
+'Close_Punctuation' => 'Pe',
+'Connector_Punctuation' => 'Pc',
'Control' => 'Cc',
-'Currency Symbol' => 'Sc',
-'Dash Punctuation' => 'Pd',
-'Decimal Digit Number' => 'Nd',
-'Enclosing Mark' => 'Me',
-'Final Punctuation' => 'Pf',
+'Currency_Symbol' => 'Sc',
+'Dash_Punctuation' => 'Pd',
+'Decimal_Number' => 'Nd',
+'Enclosing_Mark' => 'Me',
+'Final_Punctuation' => 'Pf',
'Format' => 'Cf',
-'Initial Punctuation' => 'Pi',
+'Initial_Punctuation' => 'Pi',
'Letter' => 'L',
-'Letter Number' => 'Nl',
-'Line Separator' => 'Zl',
-'Lowercase Letter' => 'Ll',
+'Letter_Number' => 'Nl',
+'Line_Separator' => 'Zl',
+'Lowercase_Letter' => 'Ll',
'Mark' => 'M',
-'Math Symbol' => 'Sm',
-'Modifier Letter' => 'Lm',
-'Modifier Symbol' => 'Sk',
-'Non-Spacing Mark' => 'Mn',
-'Not Assigned' => 'Cn',
+'Math_Symbol' => 'Sm',
+'Modifier_Letter' => 'Lm',
+'Modifier_Symbol' => 'Sk',
+'Non_Spacing_Mark' => 'Mn',
'Number' => 'N',
-'Open Punctuation' => 'Ps',
+'Open_Punctuation' => 'Ps',
'Other' => 'C',
-'Other Control' => 'Cc',
-'Other Format' => 'Cf',
-'Other Letter' => 'Lo',
-'Other Not Assigned' => 'Cn',
-'Other Number' => 'No',
-'Other Private Use' => 'Co',
-'Other Punctuation' => 'Po',
-'Other Surrogate' => 'Cs',
-'Other Symbol' => 'So',
-'Paragraph Separator' => 'Zp',
+'Other_Letter' => 'Lo',
+'Other_Number' => 'No',
+'Other_Punctuation' => 'Po',
+'Other_Symbol' => 'So',
+'Paragraph_Separator' => 'Zp',
'Private Use' => 'Co',
'Punctuation' => 'P',
'Separator' => 'Z',
-'Space Separator' => 'Zs',
-'Spacing Combining Mark' => 'Mc',
+'Space_Separator' => 'Zs',
+'Spacing_Mark' => 'Mc',
'Surrogate' => 'Cs',
'Symbol' => 'S',
-'Titlecase Letter' => 'Lt',
-'Uppercase Letter' => 'Lu',
+'Titlecase_Letter' => 'Lt',
+'Unassigned' => 'Cn',
+'Uppercase_Letter' => 'Lu',
);
%utf8::IsPat =
(
'Dash(?:[-_]|\s+)?Punctuation' => 'Pd',
},
'de' => {
- 'Decimal(?:[-_]|\s+)?Digit(?:[-_]|\s+)?Number' => 'Nd',
+ 'Decimal(?:[-_]|\s+)?Number' => 'Nd',
},
'en' => {
'Enclosing(?:[-_]|\s+)?Mark' => 'Me',
},
'no' => {
'Non(?:[-_]|\s+)?Spacing(?:[-_]|\s+)?Mark' => 'Mn',
- 'Not(?:[-_]|\s+)?Assigned' => 'Cn',
},
'nu' => {
'Number' => 'N',
},
'ot' => {
'Other' => 'C',
- 'Other(?:[-_]|\s+)?Control' => 'Cc',
- 'Other(?:[-_]|\s+)?Format' => 'Cf',
'Other(?:[-_]|\s+)?Letter' => 'Lo',
- 'Other(?:[-_]|\s+)?Not(?:[-_]|\s+)?Assigned' => 'Cn',
'Other(?:[-_]|\s+)?Number' => 'No',
- 'Other(?:[-_]|\s+)?Private(?:[-_]|\s+)?Use' => 'Co',
'Other(?:[-_]|\s+)?Punctuation' => 'Po',
- 'Other(?:[-_]|\s+)?Surrogate' => 'Cs',
'Other(?:[-_]|\s+)?Symbol' => 'So',
},
'pa' => {
},
'sp' => {
'Space(?:[-_]|\s+)?Separator' => 'Zs',
- 'Spacing(?:[-_]|\s+)?Combining(?:[-_]|\s+)?Mark' => 'Mc',
+ 'Spacing(?:[-_]|\s+)?Mark' => 'Mc',
},
'su' => {
'Surrogate' => 'Cs',
'ti' => {
'Titlecase(?:[-_]|\s+)?Letter' => 'Lt',
},
+'un' => {
+ 'Unassigned' => 'Cn',
+},
'up' => {
'Uppercase(?:[-_]|\s+)?Letter' => 'Lu',
},
my %Is = (
'Letter' => 'L',
- 'Uppercase Letter' => 'Lu',
- 'Lowercase Letter' => 'Ll',
- 'Titlecase Letter' => 'Lt',
- 'Modifier Letter' => 'Lm',
- 'Other Letter' => 'Lo',
+ 'Uppercase_Letter' => 'Lu',
+ 'Lowercase_Letter' => 'Ll',
+ 'Titlecase_Letter' => 'Lt',
+ 'Modifier_Letter' => 'Lm',
+ 'Other_Letter' => 'Lo',
'Mark' => 'M',
- 'Non-Spacing Mark' => 'Mn',
- 'Spacing Combining Mark' => 'Mc',
- 'Enclosing Mark' => 'Me',
+ 'Non_Spacing_Mark' => 'Mn',
+ 'Spacing_Mark' => 'Mc',
+ 'Enclosing_Mark' => 'Me',
'Separator' => 'Z',
- 'Space Separator' => 'Zs',
- 'Line Separator' => 'Zl',
- 'Paragraph Separator' => 'Zp',
+ 'Space_Separator' => 'Zs',
+ 'Line_Separator' => 'Zl',
+ 'Paragraph_Separator' => 'Zp',
'Number' => 'N',
- 'Decimal Digit Number' => 'Nd',
- 'Letter Number' => 'Nl',
- 'Other Number' => 'No',
+ 'Decimal_Number' => 'Nd',
+ 'Letter_Number' => 'Nl',
+ 'Other_Number' => 'No',
'Punctuation' => 'P',
- 'Connector Punctuation' => 'Pc',
- 'Dash Punctuation' => 'Pd',
- 'Open Punctuation' => 'Ps',
- 'Close Punctuation' => 'Pe',
- 'Initial Punctuation' => 'Pi',
- 'Final Punctuation' => 'Pf',
- 'Other Punctuation' => 'Po',
+ 'Connector_Punctuation' => 'Pc',
+ 'Dash_Punctuation' => 'Pd',
+ 'Open_Punctuation' => 'Ps',
+ 'Close_Punctuation' => 'Pe',
+ 'Initial_Punctuation' => 'Pi',
+ 'Final_Punctuation' => 'Pf',
+ 'Other_Punctuation' => 'Po',
'Symbol' => 'S',
- 'Math Symbol' => 'Sm',
- 'Currency Symbol' => 'Sc',
- 'Modifier Symbol' => 'Sk',
- 'Other Symbol' => 'So',
+ 'Math_Symbol' => 'Sm',
+ 'Currency_Symbol' => 'Sc',
+ 'Modifier_Symbol' => 'Sk',
+ 'Other_Symbol' => 'So',
'Other' => 'C',
'Control' => 'Cc',
'Format' => 'Cf',
'Surrogate' => 'Cs',
'Private Use' => 'Co',
- 'Not Assigned' => 'Cn',
- # 'Other' aliases
- 'Other Control' => 'Cc',
- 'Other Format' => 'Cf',
- 'Other Surrogate' => 'Cs',
- 'Other Private Use' => 'Co',
- 'Other Not Assigned' => 'Cn',
+ 'Unassigned' => 'Cn',
);
#
The C<\p{Is...}> test for "general properties" such as "letter",
"digit", while the C<\p{In...}> test for Unicode scripts and blocks.
-The official Unicode script and block names have spaces and
-dashes and separators, but for convenience you can have
-dashes, spaces, and underbars at every word division, and
-you need not care about correct casing. It is recommended,
-however, that for consistency you use the following naming:
-the official Unicode script or block name (see below for
-the additional rules that apply to block names), with the whitespace
-and dashes removed, and the words "uppercase-first-lowercase-otherwise".
-That is, "Latin-1 Supplement" becomes "Latin1Supplement".
+The official Unicode script and block names have spaces and dashes as
+separators, but for convenience you can have dashes, spaces, and
+underbars at every word division, and you need not care about correct
+casing. It is recommended, however, that for consistency you use the
+following naming: the official Unicode script, block, or property name
+(see below for the additional rules that apply to block names),
+with whitespace and dashes replaced with underbars, and each word written
+"uppercase-first-lowercase-rest". That is, "Latin-1 Supplement"
+becomes "Latin_1_Supplement".
You can also negate both C<\p{}> and C<\P{}> by introducing a caret
-(^) between the first curly and the property name: C<\p{^InTamil}> is
-equal to C<\P{InTamil}>.
+(^) between the first curly and the property name: C<\p{^In_Tamil}> is
+equal to C<\P{In_Tamil}>.
The C<In> and C<Is> can be left out: C<\p{Greek}> is equal to
-C<\p{InGreek}>, C<\P{Pd}> is equal to C<\P{Pd}>.
+C<\p{In_Greek}>, C<\P{Pd}> is equal to C<\P{IsPd}>.
Short Long
L Letter
- Lu Uppercase Letter
- Ll Lowercase Letter
- Lt Titlecase Letter
- Lm Modifier Letter
- Lo Other Letter
+ Lu Uppercase_Letter
+ Ll Lowercase_Letter
+ Lt Titlecase_Letter
+ Lm Modifier_Letter
+ Lo Other_Letter
M Mark
- Mn Non-Spacing Mark
- Mc Spacing Combining Mark
- Me Enclosing Mark
+ Mn Non_Spacing_Mark
+ Mc Spacing_Mark
+ Me Enclosing_Mark
N Number
- Nd Decimal Digit Number
- Nl Letter Number
- No Other Number
+ Nd Decimal_Number
+ Nl Letter_Number
+ No Other_Number
P Punctuation
- Pc Connector Punctuation
- Pd Dash Punctuation
- Ps Open Punctuation
- Pe Close Punctuation
- Pi Initial Punctuation
+ Pc Connector_Punctuation
+ Pd Dash_Punctuation
+ Ps Open_Punctuation
+ Pe Close_Punctuation
+ Pi Initial_Punctuation
(may behave like Ps or Pe depending on usage)
- Pf Final Punctuation
+ Pf Final_Punctuation
(may behave like Ps or Pe depending on usage)
- Po Other Punctuation
+ Po Other_Punctuation
S Symbol
- Sm Math Symbol
- Sc Currency Symbol
- Sk Modifier Symbol
- So Other Symbol
+ Sm Math_Symbol
+ Sc Currency_Symbol
+ Sk Modifier_Symbol
+ So Other_Symbol
Z Separator
- Zs Space Separator
- Zl Line Separator
- Zp Paragraph Separator
+ Zs Space_Separator
+ Zl Line_Separator
+ Zp Paragraph_Separator
C Other
- Cc (Other) Control
- Cf (Other) Format
- Cs (Other) Surrogate
- Co (Other) Private Use
- Cn (Other) Not Assigned
+ Cc Control
+ Cf Format
+ Cs Surrogate
+ Co Private_Use
+ Cn Unassigned
There's also C<L&> which is an alias for C<Ll>, C<Lu>, and C<Lt>.
The following reserved ranges have C<In> tests:
- CJK Ideograph Extension A
- CJK Ideograph
- Hangul Syllable
- Non Private Use High Surrogate
- Private Use High Surrogate
- Low Surrogate
- Private Surrogate
- CJK Ideograph Extension B
- Plane 15 Private Use
- Plane 16 Private Use
+ CJK_Ideograph_Extension_A
+ CJK_Ideograph
+ Hangul_Syllable
+ Non_Private_Use_High_Surrogate
+ Private_Use_High_Surrogate
+ Low_Surrogate
+ Private_Surrogate
+ CJK_Ideograph_Extension_B
+ Plane_15_Private_Use
+ Plane_16_Private_Use
For example C<"\x{AC00}" =~ \p{HangulSyllable}> will test true.
(Handling of surrogates is not implemented yet, because Perl
Other_Math
Other_Uppercase
Quotation_Mark
- White_space
+ White_Space
and further derived properties:
Any Any character
Assigned Any non-Cn character
Common Any character (or unassigned code point)
- not explicitly assigned to a script.
+ not explicitly assigned to a script
=head2 Blocks
version has C<Block> appended to its name, C<\p{InKatakanaBlock}>.
Notice that this definition was introduced in Perl 5.8.0: in Perl
-5.6.0 only the blocks were used; in Perl 5.8.0 scripts became the
+5.6 only the blocks were used; in Perl 5.8.0 scripts became the
preferential Unicode character class definition; this meant that
the definitions of some character classes changed (the ones in the
below list that have the C<Block> appended).