From: Jarkko Hietaniemi Date: Fri, 18 May 2007 03:45:29 +0000 (+0300) Subject: Unicode::UCD: add general category and bidi type interfaces X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ea508aee27b2f1cd6703f49dc50601d7c9c3d20a;p=p5sagit%2Fp5-mst-13.2.git Unicode::UCD: add general category and bidi type interfaces Message-Id: <200705180045.l4I0jTeI221780@kosh.hut.fi> p4raw-id: //depot/perl@31237 --- diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index 6a2b5e1..23feae0 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -3,7 +3,7 @@ package Unicode::UCD; use strict; use warnings; -our $VERSION = '0.24'; +our $VERSION = '0.25'; use Storable qw(dclone); @@ -15,6 +15,7 @@ our @EXPORT_OK = qw(charinfo charblock charscript charblocks charscripts charinrange + general_categories bidi_types compexcl casefold casespec namedseq); @@ -40,12 +41,16 @@ Unicode::UCD - Unicode character database my $charblocks = charblocks(); use Unicode::UCD 'charscripts'; - my %charscripts = charscripts(); + my $charscripts = charscripts(); use Unicode::UCD qw(charscript charinrange); my $range = charscript($script); print "looks like $script\n" if charinrange($range, $codepoint); + use Unicode::UCD qw(general_categories bidi_types); + my $categories = general_categories(); + my $types = bidi_types(); + use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); @@ -102,7 +107,7 @@ as defined by the Unicode standard: name name of the character IN UPPER CASE category general category of the character combining classes used in the Canonical Ordering Algorithm - bidi bidirectional category + bidi bidirectional type decomposition character decomposition mapping decimal if decimal digit this is the integer numeric value digit if digit this is the numeric value @@ -423,10 +428,11 @@ sub charblocks { use Unicode::UCD 'charscripts'; - my %charscripts = charscripts(); + my $charscripts = charscripts(); -charscripts() returns a hash with the known script 
names as the keys, -and the code point ranges (see L</charscript>) as the values. +charscripts() returns a reference to a hash with the known script +names as the keys, and the code point ranges (see L</charscript>) as +the values. See also L</Blocks versus Scripts>. @@ -487,6 +493,112 @@ by L</charblock> and L</charscript> by using charinrange(): =cut +my %GENERAL_CATEGORIES = + ( + 'L' => 'Letter', + 'LC' => 'CasedLetter', + 'Lu' => 'UppercaseLetter', + 'Ll' => 'LowercaseLetter', + 'Lt' => 'TitlecaseLetter', + 'Lm' => 'ModifierLetter', + 'Lo' => 'OtherLetter', + 'M' => 'Mark', + 'Mn' => 'NonspacingMark', + 'Mc' => 'SpacingMark', + 'Me' => 'EnclosingMark', + 'N' => 'Number', + 'Nd' => 'DecimalNumber', + 'Nl' => 'LetterNumber', + 'No' => 'OtherNumber', + 'P' => 'Punctuation', + 'Pc' => 'ConnectorPunctuation', + 'Pd' => 'DashPunctuation', + 'Ps' => 'OpenPunctuation', + 'Pe' => 'ClosePunctuation', + 'Pi' => 'InitialPunctuation', + 'Pf' => 'FinalPunctuation', + 'Po' => 'OtherPunctuation', + 'S' => 'Symbol', + 'Sm' => 'MathSymbol', + 'Sc' => 'CurrencySymbol', + 'Sk' => 'ModifierSymbol', + 'So' => 'OtherSymbol', + 'Z' => 'Separator', + 'Zs' => 'SpaceSeparator', + 'Zl' => 'LineSeparator', + 'Zp' => 'ParagraphSeparator', + 'C' => 'Other', + 'Cc' => 'Control', + 'Cf' => 'Format', + 'Cs' => 'Surrogate', + 'Co' => 'PrivateUse', + 'Cn' => 'Unassigned', + ); + +sub general_categories { + return dclone \%GENERAL_CATEGORIES; +} + +=head2 general_categories + + use Unicode::UCD 'general_categories'; + + my $categories = general_categories(); + +general_categories() returns a reference to a hash which has short +general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long +names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>, +C<Symbol>) as values. The hash is reversible in case you need to go +from the long names to the short names. The general category is the +one returned from charinfo() under the C<category> key. 
+ +=cut + +my %BIDI_TYPES = + ( + 'L' => 'Left-to-Right', + 'LRE' => 'Left-to-Right Embedding', + 'LRO' => 'Left-to-Right Override', + 'R' => 'Right-to-Left', + 'AL' => 'Right-to-Left Arabic', + 'RLE' => 'Right-to-Left Embedding', + 'RLO' => 'Right-to-Left Override', + 'PDF' => 'Pop Directional Format', + 'EN' => 'European Number', + 'ES' => 'European Number Separator', + 'ET' => 'European Number Terminator', + 'AN' => 'Arabic Number', + 'CS' => 'Common Number Separator', + 'NSM' => 'Non-Spacing Mark', + 'BN' => 'Boundary Neutral', + 'B' => 'Paragraph Separator', + 'S' => 'Segment Separator', + 'WS' => 'Whitespace', + 'ON' => 'Other Neutrals', + ); + +sub bidi_types { + return dclone \%BIDI_TYPES; +} + +=head2 bidi_types + + use Unicode::UCD 'bidi_types'; + + my $types = bidi_types(); + +bidi_types() returns a reference to a hash which has the short +bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long +names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The +hash is reversible in case you need to go from the long names to the +short names. The bidi type is the one returned from charinfo() +under the C<bidi> key. 
For the exact meaning of the various bidi classes +the Unicode TR9 is recommended reading: +http://www.unicode.org/reports/tr9/tr9-17.html +(as of Unicode 5.0.0) + +=cut + =head2 compexcl use Unicode::UCD 'compexcl'; diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index 52075b8..3ade6b3 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -18,7 +18,7 @@ use strict; use Unicode::UCD; use Test::More; -BEGIN { plan tests => 188 }; +BEGIN { plan tests => 194 }; use Unicode::UCD 'charinfo'; @@ -238,6 +238,22 @@ ok( charinrange($ranges, "13a0")); ok( charinrange($ranges, "13f4")); ok(!charinrange($ranges, "13f5")); +use Unicode::UCD qw(general_categories); + +my $gc = general_categories(); + +ok(exists $gc->{L}, 'has L'); +is($gc->{L}, 'Letter', 'L is Letter'); +is($gc->{Lu}, 'UppercaseLetter', 'Lu is UppercaseLetter'); + +use Unicode::UCD qw(bidi_types); + +my $bt = bidi_types(); + +ok(exists $bt->{L}, 'has L'); +is($bt->{L}, 'Left-to-Right', 'L is Left-to-Right'); +is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic'); + is(Unicode::UCD::UnicodeVersion, '5.0.0', 'UnicodeVersion'); use Unicode::UCD qw(compexcl);