X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FUnicode%2FUCD.pm;h=1496663c5495a73f0f119efc9ccf8799267e8a1d;hb=06c0cc96ebd866767a6d107ed78967600f7e0395;hp=9dabc5dec6941722e726fc2229f2b2a8b49001bb;hpb=551b6b6ff9895983c94b1aff97abae5f0914a105;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index 9dabc5d..1496663 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -3,7 +3,9 @@ package Unicode::UCD; use strict; use warnings; -our $VERSION = '0.2'; +our $VERSION = '0.23'; + +use Storable qw(dclone); require Exporter; @@ -14,7 +16,8 @@ our @EXPORT_OK = qw(charinfo charblocks charscripts charinrange compexcl - casefold casespec); + casefold casespec + namedseq); use Carp; @@ -31,7 +34,7 @@ Unicode::UCD - Unicode character database my $charblock = charblock($codepoint); use Unicode::UCD 'charscript'; - my $charscript = charblock($codepoint); + my $charscript = charscript($codepoint); use Unicode::UCD 'charblocks'; my $charblocks = charblocks(); @@ -46,6 +49,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); + use Unicode::UCD 'namedseq'; + my $namedseq = namedseq($named_sequence_name); + my $unicode_version = Unicode::UCD::UnicodeVersion(); =head1 DESCRIPTION @@ -62,6 +68,7 @@ my $VERSIONFH; my $COMPEXCLFH; my $CASEFOLDFH; my $CASESPECFH; +my $NAMEDSEQFH; sub openunicode { my ($rfh, @path) = @_; @@ -123,12 +130,13 @@ you will need also the compexcl(), casefold(), and casespec() functions. =cut +# NB: This function is duplicated in charnames.pm sub _getcode { my $arg = shift; - if ($arg =~ /^\d+$/) { + if ($arg =~ /^[1-9]\d*$/) { return $arg; - } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) { + } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { return hex($1); } @@ -209,6 +217,7 @@ sub charinfo { use Search::Dict 1.02; if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { my $line = <$UNICODEFH>; + return unless defined $line; chomp $line; my %prop; @prop{qw( @@ -283,9 +292,9 @@ See also L. If supplied with an argument that can't be a code point, charblock() tries to do the opposite and interpret the argument as a character block. The return value is a I: an anonymous list of lists that contain -I, I code point pairs. You can test whether a -code point is in a range using the L function. If the -argument is not a known charater block, C is returned. +I, I code point pairs. You can test whether +a code point is in a range using the L function. If the +argument is not a known character block, C is returned. =cut @@ -295,6 +304,7 @@ my %BLOCKS; sub _charblocks { unless (@BLOCKS) { if (openunicode(\$BLOCKSFH, "Blocks.txt")) { + local $_; while (<$BLOCKSFH>) { if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { my ($lo, $hi) = (hex($1), hex($2)); @@ -319,7 +329,7 @@ sub charblock { _search(\@BLOCKS, 0, $#BLOCKS, $code); } else { if (exists $BLOCKS{$arg}) { - return $BLOCKS{$arg}; + return dclone $BLOCKS{$arg}; } else { return; } @@ -346,7 +356,7 @@ to do the opposite and interpret the argument as a character script. The return value is a I: an anonymous list of lists that contain I, I code point pairs. You can test whether a code point is in a range using the L function. If the -argument is not a known charater script, C is returned. +argument is not a known character script, C is returned. =cut @@ -356,6 +366,7 @@ my %SCRIPTS; sub _charscripts { unless (@SCRIPTS) { if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { + local $_; while (<$SCRIPTSFH>) { if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); @@ -383,7 +394,7 @@ sub charscript { _search(\@SCRIPTS, 0, $#SCRIPTS, $code); } else { if (exists $SCRIPTS{$arg}) { - return $SCRIPTS{$arg}; + return dclone $SCRIPTS{$arg}; } else { return; } @@ -405,7 +416,7 @@ See also L. sub charblocks { _charblocks() unless %BLOCKS; - return \%BLOCKS; + return dclone \%BLOCKS; } =head2 charscripts @@ -423,7 +434,7 @@ See also L. sub charscripts { _charscripts() unless %SCRIPTS; - return \%SCRIPTS; + return dclone \%SCRIPTS; } =head2 Blocks versus Scripts @@ -455,9 +466,12 @@ any of the 256 code points in the Tibetan block). A I is either a decimal or a hexadecimal scalar designating a Unicode character, or C followed by hexadecimals -designating a Unicode character. Note that Unicode is B limited -to 16 bits (the number of Unicode characters is open-ended, in theory -unlimited): you may have more than 4 hexdigits. +designating a Unicode character. In other words, if you want a code +point to be interpreted as a hexadecimal number, you must prefix it +with either C<0x> or C, because a string like e.g. C<123> will +be interpreted as a decimal code point. Also note that Unicode is +B limited to 16 bits (the number of Unicode characters is +open-ended, in theory unlimited): you may have more than 4 hexdigits. =head2 charinrange @@ -493,8 +507,9 @@ my %COMPEXCL; sub _compexcl { unless (%COMPEXCL) { if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { + local $_; while (<$COMPEXCLFH>) { - if (/^([0-9A-F]+) \# /) { + if (/^([0-9A-F]+)\s+\#\s+/) { my $code = hex($1); $COMPEXCL{$code} = undef; } @@ -519,7 +534,7 @@ sub compexcl { use Unicode::UCD 'casefold'; - my %casefold = casefold("09dc"); + my $casefold = casefold("00DF"); The casefold() returns the locale-independent case folding of the character specified by a B. @@ -563,6 +578,7 @@ my %CASEFOLD; sub _casefold { unless (%CASEFOLD) { if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { + local $_; while (<$CASEFOLDFH>) { if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { my $code = hex($1); @@ -591,7 +607,7 @@ sub casefold { use Unicode::UCD 'casespec'; - my %casespec = casespec("09dc"); + my $casespec = casespec("FB00"); The casespec() returns the potentially locale-dependent case mapping of the character specified by a B. The mapping @@ -614,7 +630,7 @@ more I or I, separated by spaces (other than as used to separate elements, spaces are to be ignored). A condition list overrides the normal behavior if all of the listed conditions are true. Case distinctions in the condition list are not significant. -Conditions preceded by "NON_" represent the negation of the condition +Conditions preceded by "NON_" represent the negation of the condition. Note that when there are multiple case folding definitions for a single code point because of different locales, the value returned by @@ -643,6 +659,7 @@ my %CASESPEC; sub _casespec { unless (%CASESPEC) { if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { + local $_; while (<$CASESPECFH>) { if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { my ($hexcode, $lower, $title, $upper, $condition) = @@ -658,9 +675,9 @@ sub _casespec { title upper condition)}; - my ($oldlocale) = + if (defined $oldcondition) { + my ($oldlocale) = ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); - if (defined $oldlocale) { delete $CASESPEC{$code}; $CASESPEC{$code}->{$oldlocale} = { code => $hexcode, @@ -668,8 +685,6 @@ sub _casespec { title => $oldtitle, upper => $oldupper, condition => $oldcondition }; - } else { - warn __PACKAGE__, ": SpecialCasing.txt:", $., ": No oldlocale for 0x$hexcode\n" } } my ($locale) = @@ -703,7 +718,64 @@ sub casespec { _casespec() unless %CASESPEC; - return $CASESPEC{$code}; + return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; +} + +=head2 namedseq() + + use Unicode::UCD 'namedseq'; + + my $namedseq = namedseq("KATAKANA LETTER AINU P"); + my @namedseq = namedseq("KATAKANA LETTER AINU P"); + my %namedseq = namedseq(); + +If used with a single argument in a scalar context, returns the string +consisting of the code points of the named sequence, or C if no +named sequence by that name exists. If used with a single argument in +a list context, returns list of the code points. If used with no +arguments in a list context, returns a hash with the names of the +named sequences as the keys and the named sequences as strings as +the values. Otherwise, returns C or empty list depending +on the context. + +(New from Unicode 4.1.0) + +=cut + +my %NAMEDSEQ; + +sub _namedseq { + unless (%NAMEDSEQ) { + if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { + local $_; + while (<$NAMEDSEQFH>) { + if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { + my ($n, $s) = ($1, $2); + my @s = map { chr(hex($_)) } split(' ', $s); + $NAMEDSEQ{$n} = join("", @s); + } + } + close($NAMEDSEQFH); + } + } +} + +sub namedseq { + _namedseq() unless %NAMEDSEQ; + my $wantarray = wantarray(); + if (defined $wantarray) { + if ($wantarray) { + if (@_ == 0) { + return %NAMEDSEQ; + } elsif (@_ == 1) { + my $s = $NAMEDSEQ{ $_[0] }; + return defined $s ? map { ord($_) } split('', $s) : (); + } + } elsif (@_ == 1) { + return $NAMEDSEQ{ $_[0] }; + } + } + return; } =head2 Unicode::UCD::UnicodeVersion