X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FUnicode%2FUCD.pm;h=dfdd2dcb51938a515e79c4b4babbadbc81460675;hb=98641f606c65e71cca89f9a694e2796b5a21cbd8;hp=b239c16fc1af4dd7a8b4068eb3d22f22d5fe762c;hpb=92e830a9086d75f086574c378b1c63ff2e00edcf;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index b239c16..dfdd2dc 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -3,7 +3,9 @@ package Unicode::UCD; use strict; use warnings; -our $VERSION = '0.2'; +our $VERSION = '0.22'; + +use Storable qw(dclone); require Exporter; @@ -31,7 +33,7 @@ Unicode::UCD - Unicode character database my $charblock = charblock($codepoint); use Unicode::UCD 'charscript'; - my $charscript = charblock($codepoint); + my $charscript = charscript($codepoint); use Unicode::UCD 'charblocks'; my $charblocks = charblocks(); @@ -123,12 +125,13 @@ you will need also the compexcl(), casefold(), and casespec() functions. =cut +# NB: This function is duplicated in charnames.pm sub _getcode { my $arg = shift; - if ($arg =~ /^\d+$/) { + if ($arg =~ /^[1-9]\d*$/) { return $arg; - } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) { + } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { return hex($1); } @@ -204,11 +207,12 @@ sub charinfo { last; } } - openunicode(\$UNICODEFH, "Unicode.txt"); + openunicode(\$UNICODEFH, "UnicodeData.txt"); if (defined $UNICODEFH) { use Search::Dict 1.02; if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { my $line = <$UNICODEFH>; + return unless defined $line; chomp $line; my %prop; @prop{qw( @@ -295,6 +299,7 @@ my %BLOCKS; sub _charblocks { unless (@BLOCKS) { if (openunicode(\$BLOCKSFH, "Blocks.txt")) { + local $_; while (<$BLOCKSFH>) { if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { my ($lo, $hi) = (hex($1), hex($2)); @@ -319,7 +324,7 @@ sub charblock { _search(\@BLOCKS, 0, $#BLOCKS, $code); } else { if (exists $BLOCKS{$arg}) { - return $BLOCKS{$arg}; + return dclone $BLOCKS{$arg}; } else { return; } @@ -356,6 +361,7 @@ my %SCRIPTS; sub _charscripts { unless (@SCRIPTS) { if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { + local $_; while (<$SCRIPTSFH>) { if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); @@ -383,7 +389,7 @@ sub charscript { _search(\@SCRIPTS, 0, $#SCRIPTS, $code); } else { if (exists $SCRIPTS{$arg}) { - return $SCRIPTS{$arg}; + return dclone $SCRIPTS{$arg}; } else { return; } @@ -405,7 +411,7 @@ See also L. sub charblocks { _charblocks() unless %BLOCKS; - return \%BLOCKS; + return dclone \%BLOCKS; } =head2 charscripts @@ -423,7 +429,7 @@ See also L. sub charscripts { _charscripts() unless %SCRIPTS; - return \%SCRIPTS; + return dclone \%SCRIPTS; } =head2 Blocks versus Scripts @@ -455,9 +461,12 @@ any of the 256 code points in the Tibetan block). A I is either a decimal or a hexadecimal scalar designating a Unicode character, or C followed by hexadecimals -designating a Unicode character. Note that Unicode is B limited -to 16 bits (the number of Unicode characters is open-ended, in theory -unlimited): you may have more than 4 hexdigits. +designating a Unicode character. In other words, if you want a code +point to be interpreted as a hexadecimal number, you must prefix it +with either C<0x> or C, because a string like e.g. C<123> will +be interpreted as a decimal code point. Also note that Unicode is +B limited to 16 bits (the number of Unicode characters is +open-ended, in theory unlimited): you may have more than 4 hexdigits. =head2 charinrange @@ -492,9 +501,10 @@ my %COMPEXCL; sub _compexcl { unless (%COMPEXCL) { - if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) { + if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { + local $_; while (<$COMPEXCLFH>) { - if (/^([0-9A-F]+) \# /) { + if (/^([0-9A-F]+)\s+\#\s+/) { my $code = hex($1); $COMPEXCL{$code} = undef; } @@ -519,7 +529,7 @@ sub compexcl { use Unicode::UCD 'casefold'; - my %casefold = casefold("09dc"); + my $casefold = casefold("00DF"); The casefold() returns the locale-independent case folding of the character specified by a B. @@ -562,7 +572,8 @@ my %CASEFOLD; sub _casefold { unless (%CASEFOLD) { - if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) { + if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { + local $_; while (<$CASEFOLDFH>) { if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { my $code = hex($1); @@ -591,7 +602,7 @@ sub casefold { use Unicode::UCD 'casespec'; - my %casespec = casespec("09dc"); + my $casespec = casespec("FB00"); The casespec() returns the potentially locale-dependent case mapping of the character specified by a B. The mapping @@ -614,7 +625,7 @@ more I or I, separated by spaces (other than as used to separate elements, spaces are to be ignored). A condition list overrides the normal behavior if all of the listed conditions are true. Case distinctions in the condition list are not significant. -Conditions preceded by "NON_" represent the negation of the condition +Conditions preceded by "NON_" represent the negation of the condition. Note that when there are multiple case folding definitions for a single code point because of different locales, the value returned by @@ -642,7 +653,8 @@ my %CASESPEC; sub _casespec { unless (%CASESPEC) { - if (openunicode(\$CASESPECFH, "SpecCase.txt")) { + if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { + local $_; while (<$CASESPECFH>) { if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { my ($hexcode, $lower, $title, $upper, $condition) = @@ -658,9 +670,9 @@ sub _casespec { title upper condition)}; - my ($oldlocale) = + if (defined $oldcondition) { + my ($oldlocale) = ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); - if (defined $oldlocale) { delete $CASESPEC{$code}; $CASESPEC{$code}->{$oldlocale} = { code => $hexcode, @@ -668,8 +680,6 @@ sub _casespec { title => $oldtitle, upper => $oldupper, condition => $oldcondition }; - } else { - warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n" } } my ($locale) = @@ -703,7 +713,7 @@ sub casespec { _casespec() unless %CASESPEC; - return $CASESPEC{$code}; + return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; } =head2 Unicode::UCD::UnicodeVersion