use strict;
use warnings;
-our $VERSION = '0.2';
+our $VERSION = '0.22';
+
+use Storable qw(dclone);
require Exporter;
my $charblock = charblock($codepoint);
use Unicode::UCD 'charscript';
- my $charscript = charblock($codepoint);
+ my $charscript = charscript($codepoint);
use Unicode::UCD 'charblocks';
my $charblocks = charblocks();
=cut
+# NB: This function is duplicated in charnames.pm
sub _getcode {
my $arg = shift;
- if ($arg =~ /^\d+$/) {
+ if ($arg =~ /^[1-9]\d*$/) {
return $arg;
- } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
+ } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
return hex($1);
}
last;
}
}
- openunicode(\$UNICODEFH, "Unicode.txt");
+ openunicode(\$UNICODEFH, "UnicodeData.txt");
if (defined $UNICODEFH) {
use Search::Dict 1.02;
if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
my $line = <$UNICODEFH>;
+ return unless defined $line;
chomp $line;
my %prop;
@prop{qw(
sub _charblocks {
unless (@BLOCKS) {
if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+ local $_;
while (<$BLOCKSFH>) {
if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
my ($lo, $hi) = (hex($1), hex($2));
_search(\@BLOCKS, 0, $#BLOCKS, $code);
} else {
if (exists $BLOCKS{$arg}) {
- return $BLOCKS{$arg};
+ return dclone $BLOCKS{$arg};
} else {
return;
}
sub _charscripts {
unless (@SCRIPTS) {
if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
+ local $_;
while (<$SCRIPTSFH>) {
if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
} else {
if (exists $SCRIPTS{$arg}) {
- return $SCRIPTS{$arg};
+ return dclone $SCRIPTS{$arg};
} else {
return;
}
sub charblocks {
# Fill the %BLOCKS table via _charblocks() the first time through
# (i.e. only while %BLOCKS is still empty).
_charblocks() unless %BLOCKS;
# The added line returns a deep copy (Storable's dclone) of %BLOCKS
# instead of a reference to the package-level hash itself.
- return \%BLOCKS;
+ return dclone \%BLOCKS;
}
=head2 charscripts
sub charscripts {
# Fill the %SCRIPTS table via _charscripts() the first time through
# (i.e. only while %SCRIPTS is still empty).
_charscripts() unless %SCRIPTS;
# The added line returns a deep copy (Storable's dclone) of %SCRIPTS
# instead of a reference to the package-level hash itself.
- return \%SCRIPTS;
+ return dclone \%SCRIPTS;
}
=head2 Blocks versus Scripts
A I<code point argument> is either a decimal or a hexadecimal scalar
designating a Unicode character, or C<U+> followed by hexadecimals
-designating a Unicode character. Note that Unicode is B<not> limited
-to 16 bits (the number of Unicode characters is open-ended, in theory
-unlimited): you may have more than 4 hexdigits.
+designating a Unicode character. In other words, if you want a code
+point to be interpreted as a hexadecimal number, you must prefix it
+with either C<0x> or C<U+>, because a string such as C<123> will
+be interpreted as a decimal code point. Also note that Unicode is
+B<not> limited to 16 bits (the number of Unicode characters is
+open-ended, in theory unlimited): you may have more than 4 hexdigits.
=head2 charinrange
sub _compexcl {
unless (%COMPEXCL) {
- if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+ if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
+ local $_;
while (<$COMPEXCLFH>) {
- if (/^([0-9A-F]+) \# /) {
+ if (/^([0-9A-F]+)\s+\#\s+/) {
my $code = hex($1);
$COMPEXCL{$code} = undef;
}
use Unicode::UCD 'casefold';
- my %casefold = casefold("09dc");
+ my $casefold = casefold("00DF");
The casefold() returns the locale-independent case folding of the
character specified by a B<code point argument>.
sub _casefold {
unless (%CASEFOLD) {
- if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+ if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
+ local $_;
while (<$CASEFOLDFH>) {
if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
my $code = hex($1);
use Unicode::UCD 'casespec';
- my %casespec = casespec("09dc");
+ my $casespec = casespec("FB00");
The casespec() returns the potentially locale-dependent case mapping
of the character specified by a B<code point argument>. The mapping
used to separate elements, spaces are to be ignored). A condition
list overrides the normal behavior if all of the listed conditions are
true. Case distinctions in the condition list are not significant.
-Conditions preceded by "NON_" represent the negation of the condition
+Conditions preceded by "NON_" represent the negation of the condition.
Note that when there are multiple case folding definitions for a
single code point because of different locales, the value returned by
sub _casespec {
unless (%CASESPEC) {
- if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+ if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+ local $_;
while (<$CASESPECFH>) {
if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
my ($hexcode, $lower, $title, $upper, $condition) =
title
upper
condition)};
- my ($oldlocale) =
+ if (defined $oldcondition) {
+ my ($oldlocale) =
($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
- if (defined $oldlocale) {
delete $CASESPEC{$code};
$CASESPEC{$code}->{$oldlocale} =
{ code => $hexcode,
title => $oldtitle,
upper => $oldupper,
condition => $oldcondition };
- } else {
- warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
}
}
my ($locale) =
_casespec() unless %CASESPEC;
- return $CASESPEC{$code};
+ return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
}
=head2 Unicode::UCD::UnicodeVersion