use strict;
use warnings;
-our $VERSION = '0.1';
+our $VERSION = '0.23';
+
+use Storable qw(dclone);
require Exporter;
our @ISA = qw(Exporter);
+
our @EXPORT_OK = qw(charinfo
charblock charscript
charblocks charscripts
charinrange
compexcl
- casefold casespec);
+ casefold casespec
+ namedseq);
use Carp;
my $charblock = charblock($codepoint);
use Unicode::UCD 'charscript';
- my $charscript = charblock($codepoint);
+ my $charscript = charscript($codepoint);
+
+ use Unicode::UCD 'charblocks';
+ my $charblocks = charblocks();
+
+ use Unicode::UCD 'charscripts';
+ my %charscripts = charscripts();
+
+ use Unicode::UCD qw(charscript charinrange);
+ my $range = charscript($script);
+ print "looks like $script\n" if charinrange($range, $codepoint);
+
+ use Unicode::UCD 'compexcl';
+ my $compexcl = compexcl($codepoint);
+
+ use Unicode::UCD 'namedseq';
+ my $namedseq = namedseq($named_sequence_name);
+
+ my $unicode_version = Unicode::UCD::UnicodeVersion();
=head1 DESCRIPTION
-The Unicode module offers a simple interface to the Unicode Character
-Database.
+The Unicode::UCD module offers a simple interface to the Unicode
+Character Database.
=cut
my $COMPEXCLFH;
my $CASEFOLDFH;
my $CASESPECFH;
+my $NAMEDSEQFH;
sub openunicode {
my ($rfh, @path) = @_;
unless (defined $$rfh) {
for my $d (@INC) {
use File::Spec;
- $f = File::Spec->catfile($d, "unicode", @path);
+ $f = File::Spec->catfile($d, "unicore", @path);
last if open($$rfh, $f);
undef $f;
}
title titlecase equivalent mapping
block block the character belongs to (used in \p{In...})
- script script the character belongs to
+ script script the character belongs to
If no match is found, a reference to an empty hash is returned.
-The C<block> property is the same as as returned by charinfo(). It is
+The C<block> property is the same as returned by charinfo(). It is
not defined in the Unicode Character Database proper (Chapter 4 of the
-Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
-of TUS3). Similarly for the C<script> property.
+Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
+(Chapter 14 of TUS3). Similarly for the C<script> property.
Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
=cut
+# NB: This function is duplicated in charnames.pm
sub _getcode {
my $arg = shift;
- if ($arg =~ /^\d+$/) {
+ if ($arg =~ /^[1-9]\d*$/) {
return $arg;
- } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
+ } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
return hex($1);
}
return;
}
+# Lingua::KO::Hangul::Util not part of the standard distribution
+# but it will be used if available.
+
+eval { require Lingua::KO::Hangul::Util };
+my $hasHangulUtil = ! $@;
+if ($hasHangulUtil) {
+ Lingua::KO::Hangul::Util->import();
+}
+
+sub hangul_decomp { # internal: called from charinfo
+ if ($hasHangulUtil) {
+ my @tmp = decomposeHangul(shift);
+ return sprintf("%04X %04X", @tmp) if @tmp == 2;
+ return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
+ }
+ return;
+}
+
+sub hangul_charname { # internal: called from charinfo
+ return sprintf("HANGUL SYLLABLE-%04X", shift);
+}
+
+sub han_charname { # internal: called from charinfo
+ return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
+}
+
+my @CharinfoRanges = (
+# block name
+# [ first, last, coderef to name, coderef to decompose ],
+# CJK Ideographs Extension A
+ [ 0x3400, 0x4DB5, \&han_charname, undef ],
+# CJK Ideographs
+ [ 0x4E00, 0x9FA5, \&han_charname, undef ],
+# Hangul Syllables
+ [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
+# Non-Private Use High Surrogates
+ [ 0xD800, 0xDB7F, undef, undef ],
+# Private Use High Surrogates
+ [ 0xDB80, 0xDBFF, undef, undef ],
+# Low Surrogates
+ [ 0xDC00, 0xDFFF, undef, undef ],
+# The Private Use Area
+ [ 0xE000, 0xF8FF, undef, undef ],
+# CJK Ideographs Extension B
+ [ 0x20000, 0x2A6D6, \&han_charname, undef ],
+# Plane 15 Private Use Area
+ [ 0xF0000, 0xFFFFD, undef, undef ],
+# Plane 16 Private Use Area
+ [ 0x100000, 0x10FFFD, undef, undef ],
+);
+
sub charinfo {
my $arg = shift;
my $code = _getcode($arg);
croak __PACKAGE__, "::charinfo: unknown code '$arg'"
unless defined $code;
- my $hexk = sprintf("%04X", $code);
-
- openunicode(\$UNICODEFH, "Unicode.txt");
+ my $hexk = sprintf("%06X", $code);
+ my($rcode,$rname,$rdec);
+ foreach my $range (@CharinfoRanges){
+ if ($range->[0] <= $code && $code <= $range->[1]) {
+ $rcode = $hexk;
+ $rcode =~ s/^0+//;
+ $rcode = sprintf("%04X", hex($rcode));
+ $rname = $range->[2] ? $range->[2]->($code) : '';
+ $rdec = $range->[3] ? $range->[3]->($code) : '';
+ $hexk = sprintf("%06X", $range->[0]); # replace by the first
+ last;
+ }
+ }
+ openunicode(\$UNICODEFH, "UnicodeData.txt");
if (defined $UNICODEFH) {
- use Search::Dict;
- if (look($UNICODEFH, "$hexk;") >= 0) {
+ use Search::Dict 1.02;
+ if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
my $line = <$UNICODEFH>;
+ return unless defined $line;
chomp $line;
my %prop;
@prop{qw(
mirrored unicode10 comment
upper lower title
)} = split(/;/, $line, -1);
+ $hexk =~ s/^0+//;
+ $hexk = sprintf("%04X", hex($hexk));
if ($prop{code} eq $hexk) {
$prop{block} = charblock($code);
$prop{script} = charscript($code);
+ if(defined $rname){
+ $prop{code} = $rcode;
+ $prop{name} = $rname;
+ $prop{decomposition} = $rdec;
+ }
return \%prop;
}
}
my $charblock = charblock("0x263a");
my $charblock = charblock("U+263a");
- my $ranges = charblock('Armenian');
+ my $range = charblock('Armenian');
-With a B<code point argument> charblock() returns the block the character
+With a B<code point argument> charblock() returns the I<block> the character
belongs to, e.g. C<Basic Latin>. Note that not all the character
positions within all blocks are defined.
-If supplied with an argument that can't be a code point, charblock()
-tries to do the opposite and interpret the argument as a character
-block. The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs. You can test whether a code point
-is in a range using the L</charinrange> function. If the argument is
-not a known charater block, C<undef> is returned.
+See also L</Blocks versus Scripts>.
+
+If supplied with an argument that can't be a code point, charblock() tries
+to do the opposite and interpret the argument as a character block. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether
+a code point is in a range using the L</charinrange> function. If the
+argument is not a known character block, C<undef> is returned.
=cut
sub _charblocks {
unless (@BLOCKS) {
if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+ local $_;
while (<$BLOCKSFH>) {
if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
my ($lo, $hi) = (hex($1), hex($2));
_search(\@BLOCKS, 0, $#BLOCKS, $code);
} else {
if (exists $BLOCKS{$arg}) {
- return $BLOCKS{$arg};
+ return dclone $BLOCKS{$arg};
} else {
return;
}
my $charscript = charscript(1234);
my $charscript = charscript("U+263a");
- my $ranges = charscript('Thai');
+ my $range = charscript('Thai');
-With a B<code point argument> charscript() returns the script the
+With a B<code point argument> charscript() returns the I<script> the
character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
-If supplied with an argument that can't be a code point, charscript()
-tries to do the opposite and interpret the argument as a character
-script. The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs. You can test whether a code point
-is in a range using the L</charinrange> function. If the argument is
-not a known charater script, C<undef> is returned.
+See also L</Blocks versus Scripts>.
+
+If supplied with an argument that can't be a code point, charscript() tries
+to do the opposite and interpret the argument as a character script. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known character script, C<undef> is returned.
=cut
sub _charscripts {
unless (@SCRIPTS) {
if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
+ local $_;
while (<$SCRIPTSFH>) {
if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
} else {
if (exists $SCRIPTS{$arg}) {
- return $SCRIPTS{$arg};
+ return dclone $SCRIPTS{$arg};
} else {
return;
}
charblocks() returns a reference to a hash with the known block names
as the keys, and the code point ranges (see L</charblock>) as the values.
+See also L</Blocks versus Scripts>.
+
=cut
sub charblocks {
_charblocks() unless %BLOCKS;
- return \%BLOCKS;
+ return dclone \%BLOCKS;
}
=head2 charscripts
charscripts() returns a hash with the known script names as the keys,
and the code point ranges (see L</charscript>) as the values.
+See also L</Blocks versus Scripts>.
+
=cut
sub charscripts {
_charscripts() unless %SCRIPTS;
- return \%SCRIPTS;
+ return dclone \%SCRIPTS;
}
=head2 Blocks versus Scripts
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
-numbering and separation into blocks of 256 characters.
+numbering and separation into blocks of (mostly) 256 characters.
For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
C<Latin Extended-B>. On the other hand, the Latin script does not
contain all the characters of the C<Basic Latin> block (also known as
-the ASCII): it includes only the letters, not for example the digits
+the ASCII): it includes only the letters, and not, for example, the digits
or the punctuation.
For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
=head2 Matching Scripts and Blocks
-Both scripts and blocks can be matched using the regular expression
-construct C<\p{In...}> and its negation C<\P{In...}>.
-
-The name of the script or the block comes after the C<In>, for example
-C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
-removed from the names for the C<\p{In...}>, for example
-C<LatinExtendedA> instead of C<Latin Extended-A>.
-
-There are a few cases where there exists both a script and a block by
-the same name, in these cases the block version has C<Block> appended:
-C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
+Scripts are matched with the regular-expression construct
+C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
+while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
+any of the 256 code points in the Tibetan block).
=head2 Code Point Arguments
-A <code point argument> is either a decimal or a hexadecimal scalar,
-or "U+" followed by hexadecimals.
+A I<code point argument> is either a decimal or a hexadecimal scalar
+designating a Unicode character, or C<U+> followed by hexadecimals
+designating a Unicode character. In other words, if you want a code
+point to be interpreted as a hexadecimal number, you must prefix it
+with either C<0x> or C<U+>, because a string like e.g. C<123> will
+be interpreted as a decimal code point. Also note that Unicode is
+B<not> limited to 16 bits (the number of Unicode characters is
+open-ended, in theory unlimited): you may have more than 4 hexdigits.
=head2 charinrange
In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
can also test whether a code point is in the I<range> as returned by
L</charblock> and L</charscript> or as the values of the hash returned
-by L</charblocks> and </charscripts> by using charinrange():
+by L</charblocks> and L</charscripts> by using charinrange():
use Unicode::UCD qw(charscript charinrange);
$range = charscript('Hiragana');
- print "looks like hiragana\n" if charinrange($range, $code);
+ print "looks like hiragana\n" if charinrange($range, $codepoint);
=cut
my $compexcl = compexcl("09dc");
The compexcl() returns the composition exclusion (that is, if the
-character cannot be decomposed) of the character specified by a B<code
-point argument>.
+character should not be produced during a precomposition) of the
+character specified by a B<code point argument>.
If there is a composition exclusion for the character, true is
returned. Otherwise, false is returned.
sub _compexcl {
unless (%COMPEXCL) {
- if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+ if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
+ local $_;
while (<$COMPEXCLFH>) {
- if (/^([0-9A-F]+) \# /) {
+ if (/^([0-9A-F]+)\s+\#\s+/) {
my $code = hex($1);
$COMPEXCL{$code} = undef;
}
sub compexcl {
my $arg = shift;
my $code = _getcode($arg);
+ croak __PACKAGE__, "::compexcl: unknown code '$arg'"
+ unless defined $code;
_compexcl() unless %COMPEXCL;
use Unicode::UCD 'casefold';
- my %casefold = casefold("09dc");
+ my $casefold = casefold("00DF");
The casefold() returns the locale-independent case folding of the
character specified by a B<code point argument>.
sub _casefold {
unless (%CASEFOLD) {
- if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+ if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
+ local $_;
while (<$CASEFOLDFH>) {
if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
my $code = hex($1);
sub casefold {
my $arg = shift;
my $code = _getcode($arg);
+ croak __PACKAGE__, "::casefold: unknown code '$arg'"
+ unless defined $code;
_casefold() unless %CASEFOLD;
use Unicode::UCD 'casespec';
- my %casespec = casespec("09dc");
+ my $casespec = casespec("FB00");
The casespec() returns the potentially locale-dependent case mapping
of the character specified by a B<code point argument>. The mapping
used to separate elements, spaces are to be ignored). A condition
list overrides the normal behavior if all of the listed conditions are
true. Case distinctions in the condition list are not significant.
-Conditions preceded by "NON_" represent the negation of the condition
+Conditions preceded by "NON_" represent the negation of the condition.
+
+Note that when there are multiple case folding definitions for a
+single code point because of different locales, the value returned by
+casespec() is a hash reference which has the locales as the keys and
+hash references as described above as the values.
A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
-followed by a "_" and a 2-letter ISO language code (, possibly followed
-by a "_" and a variant code). You can find the list of those codes
-in L<Locale::Country> and L<Locale::Language>.
+followed by a "_" and a 2-letter ISO language code (possibly followed
+by a "_" and a variant code). You can find the lists of those codes,
+see L<Locale::Country> and L<Locale::Language>.
A I<context> is one of the following choices:
FINAL The letter is not followed by a letter of
general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
MODERN The mapping is only used for modern text
- AFTER_i The last base character was "i" 0069
+ AFTER_i The last base character was "i" (U+0069)
For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/
sub _casespec {
unless (%CASESPEC) {
- if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+ if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+ local $_;
while (<$CASESPECFH>) {
if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
- my $code = hex($1);
- $CASESPEC{$code} = { code => $1,
- lower => $2,
- title => $3,
- upper => $4,
- condition => $5 };
+ my ($hexcode, $lower, $title, $upper, $condition) =
+ ($1, $2, $3, $4, $5);
+ my $code = hex($hexcode);
+ if (exists $CASESPEC{$code}) {
+ if (exists $CASESPEC{$code}->{code}) {
+ my ($oldlower,
+ $oldtitle,
+ $oldupper,
+ $oldcondition) =
+ @{$CASESPEC{$code}}{qw(lower
+ title
+ upper
+ condition)};
+ if (defined $oldcondition) {
+ my ($oldlocale) =
+ ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
+ delete $CASESPEC{$code};
+ $CASESPEC{$code}->{$oldlocale} =
+ { code => $hexcode,
+ lower => $oldlower,
+ title => $oldtitle,
+ upper => $oldupper,
+ condition => $oldcondition };
+ }
+ }
+ my ($locale) =
+ ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
+ $CASESPEC{$code}->{$locale} =
+ { code => $hexcode,
+ lower => $lower,
+ title => $title,
+ upper => $upper,
+ condition => $condition };
+ } else {
+ $CASESPEC{$code} =
+ { code => $hexcode,
+ lower => $lower,
+ title => $title,
+ upper => $upper,
+ condition => $condition };
+ }
}
}
close($CASESPECFH);
sub casespec {
my $arg = shift;
my $code = _getcode($arg);
+ croak __PACKAGE__, "::casespec: unknown code '$arg'"
+ unless defined $code;
_casespec() unless %CASESPEC;
- return $CASESPEC{$code};
+ return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
+}
+
+=head2 namedseq()
+
+ use Unicode::UCD 'namedseq';
+
+ my $namedseq = namedseq("KATAKANA LETTER AINU P");
+ my @namedseq = namedseq("KATAKANA LETTER AINU P");
+ my %namedseq = namedseq();
+
+If used with a single argument in a scalar context, returns the string
+consisting of the code points of the named sequence, or C<undef> if no
+named sequence by that name exists. If used with a single argument in
+a list context, returns list of the code points. If used with no
+arguments in a list context, returns a hash with the names of the
+named sequences as the keys and the named sequences as strings as
+the values. Otherwise, returns C<undef> or empty list depending
+on the context.
+
+(New from Unicode 4.1.0)
+
+=cut
+
+my %NAMEDSEQ;
+
+sub _namedseq {
+ unless (%NAMEDSEQ) {
+ if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
+ local $_;
+ while (<$NAMEDSEQFH>) {
+ if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
+ my ($n, $s) = ($1, $2);
+ my @s = map { chr(hex($_)) } split(' ', $s);
+ $NAMEDSEQ{$n} = join("", @s);
+ }
+ }
+ close($NAMEDSEQFH);
+ }
+ }
+}
+
+sub namedseq {
+ _namedseq() unless %NAMEDSEQ;
+ my $wantarray = wantarray();
+ if (defined $wantarray) {
+ if ($wantarray) {
+ if (@_ == 0) {
+ return %NAMEDSEQ;
+ } elsif (@_ == 1) {
+ my $s = $NAMEDSEQ{ $_[0] };
+ return defined $s ? map { ord($_) } split('', $s) : ();
+ }
+ } elsif (@_ == 1) {
+ return $NAMEDSEQ{ $_[0] };
+ }
+ }
+ return;
}
=head2 Unicode::UCD::UnicodeVersion
-Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
-Database, in other words, the version of the Unicode standard the
-database implements.
+Unicode::UCD::UnicodeVersion() returns the version of the Unicode
+Character Database, in other words, the version of the Unicode
+standard the database implements. The version is a string
+of numbers delimited by dots (C<'.'>).
=cut
The first use of charinfo() opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
-The filehandle is then kept open for further queries.
+The filehandle is then kept open for further queries. In other words,
+if you are wondering where one of your filehandles went, that's where.
+
+=head1 BUGS
+
+Does not yet support EBCDIC platforms.
=head1 AUTHOR