From: Jarkko Hietaniemi Date: Sat, 30 Jun 2001 15:53:22 +0000 (+0000) Subject: Add a simple Unicode character database interface, Unicode::UCD. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=561c79edc84c161bc73be7aa0d10cde362797d7d;p=p5sagit%2Fp5-mst-13.2.git Add a simple Unicode character database interface, Unicode::UCD. p4raw-id: //depot/perl@11046 --- diff --git a/MANIFEST b/MANIFEST index 7695b79..12b7047 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1161,6 +1161,8 @@ lib/Time/localtime.pm By-name interface to Perl's builtin localtime lib/Time/localtime.t Test for Time::localtime lib/Time/tm.pm Internal object for Time::{gm,local}time lib/timelocal.pl Perl library supporting inverse of localtime, gmtime +lib/Unicode/UCD.pm Unicode character database +lib/Unicode/UCD.t See if Unicode character database works lib/unicode/ArabLink.pl Unicode character database lib/unicode/ArabLnkGrp.pl Unicode character database lib/unicode/ArabShap.txt Unicode character database diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm new file mode 100644 index 0000000..ab214bb --- /dev/null +++ b/lib/Unicode/UCD.pm @@ -0,0 +1,183 @@ +package Unicode::UCD; + +use strict; +use warnings; + +our $VERSION = v3.1.0; + +require Exporter; + +our @ISA = qw(Exporter); +our @EXPORT_OK = qw(charinfo charblock); + +use Carp; + +=head1 NAME + +Unicode - Unicode character database + +=head1 SYNOPSIS + + use Unicode::UCD 3.1.0; + # requires that level of the Unicode character database + + use Unicode::UCD 'charinfo'; + my %charinfo = charinfo($codepoint); + + use Unicode::UCD 'charblock'; + my $charblock = charblock($codepoint); + +=head1 DESCRIPTION + +The Unicode module offers a simple interface to the Unicode Character +Database. + +=cut + +my $UNICODE; +my $BLOCKS; + +sub openunicode { + my ($rfh, @path) = @_; + my $f; + unless (defined $$rfh) { + for my $d (@INC) { + use File::Spec; + $f = File::Spec->catfile($d, "unicode", @path); + if (open($$rfh, $f)) { + last; + } else { + croak __PACKAGE__, ": open '$f' failed: $!\n"; + } + } + croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n" + unless defined $rfh; + } + return $f; +} + +=head2 charinfo + + use Unicode::UCD 'charinfo'; + + my %charinfo = charinfo(0x41); + +charinfo() returns a hash that has the following fields as defined +by the Unicode standard: + + key + + code code point with at least four hexdigits + name name of the character IN UPPER CASE + category general category of the character + combining classes used in the Canonical Ordering Algorithm + bidi bidirectional category + decomposition character decomposition mapping + decimal if decimal digit this is the integer numeric value + digit if digit this is the numeric value + numeric if numeric is the integer or rational numeric value + mirrored if mirrored in bidirectional text + unicode10 Unicode 1.0 name if existed and different + comment ISO 10646 comment field + upper uppercase equivalent mapping + lower lowercase equivalent mapping + title titlecase equivalent mapping + block block the character belongs to (used in \p{In...}) + +If no match is found, an empty hash is returned. + +The C property is the same as as returned by charinfo(). +(It is not defined in the Unicode Character Database proper but +instead in an auxiliary database.) + +=cut + +sub charinfo { + my $code = shift; + my $hexk = sprintf("%04X", $code); + + openunicode(\$UNICODE, "Unicode.txt"); + if (defined $UNICODE) { + use Search::Dict; + if (look($UNICODE, "$hexk;") >= 0) { + my $line = <$UNICODE>; + chomp $line; + my %prop; + @prop{qw( + code name category + combining bidi decomposition + decimal digit numeric + mirrored unicode10 comment + upper lower title + )} = split(/;/, $line, -1); + if ($prop{code} eq $hexk) { + $prop{block} = charblock($code); + return %prop; + } + } + } + return; +} + +=head2 charbloc + + use Unicode::UCD 'charblock'; + + my $charblock = charblock(0x41); + +charblock() returns the block the character belongs to, e.g. +C. Note that not all the character positions within all +block are defined. + +The name is the same name that is used in the C<\p{In...}> construct, +for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished +away from the names for the C<\p{In...}>. + +=cut + +my @BLOCKS; + +sub _charblock { + my ($code, $lo, $hi) = @_; + + return if $lo > $hi; + + my $mid = int(($lo+$hi) / 2); + + if ($BLOCKS[$mid]->[0] < $code) { + if ($BLOCKS[$mid]->[1] >= $code) { + return $BLOCKS[$mid]->[2]; + } else { + _charblock($code, $mid + 1, $hi); + } + } elsif ($BLOCKS[$mid]->[0] > $code) { + _charblock($code, $lo, $mid - 1); + } else { + return $BLOCKS[$mid]->[2]; + } +} + +sub charblock { + my $code = shift; + + unless (@BLOCKS) { + if (openunicode(\$BLOCKS, "Blocks.pl")) { + while (<$BLOCKS>) { + if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) { + push @BLOCKS, [ hex($1), hex($2), $3 ]; + } + } + close($BLOCKS); + } + } + + _charblock($code, 0, $#BLOCKS); +} + +=head1 AUTHOR + +Jarkko Hietaniemi + +=cut + +1; diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t new file mode 100644 index 0000000..731ac8f --- /dev/null +++ b/lib/Unicode/UCD.t @@ -0,0 +1,110 @@ +use Unicode::UCD 3.1.0; + +use Test; +use strict; + +BEGIN { plan tests => 81 }; + +use Unicode::UCD 'charinfo'; + +my %charinfo; + +%charinfo = charinfo(0x41); + +ok($charinfo{code}, '0041'); +ok($charinfo{name}, 'LATIN CAPITAL LETTER A'); +ok($charinfo{category}, 'Lu'); +ok($charinfo{combining}, '0'); +ok($charinfo{bidi}, 'L'); +ok($charinfo{decomposition}, ''); +ok($charinfo{decimal}, ''); +ok($charinfo{digit}, ''); +ok($charinfo{numeric}, ''); +ok($charinfo{mirrored}, 'N'); +ok($charinfo{unicode10}, ''); +ok($charinfo{comment}, ''); +ok($charinfo{upper}, ''); +ok($charinfo{lower}, '0061'); +ok($charinfo{title}, ''); +ok($charinfo{block}, 'Basic Latin'); + +%charinfo = charinfo(0x100); + +ok($charinfo{code}, '0100'); +ok($charinfo{name}, 'LATIN CAPITAL LETTER A WITH MACRON'); +ok($charinfo{category}, 'Lu'); +ok($charinfo{combining}, '0'); +ok($charinfo{bidi}, 'L'); +ok($charinfo{decomposition}, '0041 0304'); +ok($charinfo{decimal}, ''); +ok($charinfo{digit}, ''); +ok($charinfo{numeric}, ''); +ok($charinfo{mirrored}, 'N'); +ok($charinfo{unicode10}, 'LATIN CAPITAL LETTER A MACRON'); +ok($charinfo{comment}, ''); +ok($charinfo{upper}, ''); +ok($charinfo{lower}, '0101'); +ok($charinfo{title}, ''); +ok($charinfo{block}, 'Latin Extended-A'); + +%charinfo = charinfo(0x590); + +ok($charinfo{code}, undef); +ok($charinfo{name}, undef); +ok($charinfo{category}, undef); +ok($charinfo{combining}, undef); +ok($charinfo{bidi}, undef); +ok($charinfo{decomposition}, undef); +ok($charinfo{decimal}, undef); +ok($charinfo{digit}, undef); +ok($charinfo{numeric}, undef); +ok($charinfo{mirrored}, undef); +ok($charinfo{unicode10}, undef); +ok($charinfo{comment}, undef); +ok($charinfo{upper}, undef); +ok($charinfo{lower}, undef); +ok($charinfo{title}, undef); +ok($charinfo{block}, undef); + +%charinfo = charinfo(0x5d0); + +ok($charinfo{code}, '05D0'); +ok($charinfo{name}, 'HEBREW LETTER ALEF'); +ok($charinfo{category}, 'Lo'); +ok($charinfo{combining}, '0'); +ok($charinfo{bidi}, 'R'); +ok($charinfo{decomposition}, ''); +ok($charinfo{decimal}, ''); +ok($charinfo{digit}, ''); +ok($charinfo{numeric}, ''); +ok($charinfo{mirrored}, 'N'); +ok($charinfo{unicode10}, ''); +ok($charinfo{comment}, ''); +ok($charinfo{upper}, ''); +ok($charinfo{lower}, ''); +ok($charinfo{title}, ''); +ok($charinfo{block}, 'Hebrew'); + +use Unicode::UCD 'charblock'; + +ok(charblock(0x590), 'Hebrew'); + +%charinfo = charinfo(0xbe); + +ok($charinfo{code}, '00BE'); +ok($charinfo{name}, 'VULGAR FRACTION THREE QUARTERS'); +ok($charinfo{category}, 'No'); +ok($charinfo{combining}, '0'); +ok($charinfo{bidi}, 'ON'); +ok($charinfo{decomposition}, ' 0033 2044 0034'); +ok($charinfo{decimal}, ''); +ok($charinfo{digit}, ''); +ok($charinfo{numeric}, '3/4'); +ok($charinfo{mirrored}, 'N'); +ok($charinfo{unicode10}, 'FRACTION THREE QUARTERS'); +ok($charinfo{comment}, ''); +ok($charinfo{upper}, ''); +ok($charinfo{lower}, ''); +ok($charinfo{title}, ''); +ok($charinfo{block}, 'Latin-1 Supplement'); +