Add a simple Unicode character database interface, Unicode::UCD.
Jarkko Hietaniemi [Sat, 30 Jun 2001 15:53:22 +0000 (15:53 +0000)]
p4raw-id: //depot/perl@11046

MANIFEST
lib/Unicode/UCD.pm [new file with mode: 0644]
lib/Unicode/UCD.t [new file with mode: 0644]

index 7695b79..12b7047 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -1161,6 +1161,8 @@ lib/Time/localtime.pm             By-name interface to Perl's builtin localtime
 lib/Time/localtime.t           Test for Time::localtime
 lib/Time/tm.pm                 Internal object for Time::{gm,local}time
 lib/timelocal.pl               Perl library supporting inverse of localtime, gmtime
+lib/Unicode/UCD.pm                     Unicode character database
+lib/Unicode/UCD.t                      See if Unicode character database works
 lib/unicode/ArabLink.pl                Unicode character database
 lib/unicode/ArabLnkGrp.pl      Unicode character database
 lib/unicode/ArabShap.txt       Unicode character database
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
new file mode 100644 (file)
index 0000000..ab214bb
--- /dev/null
@@ -0,0 +1,183 @@
+package Unicode::UCD;
+
+use strict;
+use warnings;
+
+our $VERSION = v3.1.0;
+
+require Exporter;
+
+our @ISA = qw(Exporter);
+our @EXPORT_OK = qw(charinfo charblock);
+
+use Carp;
+
+=head1 NAME
+
+Unicode - Unicode character database
+
+=head1 SYNOPSIS
+
+    use Unicode::UCD 3.1.0;
+    # requires that level of the Unicode character database
+
+    use Unicode::UCD 'charinfo';
+    my %charinfo  = charinfo($codepoint);
+
+    use Unicode::UCD 'charblock';
+    my $charblock = charblock($codepoint);
+
+=head1 DESCRIPTION
+
+The Unicode module offers a simple interface to the Unicode Character
+Database.
+
+=cut
+
+my $UNICODE;
+my $BLOCKS;
+
+sub openunicode {
+    my ($rfh, @path) = @_;
+    my $f;
+    unless (defined $$rfh) {
+       for my $d (@INC) {
+           use File::Spec;
+           $f = File::Spec->catfile($d, "unicode", @path);
+           if (open($$rfh, $f)) {
+               last;
+           } else {
+               croak __PACKAGE__, ": open '$f' failed: $!\n";
+           }
+       }
+       croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
+           unless defined $rfh;
+    }
+    return $f;
+}
+
+=head2 charinfo
+
+    use Unicode::UCD 'charinfo';
+
+    my %charinfo = charinfo(0x41);
+
+charinfo() returns a hash that has the following fields as defined
+by the Unicode standard:
+
+    key
+
+    code             code point with at least four hexdigits
+    name             name of the character IN UPPER CASE
+    category         general category of the character
+    combining        classes used in the Canonical Ordering Algorithm
+    bidi             bidirectional category
+    decomposition    character decomposition mapping
+    decimal          if decimal digit this is the integer numeric value
+    digit            if digit this is the numeric value
+    numeric          if numeric is the integer or rational numeric value
+    mirrored         if mirrored in bidirectional text
+    unicode10        Unicode 1.0 name if existed and different
+    comment          ISO 10646 comment field
+    upper            uppercase equivalent mapping
+    lower            lowercase equivalent mapping
+    title            titlecase equivalent mapping
+    block            block the character belongs to (used in \p{In...})
+
+If no match is found, an empty hash is returned.
+
+The C<block> property is the same as as returned by charinfo().
+(It is not defined in the Unicode Character Database proper but
+instead in an auxiliary database.)
+
+=cut
+
+sub charinfo {
+    my $code = shift;
+    my $hexk = sprintf("%04X", $code);
+
+    openunicode(\$UNICODE, "Unicode.txt");
+    if (defined $UNICODE) {
+       use Search::Dict;
+       if (look($UNICODE, "$hexk;") >= 0) {
+           my $line = <$UNICODE>;
+           chomp $line;
+           my %prop;
+           @prop{qw(
+                    code name category
+                    combining bidi decomposition
+                    decimal digit numeric
+                    mirrored unicode10 comment
+                    upper lower title
+                   )} = split(/;/, $line, -1);
+           if ($prop{code} eq $hexk) {
+               $prop{block} = charblock($code);
+               return %prop;
+           }
+       }
+    }
+    return;
+}
+
+=head2 charbloc
+
+    use Unicode::UCD 'charblock';
+
+    my $charblock = charblock(0x41);
+
+charblock() returns the block the character belongs to, e.g.
+C<Basic Latin>.  Note that not all the character positions within all
+block are defined.
+
+The name is the same name that is used in the C<\p{In...}> construct,
+for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
+away from the names for the C<\p{In...}>.
+
+=cut
+
+my @BLOCKS;
+
+sub _charblock {
+    my ($code, $lo, $hi) = @_;
+
+    return if $lo > $hi;
+
+    my $mid = int(($lo+$hi) / 2);
+
+    if ($BLOCKS[$mid]->[0] < $code) {
+       if ($BLOCKS[$mid]->[1] >= $code) {
+           return $BLOCKS[$mid]->[2];
+       } else {
+           _charblock($code, $mid + 1, $hi);
+       }
+    } elsif ($BLOCKS[$mid]->[0] > $code) {
+       _charblock($code, $lo, $mid - 1);
+    } else {
+       return $BLOCKS[$mid]->[2];
+    }
+}
+
+sub charblock {
+    my $code = shift;
+
+    unless (@BLOCKS) {
+       if (openunicode(\$BLOCKS, "Blocks.pl")) {
+           while (<$BLOCKS>) {
+               if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
+                   push @BLOCKS, [ hex($1), hex($2), $3 ];
+               }
+           }
+           close($BLOCKS);
+       }
+    }
+
+    _charblock($code, 0, $#BLOCKS);
+}
+
+=head1 AUTHOR
+
+Jarkko Hietaniemi
+
+=cut
+
+1;
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
new file mode 100644 (file)
index 0000000..731ac8f
--- /dev/null
@@ -0,0 +1,110 @@
+use Unicode::UCD 3.1.0;
+
+use Test;
+use strict;
+
+BEGIN { plan tests => 81 };
+
+use Unicode::UCD 'charinfo';
+
+my %charinfo;
+
+%charinfo = charinfo(0x41);
+
+ok($charinfo{code},           '0041');
+ok($charinfo{name},           'LATIN CAPITAL LETTER A');
+ok($charinfo{category},       'Lu');
+ok($charinfo{combining},      '0');
+ok($charinfo{bidi},           'L');
+ok($charinfo{decomposition},  '');
+ok($charinfo{decimal},        '');
+ok($charinfo{digit},          '');
+ok($charinfo{numeric},        '');
+ok($charinfo{mirrored},       'N');
+ok($charinfo{unicode10},      '');
+ok($charinfo{comment},        '');
+ok($charinfo{upper},          '');
+ok($charinfo{lower},          '0061');
+ok($charinfo{title},          '');
+ok($charinfo{block},          'Basic Latin');
+
+%charinfo = charinfo(0x100);
+
+ok($charinfo{code},           '0100');
+ok($charinfo{name},           'LATIN CAPITAL LETTER A WITH MACRON');
+ok($charinfo{category},       'Lu');
+ok($charinfo{combining},      '0');
+ok($charinfo{bidi},           'L');
+ok($charinfo{decomposition},  '0041 0304');
+ok($charinfo{decimal},        '');
+ok($charinfo{digit},          '');
+ok($charinfo{numeric},        '');
+ok($charinfo{mirrored},       'N');
+ok($charinfo{unicode10},      'LATIN CAPITAL LETTER A MACRON');
+ok($charinfo{comment},        '');
+ok($charinfo{upper},          '');
+ok($charinfo{lower},          '0101');
+ok($charinfo{title},          '');
+ok($charinfo{block},          'Latin Extended-A');
+
+%charinfo = charinfo(0x590);
+
+ok($charinfo{code},          undef);
+ok($charinfo{name},          undef);
+ok($charinfo{category},      undef);
+ok($charinfo{combining},     undef);
+ok($charinfo{bidi},          undef);
+ok($charinfo{decomposition}, undef);
+ok($charinfo{decimal},       undef);
+ok($charinfo{digit},         undef);
+ok($charinfo{numeric},       undef);
+ok($charinfo{mirrored},      undef);
+ok($charinfo{unicode10},     undef);
+ok($charinfo{comment},       undef);
+ok($charinfo{upper},         undef);
+ok($charinfo{lower},         undef);
+ok($charinfo{title},         undef);
+ok($charinfo{block},         undef);
+
+%charinfo = charinfo(0x5d0);
+
+ok($charinfo{code},           '05D0');
+ok($charinfo{name},           'HEBREW LETTER ALEF');
+ok($charinfo{category},       'Lo');
+ok($charinfo{combining},      '0');
+ok($charinfo{bidi},           'R');
+ok($charinfo{decomposition},  '');
+ok($charinfo{decimal},        '');
+ok($charinfo{digit},          '');
+ok($charinfo{numeric},        '');
+ok($charinfo{mirrored},       'N');
+ok($charinfo{unicode10},      '');
+ok($charinfo{comment},        '');
+ok($charinfo{upper},          '');
+ok($charinfo{lower},          '');
+ok($charinfo{title},          '');
+ok($charinfo{block},          'Hebrew');
+
+use Unicode::UCD 'charblock';
+
+ok(charblock(0x590),          'Hebrew');
+
+%charinfo = charinfo(0xbe);
+
+ok($charinfo{code},           '00BE');
+ok($charinfo{name},           'VULGAR FRACTION THREE QUARTERS');
+ok($charinfo{category},       'No');
+ok($charinfo{combining},      '0');
+ok($charinfo{bidi},           'ON');
+ok($charinfo{decomposition},  '<fraction> 0033 2044 0034');
+ok($charinfo{decimal},        '');
+ok($charinfo{digit},          '');
+ok($charinfo{numeric},        '3/4');
+ok($charinfo{mirrored},       'N');
+ok($charinfo{unicode10},      'FRACTION THREE QUARTERS');
+ok($charinfo{comment},        '');
+ok($charinfo{upper},          '');
+ok($charinfo{lower},          '');
+ok($charinfo{title},          '');
+ok($charinfo{block},          'Latin-1 Supplement');
+