UCD.pm: if at first you don't succeed, croak?
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed 1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6f50a187 6our $VERSION = '3.1.0';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
11our @EXPORT_OK = qw(charinfo charblock);
12
13use Carp;
14
15=head1 NAME
16
00f2772c 17Unicode::UCD - Unicode character database
561c79ed 18
19=head1 SYNOPSIS
20
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
23
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
26
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
29
30=head1 DESCRIPTION
31
32The Unicode module offers a simple interface to the Unicode Character
33Database.
34
35=cut
36
37my $UNICODE;
38my $BLOCKS;
39
40sub openunicode {
41 my ($rfh, @path) = @_;
42 my $f;
43 unless (defined $$rfh) {
44 for my $d (@INC) {
45 use File::Spec;
46 $f = File::Spec->catfile($d, "unicode", @path);
14f14a10 47 next unless -f $f;
561c79ed 48 if (open($$rfh, $f)) {
49 last;
50 } else {
51 croak __PACKAGE__, ": open '$f' failed: $!\n";
52 }
53 }
54 croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
55 unless defined $rfh;
56 }
57 return $f;
58}
59
60=head2 charinfo
61
62 use Unicode::UCD 'charinfo';
63
64 my %charinfo = charinfo(0x41);
65
66charinfo() returns a hash that has the following fields as defined
67by the Unicode standard:
68
69 key
70
71 code code point with at least four hexdigits
72 name name of the character IN UPPER CASE
73 category general category of the character
74 combining classes used in the Canonical Ordering Algorithm
75 bidi bidirectional category
76 decomposition character decomposition mapping
77 decimal if decimal digit this is the integer numeric value
78 digit if digit this is the numeric value
79 numeric if numeric is the integer or rational numeric value
80 mirrored if mirrored in bidirectional text
81 unicode10 Unicode 1.0 name if existed and different
82 comment ISO 10646 comment field
83 upper uppercase equivalent mapping
84 lower lowercase equivalent mapping
85 title titlecase equivalent mapping
86 block block the character belongs to (used in \p{In...})
87
88If no match is found, an empty hash is returned.
89
90The C<block> property is the same as as returned by charinfo().
91(It is not defined in the Unicode Character Database proper but
92instead in an auxiliary database.)
93
94=cut
95
96sub charinfo {
97 my $code = shift;
98 my $hexk = sprintf("%04X", $code);
99
100 openunicode(\$UNICODE, "Unicode.txt");
101 if (defined $UNICODE) {
102 use Search::Dict;
103 if (look($UNICODE, "$hexk;") >= 0) {
104 my $line = <$UNICODE>;
105 chomp $line;
106 my %prop;
107 @prop{qw(
108 code name category
109 combining bidi decomposition
110 decimal digit numeric
111 mirrored unicode10 comment
112 upper lower title
113 )} = split(/;/, $line, -1);
114 if ($prop{code} eq $hexk) {
115 $prop{block} = charblock($code);
116 return %prop;
117 }
118 }
119 }
120 return;
121}
122
354a27bf 123=head2 charblock
561c79ed 124
125 use Unicode::UCD 'charblock';
126
127 my $charblock = charblock(0x41);
128
129charblock() returns the block the character belongs to, e.g.
130C<Basic Latin>. Note that not all the character positions within all
131block are defined.
132
133The name is the same name that is used in the C<\p{In...}> construct,
134for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
135away from the names for the C<\p{In...}>.
136
137=cut
138
139my @BLOCKS;
140
141sub _charblock {
142 my ($code, $lo, $hi) = @_;
143
144 return if $lo > $hi;
145
146 my $mid = int(($lo+$hi) / 2);
147
148 if ($BLOCKS[$mid]->[0] < $code) {
149 if ($BLOCKS[$mid]->[1] >= $code) {
150 return $BLOCKS[$mid]->[2];
151 } else {
152 _charblock($code, $mid + 1, $hi);
153 }
154 } elsif ($BLOCKS[$mid]->[0] > $code) {
155 _charblock($code, $lo, $mid - 1);
156 } else {
157 return $BLOCKS[$mid]->[2];
158 }
159}
160
161sub charblock {
162 my $code = shift;
163
164 unless (@BLOCKS) {
165 if (openunicode(\$BLOCKS, "Blocks.pl")) {
166 while (<$BLOCKS>) {
167 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
168 push @BLOCKS, [ hex($1), hex($2), $3 ];
169 }
170 }
171 close($BLOCKS);
172 }
173 }
174
175 _charblock($code, 0, $#BLOCKS);
176}
177
178=head1 AUTHOR
179
180Jarkko Hietaniemi
181
182=cut
183
1841;