Fixes for ext/compress
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
a452d459 6our $VERSION = '0.27';
561c79ed 7
741297c1 8use Storable qw(dclone);
9
561c79ed 10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2 14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
ea508aee 18 general_categories bidi_types
b08cd201 19 compexcl
a2bd7410 20 casefold casespec
21 namedseq);
561c79ed 22
23use Carp;
24
25=head1 NAME
26
55d7b906 27Unicode::UCD - Unicode character database
561c79ed 28
29=head1 SYNOPSIS
30
55d7b906 31 use Unicode::UCD 'charinfo';
b08cd201 32 my $charinfo = charinfo($codepoint);
561c79ed 33
5d8e6e41 34 use Unicode::UCD 'casespec';
35 my $casespec = casespec(0xFB00);
36
55d7b906 37 use Unicode::UCD 'charblock';
e882dd67 38 my $charblock = charblock($codepoint);
39
55d7b906 40 use Unicode::UCD 'charscript';
65044554 41 my $charscript = charscript($codepoint);
561c79ed 42
55d7b906 43 use Unicode::UCD 'charblocks';
e145285f 44 my $charblocks = charblocks();
45
55d7b906 46 use Unicode::UCD 'charscripts';
ea508aee 47 my $charscripts = charscripts();
e145285f 48
55d7b906 49 use Unicode::UCD qw(charscript charinrange);
e145285f 50 my $range = charscript($script);
51 print "looks like $script\n" if charinrange($range, $codepoint);
52
ea508aee 53 use Unicode::UCD qw(general_categories bidi_types);
54 my $categories = general_categories();
55 my $types = bidi_types();
56
55d7b906 57 use Unicode::UCD 'compexcl';
e145285f 58 my $compexcl = compexcl($codepoint);
59
a2bd7410 60 use Unicode::UCD 'namedseq';
61 my $namedseq = namedseq($named_sequence_name);
62
55d7b906 63 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 64
561c79ed 65=head1 DESCRIPTION
66
a452d459 67The Unicode::UCD module offers a series of functions that
68provide a simple interface to the Unicode
8b731da2 69Character Database.
561c79ed 70
a452d459 71=head2 code point argument
72
73Some of the functions are called with a I<code point argument>, which is either
74a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
75followed by hexadecimals designating a Unicode code point. In other words, if
76you want a code point to be interpreted as a hexadecimal number, you must
77prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
78interpreted as a decimal code point. Also note that Unicode is B<not> limited
79to 16 bits (the number of Unicode code points is open-ended, in theory
80unlimited): you may have more than 4 hexdigits.
561c79ed 81=cut
82
10a6ecd2 83my $UNICODEFH;
84my $BLOCKSFH;
85my $SCRIPTSFH;
86my $VERSIONFH;
b08cd201 87my $COMPEXCLFH;
88my $CASEFOLDFH;
89my $CASESPECFH;
a2bd7410 90my $NAMEDSEQFH;
561c79ed 91
92sub openunicode {
93 my ($rfh, @path) = @_;
94 my $f;
95 unless (defined $$rfh) {
96 for my $d (@INC) {
97 use File::Spec;
55d7b906 98 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 99 last if open($$rfh, $f);
e882dd67 100 undef $f;
561c79ed 101 }
e882dd67 102 croak __PACKAGE__, ": failed to find ",
103 File::Spec->catfile(@path), " in @INC"
104 unless defined $f;
561c79ed 105 }
106 return $f;
107}
108
a452d459 109=head2 B<charinfo()>
561c79ed 110
55d7b906 111 use Unicode::UCD 'charinfo';
561c79ed 112
b08cd201 113 my $charinfo = charinfo(0x41);
561c79ed 114
a452d459 115This returns information about the input L</code point argument>
116as a reference to a hash of fields as defined by the Unicode
117standard. If the L</code point argument> is not assigned in the standard
118(i.e., has the general category C<Cn> meaning C<Unassigned>)
119or is a non-character (meaning it is guaranteed to never be assigned in
120the standard),
121B<undef> is returned.
122
123Fields that aren't applicable to the particular code point argument exist in the
124returned hash, and are empty.
125
126The keys in the hash with the meanings of their values are:
127
128=over
129
130=item B<code>
131
132the input L</code point argument> expressed in hexadecimal, with leading zeros
133added if necessary to make it contain at least four hexdigits
134
135=item B<name>
136
137name of I<code>, all IN UPPER CASE.
138Some control-type code points do not have names.
139This field will be empty for C<Surrogate> and C<Private Use> code points,
140and for the others without a name,
141it will contain a description enclosed in angle brackets, like
142C<E<lt>controlE<gt>>.
143
144
145=item B<category>
146
147The short name of the general category of I<code>.
148This will match one of the keys in the hash returned by L</general_categories()>.
149
150=item B<combining>
151
152the combining class number for I<code> used in the Canonical Ordering Algorithm.
153For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
154available at
155L<http://www.unicode.org/versions/Unicode5.1.0/>
156
157=item B<bidi>
158
159bidirectional type of I<code>.
160This will match one of the keys in the hash returned by L</bidi_types()>.
161
162=item B<decomposition>
163
164is empty if I<code> has no decomposition; or is one or more codes
165(separated by spaces) that taken in order represent a decomposition for
166I<code>. Each has at least four hexdigits.
167The codes may be preceded by a word enclosed in angle brackets then a space,
168like C<E<lt>compatE<gt> >, giving the type of decomposition
169
170=item B<decimal>
171
172if I<code> is a decimal digit this is its integer numeric value
173
174=item B<digit>
175
176if I<code> represents a whole number, this is its integer numeric value
177
178=item B<numeric>
179
180if I<code> represents a whole or rational number, this is its numeric value.
181Rational values are expressed as a string like C<1/4>.
182
183=item B<mirrored>
184
185C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
186
187=item B<unicode10>
188
189name of I<code> in the Unicode 1.0 standard if one
190existed for this code point and is different from the current name
191
192=item B<comment>
193
194ISO 10646 comment field.
195It appears in parentheses in the ISO 10646 names list,
196or contains an asterisk to indicate there is
197a note for this code point in Annex P of that standard.
198
199=item B<upper>
200
201is empty if there is no single code point uppercase mapping for I<code>;
202otherwise it is that mapping expressed as at least four hexdigits.
203(L</casespec()> should be used in addition to B<charinfo()>
204for case mappings when the calling program can cope with multiple code point
205mappings.)
206
207=item B<lower>
208
209is empty if there is no single code point lowercase mapping for I<code>;
210otherwise it is that mapping expressed as at least four hexdigits.
211(L</casespec()> should be used in addition to B<charinfo()>
212for case mappings when the calling program can cope with multiple code point
213mappings.)
214
215=item B<title>
216
217is empty if there is no single code point titlecase mapping for I<code>;
218otherwise it is that mapping expressed as at least four hexdigits.
219(L</casespec()> should be used in addition to B<charinfo()>
220for case mappings when the calling program can cope with multiple code point
221mappings.)
222
223=item B<block>
224
225block I<code> belongs to (used in \p{In...}).
226See L</Blocks versus Scripts>.
227
228
229=item B<script>
230
231script I<code> belongs to.
232See L</Blocks versus Scripts>.
233
234=back
32c16050 235
236Note that you cannot do (de)composition and casing based solely on the
a452d459 237I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
238you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed 239
240=cut
241
0616d9cf 242# NB: This function is duplicated in charnames.pm
10a6ecd2 243sub _getcode {
244 my $arg = shift;
245
dc0a4417 246 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 247 return $arg;
dc0a4417 248 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2 249 return hex($1);
250 }
251
252 return;
253}
254
ac5ea531 255# Lingua::KO::Hangul::Util not part of the standard distribution
256# but it will be used if available.
257
258eval { require Lingua::KO::Hangul::Util };
259my $hasHangulUtil = ! $@;
260if ($hasHangulUtil) {
261 Lingua::KO::Hangul::Util->import();
262}
9087a70b 263
264sub hangul_decomp { # internal: called from charinfo
ac5ea531 265 if ($hasHangulUtil) {
266 my @tmp = decomposeHangul(shift);
267 return sprintf("%04X %04X", @tmp) if @tmp == 2;
268 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
269 }
270 return;
271}
272
273sub hangul_charname { # internal: called from charinfo
274 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 275}
276
9087a70b 277sub han_charname { # internal: called from charinfo
278 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 279}
280
5d8e6e41 281# Overwritten by data in file
324f9e44 282my %first_last = (
283 'CJK Ideograph Extension A' => [ 0x3400, 0x4DB5 ],
284 'CJK Ideograph' => [ 0x4E00, 0x9FA5 ],
285 'CJK Ideograph Extension B' => [ 0x20000, 0x2A6D6 ],
286);
287
288get_charinfo_ranges();
289
290sub get_charinfo_ranges {
291 my @blocks = keys %first_last;
292
293 my $fh;
294 openunicode( \$fh, 'UnicodeData.txt' );
295 if( defined $fh ){
296 while( my $line = <$fh> ){
297 next unless $line =~ /(?:First|Last)/;
298 if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
299 my ($number,$block,$type);
300 ($number,$block) = split /;/, $line;
301 $block =~ s/<|>//g;
302 ($block,$type) = split /, /, $block;
303 my $index = $type eq 'First' ? 0 : 1;
304 $first_last{ $block }->[$index] = hex $number;
305 }
306 }
307 }
308}
309
a6fa416b 310my @CharinfoRanges = (
311# block name
312# [ first, last, coderef to name, coderef to decompose ],
313# CJK Ideographs Extension A
324f9e44 314 [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ],
a6fa416b 315# CJK Ideographs
324f9e44 316 [ @{ $first_last{'CJK Ideograph'} }, \&han_charname, undef ],
a6fa416b 317# Hangul Syllables
ac5ea531 318 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 319# Non-Private Use High Surrogates
320 [ 0xD800, 0xDB7F, undef, undef ],
321# Private Use High Surrogates
322 [ 0xDB80, 0xDBFF, undef, undef ],
323# Low Surrogates
324 [ 0xDC00, 0xDFFF, undef, undef ],
325# The Private Use Area
326 [ 0xE000, 0xF8FF, undef, undef ],
327# CJK Ideographs Extension B
324f9e44 328 [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ],
a6fa416b 329# Plane 15 Private Use Area
330 [ 0xF0000, 0xFFFFD, undef, undef ],
331# Plane 16 Private Use Area
332 [ 0x100000, 0x10FFFD, undef, undef ],
333);
334
561c79ed 335sub charinfo {
10a6ecd2 336 my $arg = shift;
337 my $code = _getcode($arg);
338 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
339 unless defined $code;
e63dbbf9 340 my $hexk = sprintf("%06X", $code);
a6fa416b 341 my($rcode,$rname,$rdec);
342 foreach my $range (@CharinfoRanges){
74f8133e 343 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 344 $rcode = $hexk;
e63dbbf9 345 $rcode =~ s/^0+//;
346 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 347 $rname = $range->[2] ? $range->[2]->($code) : '';
348 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 349 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 350 last;
351 }
352 }
551b6b6f 353 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 354 if (defined $UNICODEFH) {
e63dbbf9 355 use Search::Dict 1.02;
356 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 357 my $line = <$UNICODEFH>;
c5a29f40 358 return unless defined $line;
561c79ed 359 chomp $line;
360 my %prop;
361 @prop{qw(
362 code name category
363 combining bidi decomposition
364 decimal digit numeric
365 mirrored unicode10 comment
366 upper lower title
367 )} = split(/;/, $line, -1);
e63dbbf9 368 $hexk =~ s/^0+//;
369 $hexk = sprintf("%04X", hex($hexk));
561c79ed 370 if ($prop{code} eq $hexk) {
a196fbfd 371 $prop{block} = charblock($code);
372 $prop{script} = charscript($code);
a6fa416b 373 if(defined $rname){
374 $prop{code} = $rcode;
375 $prop{name} = $rname;
376 $prop{decomposition} = $rdec;
377 }
b08cd201 378 return \%prop;
561c79ed 379 }
380 }
381 }
382 return;
383}
384
e882dd67 385sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
386 my ($table, $lo, $hi, $code) = @_;
387
388 return if $lo > $hi;
389
390 my $mid = int(($lo+$hi) / 2);
391
392 if ($table->[$mid]->[0] < $code) {
10a6ecd2 393 if ($table->[$mid]->[1] >= $code) {
e882dd67 394 return $table->[$mid]->[2];
395 } else {
396 _search($table, $mid + 1, $hi, $code);
397 }
398 } elsif ($table->[$mid]->[0] > $code) {
399 _search($table, $lo, $mid - 1, $code);
400 } else {
401 return $table->[$mid]->[2];
402 }
403}
404
10a6ecd2 405sub charinrange {
406 my ($range, $arg) = @_;
407 my $code = _getcode($arg);
408 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
409 unless defined $code;
410 _search($range, 0, $#$range, $code);
411}
412
a452d459 413=head2 B<charblock()>
561c79ed 414
55d7b906 415 use Unicode::UCD 'charblock';
561c79ed 416
417 my $charblock = charblock(0x41);
10a6ecd2 418 my $charblock = charblock(1234);
a452d459 419 my $charblock = charblock(0x263a);
10a6ecd2 420 my $charblock = charblock("U+263a");
421
78bf21c2 422 my $range = charblock('Armenian');
10a6ecd2 423
a452d459 424With a L</code point argument> charblock() returns the I<block> the code point
425belongs to, e.g. C<Basic Latin>.
426If the code point is unassigned, this returns the block it would belong to if
427it were assigned (which it may in future versions of the Unicode Standard).
10a6ecd2 428
78bf21c2 429See also L</Blocks versus Scripts>.
430
eb0cc9e3 431If supplied with an argument that can't be a code point, charblock() tries
a452d459 432to do the opposite and interpret the argument as a code point block. The
eb0cc9e3 433return value is a I<range>: an anonymous list of lists that contain
a2bd7410 434I<start-of-range>, I<end-of-range> code point pairs. You can test whether
a452d459 435a code point is in a range using the L</charinrange()> function. If the
436argument is not a known code point block, B<undef> is returned.
561c79ed 437
561c79ed 438=cut
439
440my @BLOCKS;
10a6ecd2 441my %BLOCKS;
561c79ed 442
10a6ecd2 443sub _charblocks {
561c79ed 444 unless (@BLOCKS) {
10a6ecd2 445 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 446 local $_;
10a6ecd2 447 while (<$BLOCKSFH>) {
2796c109 448 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 449 my ($lo, $hi) = (hex($1), hex($2));
450 my $subrange = [ $lo, $hi, $3 ];
451 push @BLOCKS, $subrange;
452 push @{$BLOCKS{$3}}, $subrange;
561c79ed 453 }
454 }
10a6ecd2 455 close($BLOCKSFH);
561c79ed 456 }
457 }
10a6ecd2 458}
459
460sub charblock {
461 my $arg = shift;
462
463 _charblocks() unless @BLOCKS;
464
465 my $code = _getcode($arg);
561c79ed 466
10a6ecd2 467 if (defined $code) {
468 _search(\@BLOCKS, 0, $#BLOCKS, $code);
469 } else {
470 if (exists $BLOCKS{$arg}) {
741297c1 471 return dclone $BLOCKS{$arg};
10a6ecd2 472 } else {
473 return;
474 }
475 }
e882dd67 476}
477
a452d459 478=head2 B<charscript()>
e882dd67 479
55d7b906 480 use Unicode::UCD 'charscript';
e882dd67 481
482 my $charscript = charscript(0x41);
10a6ecd2 483 my $charscript = charscript(1234);
484 my $charscript = charscript("U+263a");
e882dd67 485
78bf21c2 486 my $range = charscript('Thai');
10a6ecd2 487
a452d459 488With a L</code point argument> charscript() returns the I<script> the
489code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
490If the code point is unassigned, it returns B<undef>
78bf21c2 491
eb0cc9e3 492If supplied with an argument that can't be a code point, charscript() tries
a452d459 493to do the opposite and interpret the argument as a code point script. The
eb0cc9e3 494return value is a I<range>: an anonymous list of lists that contain
495I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a452d459 496code point is in a range using the L</charinrange()> function. If the
497argument is not a known code point script, B<undef> is returned.
498
499See also L</Blocks versus Scripts>.
e882dd67 500
e882dd67 501=cut
502
503my @SCRIPTS;
10a6ecd2 504my %SCRIPTS;
e882dd67 505
10a6ecd2 506sub _charscripts {
e882dd67 507 unless (@SCRIPTS) {
10a6ecd2 508 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 509 local $_;
10a6ecd2 510 while (<$SCRIPTSFH>) {
e882dd67 511 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 512 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
513 my $script = lc($3);
514 $script =~ s/\b(\w)/uc($1)/ge;
515 my $subrange = [ $lo, $hi, $script ];
516 push @SCRIPTS, $subrange;
517 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 518 }
519 }
10a6ecd2 520 close($SCRIPTSFH);
e882dd67 521 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
522 }
523 }
10a6ecd2 524}
525
526sub charscript {
527 my $arg = shift;
528
529 _charscripts() unless @SCRIPTS;
e882dd67 530
10a6ecd2 531 my $code = _getcode($arg);
532
533 if (defined $code) {
534 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
535 } else {
536 if (exists $SCRIPTS{$arg}) {
741297c1 537 return dclone $SCRIPTS{$arg};
10a6ecd2 538 } else {
539 return;
540 }
541 }
542}
543
a452d459 544=head2 B<charblocks()>
10a6ecd2 545
55d7b906 546 use Unicode::UCD 'charblocks';
10a6ecd2 547
b08cd201 548 my $charblocks = charblocks();
10a6ecd2 549
b08cd201 550charblocks() returns a reference to a hash with the known block names
a452d459 551as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 552
78bf21c2 553See also L</Blocks versus Scripts>.
554
10a6ecd2 555=cut
556
557sub charblocks {
b08cd201 558 _charblocks() unless %BLOCKS;
741297c1 559 return dclone \%BLOCKS;
10a6ecd2 560}
561
a452d459 562=head2 B<charscripts()>
10a6ecd2 563
55d7b906 564 use Unicode::UCD 'charscripts';
10a6ecd2 565
ea508aee 566 my $charscripts = charscripts();
10a6ecd2 567
ea508aee 568charscripts() returns a reference to a hash with the known script
a452d459 569names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 570the values.
10a6ecd2 571
78bf21c2 572See also L</Blocks versus Scripts>.
573
10a6ecd2 574=cut
575
576sub charscripts {
b08cd201 577 _charscripts() unless %SCRIPTS;
741297c1 578 return dclone \%SCRIPTS;
561c79ed 579}
580
a452d459 581=head2 B<charinrange()>
10a6ecd2 582
583In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
584can also test whether a code point is in the I<range> as returned by
a452d459 585L</charblock()> and L</charscript()> or as the values of the hash returned
586by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 587
55d7b906 588 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 589
590 $range = charscript('Hiragana');
e145285f 591 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 592
593=cut
594
ea508aee 595my %GENERAL_CATEGORIES =
596 (
597 'L' => 'Letter',
598 'LC' => 'CasedLetter',
599 'Lu' => 'UppercaseLetter',
600 'Ll' => 'LowercaseLetter',
601 'Lt' => 'TitlecaseLetter',
602 'Lm' => 'ModifierLetter',
603 'Lo' => 'OtherLetter',
604 'M' => 'Mark',
605 'Mn' => 'NonspacingMark',
606 'Mc' => 'SpacingMark',
607 'Me' => 'EnclosingMark',
608 'N' => 'Number',
609 'Nd' => 'DecimalNumber',
610 'Nl' => 'LetterNumber',
611 'No' => 'OtherNumber',
612 'P' => 'Punctuation',
613 'Pc' => 'ConnectorPunctuation',
614 'Pd' => 'DashPunctuation',
615 'Ps' => 'OpenPunctuation',
616 'Pe' => 'ClosePunctuation',
617 'Pi' => 'InitialPunctuation',
618 'Pf' => 'FinalPunctuation',
619 'Po' => 'OtherPunctuation',
620 'S' => 'Symbol',
621 'Sm' => 'MathSymbol',
622 'Sc' => 'CurrencySymbol',
623 'Sk' => 'ModifierSymbol',
624 'So' => 'OtherSymbol',
625 'Z' => 'Separator',
626 'Zs' => 'SpaceSeparator',
627 'Zl' => 'LineSeparator',
628 'Zp' => 'ParagraphSeparator',
629 'C' => 'Other',
630 'Cc' => 'Control',
631 'Cf' => 'Format',
632 'Cs' => 'Surrogate',
633 'Co' => 'PrivateUse',
634 'Cn' => 'Unassigned',
635 );
636
637sub general_categories {
638 return dclone \%GENERAL_CATEGORIES;
639}
640
a452d459 641=head2 B<general_categories()>
ea508aee 642
643 use Unicode::UCD 'general_categories';
644
645 my $categories = general_categories();
646
a452d459 647This returns a reference to a hash which has short
ea508aee 648general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
649names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
650C<Symbol>) as values. The hash is reversible in case you need to go
651from the long names to the short names. The general category is the
a452d459 652one returned from
653L</charinfo()> under the C<category> key.
ea508aee 654
655=cut
656
657my %BIDI_TYPES =
658 (
659 'L' => 'Left-to-Right',
660 'LRE' => 'Left-to-Right Embedding',
661 'LRO' => 'Left-to-Right Override',
662 'R' => 'Right-to-Left',
663 'AL' => 'Right-to-Left Arabic',
664 'RLE' => 'Right-to-Left Embedding',
665 'RLO' => 'Right-to-Left Override',
666 'PDF' => 'Pop Directional Format',
667 'EN' => 'European Number',
668 'ES' => 'European Number Separator',
669 'ET' => 'European Number Terminator',
670 'AN' => 'Arabic Number',
671 'CS' => 'Common Number Separator',
672 'NSM' => 'Non-Spacing Mark',
673 'BN' => 'Boundary Neutral',
674 'B' => 'Paragraph Separator',
675 'S' => 'Segment Separator',
676 'WS' => 'Whitespace',
677 'ON' => 'Other Neutrals',
678 );
679
a452d459 680=head2 B<bidi_types()>
ea508aee 681
682 use Unicode::UCD 'bidi_types';
683
684 my $categories = bidi_types();
685
a452d459 686This returns a reference to a hash which has the short
ea508aee 687bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
688names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
689hash is reversible in case you need to go from the long names to the
a452d459 690short names. The bidi type is the one returned from
691L</charinfo()>
ea508aee 692under the C<bidi> key. For the exact meaning of the various bidi classes
693the Unicode TR9 is recommended reading:
a452d459 694L<http://www.unicode.org/reports/tr9/>
ea508aee 695(as of Unicode 5.0.0)
696
697=cut
698
a452d459 699sub bidi_types {
700 return dclone \%BIDI_TYPES;
701}
702
703=head2 B<compexcl()>
b08cd201 704
55d7b906 705 use Unicode::UCD 'compexcl';
b08cd201 706
a452d459 707 my $compexcl = compexcl(0x09dc);
b08cd201 708
a452d459 709This returns B<true> if the
710L</code point argument> should not be produced by composition normalization,
711B<AND> if that fact is not otherwise determinable from the Unicode data base.
712It currently does not return B<true> if the code point has a decomposition
713consisting of another single code point, nor if its decomposition starts
714with a code point whose combining class is non-zero. Code points that meet
715either of these conditions should also not be produced by composition
716normalization.
b08cd201 717
a452d459 718It returns B<false> otherwise.
b08cd201 719
720=cut
721
722my %COMPEXCL;
723
724sub _compexcl {
725 unless (%COMPEXCL) {
551b6b6f 726 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 727 local $_;
b08cd201 728 while (<$COMPEXCLFH>) {
822ebcc8 729 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201 730 my $code = hex($1);
731 $COMPEXCL{$code} = undef;
732 }
733 }
734 close($COMPEXCLFH);
735 }
736 }
737}
738
739sub compexcl {
740 my $arg = shift;
741 my $code = _getcode($arg);
74f8133e 742 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
743 unless defined $code;
b08cd201 744
745 _compexcl() unless %COMPEXCL;
746
747 return exists $COMPEXCL{$code};
748}
749
a452d459 750=head2 B<casefold()>
b08cd201 751
55d7b906 752 use Unicode::UCD 'casefold';
b08cd201 753
a452d459 754 my $casefold = casefold(0xDF);
755 if (defined $casefold) {
756 my @full_fold_hex = split / /, $casefold->{'full'};
757 my $full_fold_string =
758 join "", map {chr(hex($_))} @full_fold_hex;
759 my @turkic_fold_hex =
760 split / /, ($casefold->{'turkic'} ne "")
761 ? $casefold->{'turkic'}
762 : $casefold->{'full'};
763 my $turkic_fold_string =
764 join "", map {chr(hex($_))} @turkic_fold_hex;
765 }
766 if (defined $casefold && $casefold->{'simple'} ne "") {
767 my $simple_fold_hex = $casefold->{'simple'};
768 my $simple_fold_string = chr(hex($simple_fold_hex));
769 }
b08cd201 770
a452d459 771This returns the (almost) locale-independent case folding of the
772character specified by the L</code point argument>.
b08cd201 773
a452d459 774If there is no case folding for that code point, B<undef> is returned.
775
776If there is a case folding for that code point, a reference to a hash
b08cd201 777with the following fields is returned:
778
a452d459 779=over
780
781=item B<code>
782
783the input L</code point argument> expressed in hexadecimal, with leading zeros
784added if necessary to make it contain at least four hexdigits
785
786=item B<full>
787
788one or more codes (separated by spaces) that taken in order give the
789code points for the case folding for I<code>.
790Each has at least four hexdigits.
791
792=item B<simple>
793
794is empty, or is exactly one code with at least four hexdigits which can be used
795as an alternative case folding when the calling program cannot cope with the
796fold being a sequence of multiple code points. If I<full> is just one code
797point, then I<simple> equals I<full>. If there is no single code point folding
798defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
799inferior, but still better-than-nothing alternative folding to I<full>.
800
801=item B<mapping>
802
803is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
804otherwise. It can be considered to be the simplest possible folding for
805I<code>. It is defined primarily for backwards compatibility.
806
807=item B<status>
b08cd201 808
a452d459 809is C<C> (for C<common>) if the best possible fold is a single code point
810(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
811folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
812there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty). Note
813that this
814describes the contents of I<mapping>. It is defined primarily for backwards
815compatibility.
b08cd201 816
a452d459 817On versions 3.1 and earlier of Unicode, I<status> can also be
818C<I> which is the same as C<C> but is a special case for dotted uppercase I and
819dotless lowercase i:
b08cd201 820
a452d459 821=over
b08cd201 822
a452d459 823=item B<*>
824
825If you use this C<I> mapping, the result is case-insensitive,
826but dotless and dotted I's are not distinguished
827
828=item B<*>
829
830If you exclude this C<I> mapping, the result is not fully case-insensitive, but
831dotless and dotted I's are distinguished
832
833=back
834
835=item B<turkic>
836
837contains any special folding for Turkic languages. For versions of Unicode
838starting with 3.2, this field is empty unless I<code> has a different folding
839in Turkic languages, in which case it is one or more codes (separated by
840spaces) that taken in order give the code points for the case folding for
841I<code> in those languages.
842Each code has at least four hexdigits.
843Note that this folding does not maintain canonical equivalence without
844additional processing.
845
846For versions of Unicode 3.1 and earlier, this field is empty unless there is a
847special folding for Turkic languages, in which case I<status> is C<I>, and
848I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
849
850=back
851
852Programs that want complete generality and the best folding results should use
853the folding contained in the I<full> field. But note that the fold for some
854code points will be a sequence of multiple code points.
855
856Programs that can't cope with the fold mapping being multiple code points can
857use the folding contained in the I<simple> field, with the loss of some
858generality. In Unicode 5.1, about 7% of the defined foldings have no single
859code point folding.
860
861The I<mapping> and I<status> fields are provided for backwards compatibility for
862existing programs. They contain the same values as in previous versions of
863this function.
864
865Locale is not completely independent. The I<turkic> field contains results to
866use when the locale is a Turkic language.
b08cd201 867
868For more information about case mappings see
a452d459 869L<http://www.unicode.org/unicode/reports/tr21>
b08cd201 870
871=cut
872
873my %CASEFOLD;
874
875sub _casefold {
876 unless (%CASEFOLD) {
551b6b6f 877 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 878 local $_;
b08cd201 879 while (<$CASEFOLDFH>) {
a452d459 880 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
b08cd201 881 my $code = hex($1);
a452d459 882 $CASEFOLD{$code}{'code'} = $1;
883 $CASEFOLD{$code}{'turkic'} = "" unless
884 defined $CASEFOLD{$code}{'turkic'};
885 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
886 # earlier Unicodes
887 # Both entries there (I
888 # only checked 3.1) are
889 # the same as C, and
890 # there are no other
891 # entries for those
892 # codepoints, so treat
893 # as if C, but override
894 # the turkic one for
895 # 'I'.
896 $CASEFOLD{$code}{'status'} = $2;
897 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
898 $CASEFOLD{$code}{'mapping'} = $3;
899 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
900 } elsif ($2 eq 'F') {
901 $CASEFOLD{$code}{'full'} = $3;
902 unless (defined $CASEFOLD{$code}{'simple'}) {
903 $CASEFOLD{$code}{'simple'} = "";
904 $CASEFOLD{$code}{'mapping'} = $3;
905 $CASEFOLD{$code}{'status'} = $2;
906 }
907 } elsif ($2 eq 'S') {
908
909
910 # There can't be a simple without a full, and simple
911 # overrides all but full
912
913 $CASEFOLD{$code}{'simple'} = $3;
914 $CASEFOLD{$code}{'mapping'} = $3;
915 $CASEFOLD{$code}{'status'} = $2;
916 } elsif ($2 eq 'T') {
917 $CASEFOLD{$code}{'turkic'} = $3;
918 } # else can't happen because only [CIFST] are possible
b08cd201 919 }
920 }
921 close($CASEFOLDFH);
922 }
923 }
924}
925
926sub casefold {
927 my $arg = shift;
928 my $code = _getcode($arg);
74f8133e 929 croak __PACKAGE__, "::casefold: unknown code '$arg'"
930 unless defined $code;
b08cd201 931
932 _casefold() unless %CASEFOLD;
933
934 return $CASEFOLD{$code};
935}
936
a452d459 937=head2 B<casespec()>
b08cd201 938
55d7b906 939 use Unicode::UCD 'casespec';
b08cd201 940
a452d459 941 my $casespec = casespec(0xFB00);
b08cd201 942
a452d459 943This returns the potentially locale-dependent case mappings of the L</code point
944argument>. The mappings may be longer than a single code point (which the basic
945Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 946
a452d459 947If there are no case mappings for the L</code point argument>, or if all three
948possible mappings (I<lower>, I<title> and I<upper>) result in single code
5d8e6e41 949points and are locale independent and unconditional, B<undef> is returned
950(which means that the case mappings, if any, for the code point are those
951returned by L</charinfo()>).
a452d459 952
953Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41 954of such hashes, explained below) is returned with the following keys and their
955meanings:
a452d459 956
957The keys in the bottom layer hash with the meanings of their values are:
958
959=over
960
961=item B<code>
962
963the input L</code point argument> expressed in hexadecimal, with leading zeros
964added if necessary to make it contain at least four hexdigits
965
966=item B<lower>
967
968one or more codes (separated by spaces) that taken in order give the
969code points for the lower case of I<code>.
970Each has at least four hexdigits.
971
972=item B<title>
b08cd201 973
a452d459 974one or more codes (separated by spaces) that taken in order give the
975code points for the title case of I<code>.
976Each has at least four hexdigits.
b08cd201 977
a452d459 978=item B<lower>
b08cd201 979
a452d459 980one or more codes (separated by spaces) that taken in order give the
981code points for the upper case of I<code>.
982Each has at least four hexdigits.
983
984=item B<condition>
985
986the conditions for the mappings to be valid.
987If B<undef>, the mappings are always valid.
988When defined, this field is a list of conditions,
989all of which must be true for the mappings to be valid.
990The list consists of one or more
991I<locales> (see below)
992and/or I<contexts> (explained in the next paragraph),
993separated by spaces.
994(Other than as used to separate elements, spaces are to be ignored.)
995Case distinctions in the condition list are not significant.
82c0b05b 996Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 997
a452d459 998A I<context> is one of those defined in the Unicode standard.
999For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1000available at
5d8e6e41 1001L<http://www.unicode.org/versions/Unicode5.1.0/>.
1002These are for context-sensitive casing.
f499c386 1003
a452d459 1004=back
1005
5d8e6e41 1006The hash described above is returned for locale-independent casing, where
1007at least one of the mappings has length longer than one. If B<undef> is
1008returned, the code point may have mappings, but if so, all are length one,
1009and are returned by L</charinfo()>.
1010Note that when this function does return a value, it will be for the complete
1011set of mappings for a code point, even those whose length is one.
1012
1013If there are additional casing rules that apply only in certain locales,
1014an additional key for each will be defined in the returned hash. Each such key
1015will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1016followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1017and a variant code). You can find the lists of all possible locales, see
1018L<Locale::Country> and L<Locale::Language>.
a452d459 1019(In Unicode 5.1, the only locales returned by this function
1020are C<lt>, C<tr>, and C<az>.)
b08cd201 1021
5d8e6e41 1022Each locale key is a reference to a hash that has the form above, and gives
1023the casing rules for that particular locale, which take precedence over the
1024locale-independent ones when in that locale.
1025
1026If the only casing for a code point is locale-dependent, then the returned
1027hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1028will contain only locale keys.
1029
b08cd201 1030For more information about case mappings see
a452d459 1031L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201 1032
1033=cut
1034
1035my %CASESPEC;
1036
1037sub _casespec {
1038 unless (%CASESPEC) {
551b6b6f 1039 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 1040 local $_;
b08cd201 1041 while (<$CASESPECFH>) {
1042 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 1043 my ($hexcode, $lower, $title, $upper, $condition) =
1044 ($1, $2, $3, $4, $5);
1045 my $code = hex($hexcode);
1046 if (exists $CASESPEC{$code}) {
1047 if (exists $CASESPEC{$code}->{code}) {
1048 my ($oldlower,
1049 $oldtitle,
1050 $oldupper,
1051 $oldcondition) =
1052 @{$CASESPEC{$code}}{qw(lower
1053 title
1054 upper
1055 condition)};
822ebcc8 1056 if (defined $oldcondition) {
1057 my ($oldlocale) =
f499c386 1058 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386 1059 delete $CASESPEC{$code};
1060 $CASESPEC{$code}->{$oldlocale} =
1061 { code => $hexcode,
1062 lower => $oldlower,
1063 title => $oldtitle,
1064 upper => $oldupper,
1065 condition => $oldcondition };
f499c386 1066 }
1067 }
1068 my ($locale) =
1069 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1070 $CASESPEC{$code}->{$locale} =
1071 { code => $hexcode,
1072 lower => $lower,
1073 title => $title,
1074 upper => $upper,
1075 condition => $condition };
1076 } else {
1077 $CASESPEC{$code} =
1078 { code => $hexcode,
1079 lower => $lower,
1080 title => $title,
1081 upper => $upper,
1082 condition => $condition };
1083 }
b08cd201 1084 }
1085 }
1086 close($CASESPECFH);
1087 }
1088 }
1089}
1090
1091sub casespec {
1092 my $arg = shift;
1093 my $code = _getcode($arg);
74f8133e 1094 croak __PACKAGE__, "::casespec: unknown code '$arg'"
1095 unless defined $code;
b08cd201 1096
1097 _casespec() unless %CASESPEC;
1098
741297c1 1099 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201 1100}
1101
a452d459 1102=head2 B<namedseq()>
a2bd7410 1103
1104 use Unicode::UCD 'namedseq';
1105
1106 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1107 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1108 my %namedseq = namedseq();
1109
1110If used with a single argument in a scalar context, returns the string
a452d459 1111consisting of the code points of the named sequence, or B<undef> if no
a2bd7410 1112named sequence by that name exists. If used with a single argument in
a452d459 1113a list context, it returns the list of the code points. If used with no
a2bd7410 1114arguments in a list context, returns a hash with the names of the
1115named sequences as the keys and the named sequences as strings as
a452d459 1116the values. Otherwise, it returns B<undef> or an empty list depending
a2bd7410 1117on the context.
1118
a452d459 1119This function only operates on officially approved (not provisional) named
1120sequences.
a2bd7410 1121
1122=cut
1123
1124my %NAMEDSEQ;
1125
1126sub _namedseq {
1127 unless (%NAMEDSEQ) {
1128 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
1129 local $_;
1130 while (<$NAMEDSEQFH>) {
1131 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
1132 my ($n, $s) = ($1, $2);
1133 my @s = map { chr(hex($_)) } split(' ', $s);
1134 $NAMEDSEQ{$n} = join("", @s);
1135 }
1136 }
1137 close($NAMEDSEQFH);
1138 }
1139 }
1140}
1141
1142sub namedseq {
1143 _namedseq() unless %NAMEDSEQ;
1144 my $wantarray = wantarray();
1145 if (defined $wantarray) {
1146 if ($wantarray) {
1147 if (@_ == 0) {
1148 return %NAMEDSEQ;
1149 } elsif (@_ == 1) {
1150 my $s = $NAMEDSEQ{ $_[0] };
1151 return defined $s ? map { ord($_) } split('', $s) : ();
1152 }
1153 } elsif (@_ == 1) {
1154 return $NAMEDSEQ{ $_[0] };
1155 }
1156 }
1157 return;
1158}
1159
55d7b906 1160=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 1161
a452d459 1162This returns the version of the Unicode Character Database, in other words, the
1163version of the Unicode standard the database implements. The version is a
1164string of numbers delimited by dots (C<'.'>).
10a6ecd2 1165
1166=cut
1167
1168my $UNICODEVERSION;
1169
1170sub UnicodeVersion {
1171 unless (defined $UNICODEVERSION) {
1172 openunicode(\$VERSIONFH, "version");
1173 chomp($UNICODEVERSION = <$VERSIONFH>);
1174 close($VERSIONFH);
1175 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1176 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1177 }
1178 return $UNICODEVERSION;
1179}
3aa957f9 1180
a452d459 1181=head2 B<Blocks versus Scripts>
1182
1183The difference between a block and a script is that scripts are closer
1184to the linguistic notion of a set of code points required to present
1185languages, while block is more of an artifact of the Unicode code point
1186numbering and separation into blocks of (mostly) 256 code points.
1187
1188For example the Latin B<script> is spread over several B<blocks>, such
1189as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1190C<Latin Extended-B>. On the other hand, the Latin script does not
1191contain all the characters of the C<Basic Latin> block (also known as
1192ASCII): it includes only the letters, and not, for example, the digits
1193or the punctuation.
1194
1195For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1196
1197For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1198
1199=head2 B<Matching Scripts and Blocks>
1200
1201Scripts are matched with the regular-expression construct
1202C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1203while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1204any of the 256 code points in the Tibetan block).
1205
1206
3aa957f9 1207=head2 Implementation Note
32c16050 1208
ad9cab37 1209The first use of charinfo() opens a read-only filehandle to the Unicode
1210Character Database (the database is included in the Perl distribution).
78bf21c2 1211The filehandle is then kept open for further queries. In other words,
1212if you are wondering where one of your filehandles went, that's where.
32c16050 1213
8b731da2 1214=head1 BUGS
1215
1216Does not yet support EBCDIC platforms.
1217
a452d459 1218L</compexcl()> should give a complete list of excluded code points.
1219
561c79ed 1220=head1 AUTHOR
1221
1222Jarkko Hietaniemi
1223
1224=cut
1225
12261;