Add Tests for CPAN::Nox
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2 12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201 15 charinrange
16 compexcl
17 casefold casespec);
561c79ed 18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed 24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67 31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f 37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f 40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f 43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f 47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed 51=head1 DESCRIPTION
52
55d7b906 53The Unicode::UCD module offers a simple interface to the Unicode Character
561c79ed 54Database.
55
56=cut
57
10a6ecd2 58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201 62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed 65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67 76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed 79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201 89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed 91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
e882dd67 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
32c16050 115The C<block> property is the same as as returned by charinfo(). It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 123
124=cut
125
10a6ecd2 126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
9087a70b 138use Lingua::KO::Hangul::Util;
139
140sub hangul_decomp { # internal: called from charinfo
141 my @tmp = decomposeHangul(shift);
142 return
143 @tmp == 2 ? sprintf("%04X %04X", @tmp) :
144 @tmp == 3 ? sprintf("%04X %04X %04X", @tmp) :
145 undef;
a6fa416b 146}
147
9087a70b 148sub han_charname { # internal: called from charinfo
149 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 150}
151
152my @CharinfoRanges = (
153# block name
154# [ first, last, coderef to name, coderef to decompose ],
155# CJK Ideographs Extension A
156 [ 0x3400, 0x4DB5, \&han_charname, undef ],
157# CJK Ideographs
158 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
159# Hangul Syllables
9087a70b 160 [ 0xAC00, 0xD7A3, \&getHangulName, \&hangul_decomp ],
a6fa416b 161# Non-Private Use High Surrogates
162 [ 0xD800, 0xDB7F, undef, undef ],
163# Private Use High Surrogates
164 [ 0xDB80, 0xDBFF, undef, undef ],
165# Low Surrogates
166 [ 0xDC00, 0xDFFF, undef, undef ],
167# The Private Use Area
168 [ 0xE000, 0xF8FF, undef, undef ],
169# CJK Ideographs Extension B
170 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
171# Plane 15 Private Use Area
172 [ 0xF0000, 0xFFFFD, undef, undef ],
173# Plane 16 Private Use Area
174 [ 0x100000, 0x10FFFD, undef, undef ],
175);
176
561c79ed 177sub charinfo {
10a6ecd2 178 my $arg = shift;
179 my $code = _getcode($arg);
180 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
181 unless defined $code;
e63dbbf9 182 my $hexk = sprintf("%06X", $code);
a6fa416b 183 my($rcode,$rname,$rdec);
184 foreach my $range (@CharinfoRanges){
74f8133e 185 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 186 $rcode = $hexk;
e63dbbf9 187 $rcode =~ s/^0+//;
188 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 189 $rname = $range->[2] ? $range->[2]->($code) : '';
190 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 191 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 192 last;
193 }
194 }
74f8133e 195 openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2 196 if (defined $UNICODEFH) {
e63dbbf9 197 use Search::Dict 1.02;
198 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 199 my $line = <$UNICODEFH>;
561c79ed 200 chomp $line;
201 my %prop;
202 @prop{qw(
203 code name category
204 combining bidi decomposition
205 decimal digit numeric
206 mirrored unicode10 comment
207 upper lower title
208 )} = split(/;/, $line, -1);
e63dbbf9 209 $hexk =~ s/^0+//;
210 $hexk = sprintf("%04X", hex($hexk));
561c79ed 211 if ($prop{code} eq $hexk) {
a196fbfd 212 $prop{block} = charblock($code);
213 $prop{script} = charscript($code);
a6fa416b 214 if(defined $rname){
215 $prop{code} = $rcode;
216 $prop{name} = $rname;
217 $prop{decomposition} = $rdec;
218 }
b08cd201 219 return \%prop;
561c79ed 220 }
221 }
222 }
223 return;
224}
225
e882dd67 226sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
227 my ($table, $lo, $hi, $code) = @_;
228
229 return if $lo > $hi;
230
231 my $mid = int(($lo+$hi) / 2);
232
233 if ($table->[$mid]->[0] < $code) {
10a6ecd2 234 if ($table->[$mid]->[1] >= $code) {
e882dd67 235 return $table->[$mid]->[2];
236 } else {
237 _search($table, $mid + 1, $hi, $code);
238 }
239 } elsif ($table->[$mid]->[0] > $code) {
240 _search($table, $lo, $mid - 1, $code);
241 } else {
242 return $table->[$mid]->[2];
243 }
244}
245
10a6ecd2 246sub charinrange {
247 my ($range, $arg) = @_;
248 my $code = _getcode($arg);
249 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
250 unless defined $code;
251 _search($range, 0, $#$range, $code);
252}
253
354a27bf 254=head2 charblock
561c79ed 255
55d7b906 256 use Unicode::UCD 'charblock';
561c79ed 257
258 my $charblock = charblock(0x41);
10a6ecd2 259 my $charblock = charblock(1234);
260 my $charblock = charblock("0x263a");
261 my $charblock = charblock("U+263a");
262
78bf21c2 263 my $range = charblock('Armenian');
10a6ecd2 264
78bf21c2 265With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 266belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 267positions within all blocks are defined.
10a6ecd2 268
78bf21c2 269See also L</Blocks versus Scripts>.
270
10a6ecd2 271If supplied with an argument that can't be a code point, charblock()
272tries to do the opposite and interpret the argument as a character
273block. The return value is a I<range>: an anonymous list that
274contains anonymous lists, which in turn contain I<start-of-range>,
275I<end-of-range> code point pairs. You can test whether a code point
276is in a range using the L</charinrange> function. If the argument is
277not a known charater block, C<undef> is returned.
561c79ed 278
561c79ed 279=cut
280
281my @BLOCKS;
10a6ecd2 282my %BLOCKS;
561c79ed 283
10a6ecd2 284sub _charblocks {
561c79ed 285 unless (@BLOCKS) {
10a6ecd2 286 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
287 while (<$BLOCKSFH>) {
2796c109 288 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 289 my ($lo, $hi) = (hex($1), hex($2));
290 my $subrange = [ $lo, $hi, $3 ];
291 push @BLOCKS, $subrange;
292 push @{$BLOCKS{$3}}, $subrange;
561c79ed 293 }
294 }
10a6ecd2 295 close($BLOCKSFH);
561c79ed 296 }
297 }
10a6ecd2 298}
299
300sub charblock {
301 my $arg = shift;
302
303 _charblocks() unless @BLOCKS;
304
305 my $code = _getcode($arg);
561c79ed 306
10a6ecd2 307 if (defined $code) {
308 _search(\@BLOCKS, 0, $#BLOCKS, $code);
309 } else {
310 if (exists $BLOCKS{$arg}) {
311 return $BLOCKS{$arg};
312 } else {
313 return;
314 }
315 }
e882dd67 316}
317
318=head2 charscript
319
55d7b906 320 use Unicode::UCD 'charscript';
e882dd67 321
322 my $charscript = charscript(0x41);
10a6ecd2 323 my $charscript = charscript(1234);
324 my $charscript = charscript("U+263a");
e882dd67 325
78bf21c2 326 my $range = charscript('Thai');
10a6ecd2 327
78bf21c2 328With a B<code point argument> charscript() returns the I<script> the
b08cd201 329character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 330
78bf21c2 331See also L</Blocks versus Scripts>.
332
10a6ecd2 333If supplied with an argument that can't be a code point, charscript()
334tries to do the opposite and interpret the argument as a character
335script. The return value is a I<range>: an anonymous list that
336contains anonymous lists, which in turn contain I<start-of-range>,
337I<end-of-range> code point pairs. You can test whether a code point
338is in a range using the L</charinrange> function. If the argument is
339not a known charater script, C<undef> is returned.
e882dd67 340
e882dd67 341=cut
342
343my @SCRIPTS;
10a6ecd2 344my %SCRIPTS;
e882dd67 345
10a6ecd2 346sub _charscripts {
e882dd67 347 unless (@SCRIPTS) {
10a6ecd2 348 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
349 while (<$SCRIPTSFH>) {
e882dd67 350 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 351 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
352 my $script = lc($3);
353 $script =~ s/\b(\w)/uc($1)/ge;
354 my $subrange = [ $lo, $hi, $script ];
355 push @SCRIPTS, $subrange;
356 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 357 }
358 }
10a6ecd2 359 close($SCRIPTSFH);
e882dd67 360 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
361 }
362 }
10a6ecd2 363}
364
365sub charscript {
366 my $arg = shift;
367
368 _charscripts() unless @SCRIPTS;
e882dd67 369
10a6ecd2 370 my $code = _getcode($arg);
371
372 if (defined $code) {
373 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
374 } else {
375 if (exists $SCRIPTS{$arg}) {
376 return $SCRIPTS{$arg};
377 } else {
378 return;
379 }
380 }
381}
382
383=head2 charblocks
384
55d7b906 385 use Unicode::UCD 'charblocks';
10a6ecd2 386
b08cd201 387 my $charblocks = charblocks();
10a6ecd2 388
b08cd201 389charblocks() returns a reference to a hash with the known block names
390as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 391
78bf21c2 392See also L</Blocks versus Scripts>.
393
10a6ecd2 394=cut
395
396sub charblocks {
b08cd201 397 _charblocks() unless %BLOCKS;
398 return \%BLOCKS;
10a6ecd2 399}
400
401=head2 charscripts
402
55d7b906 403 use Unicode::UCD 'charscripts';
10a6ecd2 404
405 my %charscripts = charscripts();
406
407charscripts() returns a hash with the known script names as the keys,
408and the code point ranges (see L</charscript>) as the values.
409
78bf21c2 410See also L</Blocks versus Scripts>.
411
10a6ecd2 412=cut
413
414sub charscripts {
b08cd201 415 _charscripts() unless %SCRIPTS;
416 return \%SCRIPTS;
561c79ed 417}
418
10a6ecd2 419=head2 Blocks versus Scripts
ad9cab37 420
10a6ecd2 421The difference between a block and a script is that scripts are closer
422to the linguistic notion of a set of characters required to present
423languages, while block is more of an artifact of the Unicode character
424numbering and separation into blocks of 256 characters.
3aa957f9 425
426For example the Latin B<script> is spread over several B<blocks>, such
427as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
428C<Latin Extended-B>. On the other hand, the Latin script does not
429contain all the characters of the C<Basic Latin> block (also known as
430the ASCII): it includes only the letters, not for example the digits
431or the punctuation.
ad9cab37 432
3aa957f9 433For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 434
435For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
436
3aa957f9 437=head2 Matching Scripts and Blocks
438
439Both scripts and blocks can be matched using the regular expression
440construct C<\p{In...}> and its negation C<\P{In...}>.
441
442The name of the script or the block comes after the C<In>, for example
443C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2 444removed from the names for the C<\p{In...}>, for example
445C<LatinExtendedA> instead of C<Latin Extended-A>.
446
78bf21c2 447There are a few cases where there is both a script and a block by the
448same name, in these cases the block version has C<Block> appended to
449its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
450the block.
10a6ecd2 451
b08cd201 452=head2 Code Point Arguments
453
78bf21c2 454A <code point argument> is either a decimal or a hexadecimal scalar
455designating a Unicode character, or "U+" followed by hexadecimals
456designating a Unicode character. Note that Unicode is B<not> limited
457to 16 bits (the number of Unicode characters is open-ended, in theory
458unlimited): you may have more than 4 hexdigits.
b08cd201 459
10a6ecd2 460=head2 charinrange
461
462In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
463can also test whether a code point is in the I<range> as returned by
464L</charblock> and L</charscript> or as the values of the hash returned
e618509d 465by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 466
55d7b906 467 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 468
469 $range = charscript('Hiragana');
e145285f 470 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 471
472=cut
473
b08cd201 474=head2 compexcl
475
55d7b906 476 use Unicode::UCD 'compexcl';
b08cd201 477
478 my $compexcl = compexcl("09dc");
479
480The compexcl() returns the composition exclusion (that is, if the
9046a8ae 481character should not be produced during a precomposition) of the
482character specified by a B<code point argument>.
b08cd201 483
484If there is a composition exclusion for the character, true is
485returned. Otherwise, false is returned.
486
487=cut
488
489my %COMPEXCL;
490
491sub _compexcl {
492 unless (%COMPEXCL) {
493 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
494 while (<$COMPEXCLFH>) {
495 if (/^([0-9A-F]+) \# /) {
496 my $code = hex($1);
497 $COMPEXCL{$code} = undef;
498 }
499 }
500 close($COMPEXCLFH);
501 }
502 }
503}
504
505sub compexcl {
506 my $arg = shift;
507 my $code = _getcode($arg);
74f8133e 508 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
509 unless defined $code;
b08cd201 510
511 _compexcl() unless %COMPEXCL;
512
513 return exists $COMPEXCL{$code};
514}
515
516=head2 casefold
517
55d7b906 518 use Unicode::UCD 'casefold';
b08cd201 519
520 my %casefold = casefold("09dc");
521
522The casefold() returns the locale-independent case folding of the
523character specified by a B<code point argument>.
524
525If there is a case folding for that character, a reference to a hash
526with the following fields is returned:
527
528 key
529
530 code code point with at least four hexdigits
531 status "C", "F", "S", or "I"
532 mapping one or more codes separated by spaces
533
534The meaning of the I<status> is as follows:
535
536 C common case folding, common mappings shared
537 by both simple and full mappings
538 F full case folding, mappings that cause strings
539 to grow in length. Multiple characters are separated
540 by spaces
541 S simple case folding, mappings to single characters
542 where different from F
543 I special case for dotted uppercase I and
544 dotless lowercase i
545 - If this mapping is included, the result is
546 case-insensitive, but dotless and dotted I's
547 are not distinguished
548 - If this mapping is excluded, the result is not
549 fully case-insensitive, but dotless and dotted
550 I's are distinguished
551
552If there is no case folding for that character, C<undef> is returned.
553
554For more information about case mappings see
555http://www.unicode.org/unicode/reports/tr21/
556
557=cut
558
559my %CASEFOLD;
560
561sub _casefold {
562 unless (%CASEFOLD) {
563 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
564 while (<$CASEFOLDFH>) {
565 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
566 my $code = hex($1);
567 $CASEFOLD{$code} = { code => $1,
568 status => $2,
569 mapping => $3 };
570 }
571 }
572 close($CASEFOLDFH);
573 }
574 }
575}
576
577sub casefold {
578 my $arg = shift;
579 my $code = _getcode($arg);
74f8133e 580 croak __PACKAGE__, "::casefold: unknown code '$arg'"
581 unless defined $code;
b08cd201 582
583 _casefold() unless %CASEFOLD;
584
585 return $CASEFOLD{$code};
586}
587
588=head2 casespec
589
55d7b906 590 use Unicode::UCD 'casespec';
b08cd201 591
592 my %casespec = casespec("09dc");
593
594The casespec() returns the potentially locale-dependent case mapping
595of the character specified by a B<code point argument>. The mapping
596may change the length of the string (which the basic Unicode case
597mappings as returned by charinfo() never do).
598
599If there is a case folding for that character, a reference to a hash
600with the following fields is returned:
601
602 key
603
604 code code point with at least four hexdigits
605 lower lowercase
606 title titlecase
607 upper uppercase
608 condition condition list (may be undef)
609
610The C<condition> is optional. Where present, it consists of one or
611more I<locales> or I<contexts>, separated by spaces (other than as
612used to separate elements, spaces are to be ignored). A condition
613list overrides the normal behavior if all of the listed conditions are
614true. Case distinctions in the condition list are not significant.
615Conditions preceded by "NON_" represent the negation of the condition
616
f499c386 617Note that when there are multiple case folding definitions for a
618single code point because of different locales, the value returned by
619casespec() is a hash reference which has the locales as the keys and
620hash references as described above as the values.
621
b08cd201 622A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 623followed by a "_" and a 2-letter ISO language code (possibly followed
624by a "_" and a variant code). You can find the lists of those codes,
625see L<Locale::Country> and L<Locale::Language>.
b08cd201 626
627A I<context> is one of the following choices:
628
629 FINAL The letter is not followed by a letter of
630 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
631 MODERN The mapping is only used for modern text
e618509d 632 AFTER_i The last base character was "i" (U+0069)
b08cd201 633
634For more information about case mappings see
635http://www.unicode.org/unicode/reports/tr21/
636
637=cut
638
639my %CASESPEC;
640
641sub _casespec {
642 unless (%CASESPEC) {
643 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
644 while (<$CASESPECFH>) {
645 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 646 my ($hexcode, $lower, $title, $upper, $condition) =
647 ($1, $2, $3, $4, $5);
648 my $code = hex($hexcode);
649 if (exists $CASESPEC{$code}) {
650 if (exists $CASESPEC{$code}->{code}) {
651 my ($oldlower,
652 $oldtitle,
653 $oldupper,
654 $oldcondition) =
655 @{$CASESPEC{$code}}{qw(lower
656 title
657 upper
658 condition)};
659 my ($oldlocale) =
660 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
661 if (defined $oldlocale) {
662 delete $CASESPEC{$code};
663 $CASESPEC{$code}->{$oldlocale} =
664 { code => $hexcode,
665 lower => $oldlower,
666 title => $oldtitle,
667 upper => $oldupper,
668 condition => $oldcondition };
669 } else {
670 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
671 }
672 }
673 my ($locale) =
674 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
675 $CASESPEC{$code}->{$locale} =
676 { code => $hexcode,
677 lower => $lower,
678 title => $title,
679 upper => $upper,
680 condition => $condition };
681 } else {
682 $CASESPEC{$code} =
683 { code => $hexcode,
684 lower => $lower,
685 title => $title,
686 upper => $upper,
687 condition => $condition };
688 }
b08cd201 689 }
690 }
691 close($CASESPECFH);
692 }
693 }
694}
695
696sub casespec {
697 my $arg = shift;
698 my $code = _getcode($arg);
74f8133e 699 croak __PACKAGE__, "::casespec: unknown code '$arg'"
700 unless defined $code;
b08cd201 701
702 _casespec() unless %CASESPEC;
703
704 return $CASESPEC{$code};
705}
706
55d7b906 707=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 708
55d7b906 709Unicode::UCD::UnicodeVersion() returns the version of the Unicode
710Character Database, in other words, the version of the Unicode
78bf21c2 711standard the database implements. The version is a string
712of numbers delimited by dots (C<'.'>).
10a6ecd2 713
714=cut
715
716my $UNICODEVERSION;
717
718sub UnicodeVersion {
719 unless (defined $UNICODEVERSION) {
720 openunicode(\$VERSIONFH, "version");
721 chomp($UNICODEVERSION = <$VERSIONFH>);
722 close($VERSIONFH);
723 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
724 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
725 }
726 return $UNICODEVERSION;
727}
3aa957f9 728
729=head2 Implementation Note
32c16050 730
ad9cab37 731The first use of charinfo() opens a read-only filehandle to the Unicode
732Character Database (the database is included in the Perl distribution).
78bf21c2 733The filehandle is then kept open for further queries. In other words,
734if you are wondering where one of your filehandles went, that's where.
32c16050 735
561c79ed 736=head1 AUTHOR
737
738Jarkko Hietaniemi
739
740=cut
741
7421;