NetWare update from C Aditya.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2 12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201 15 charinrange
16 compexcl
17 casefold casespec);
561c79ed 18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed 24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67 31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f 37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f 40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f 43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f 47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed 51=head1 DESCRIPTION
52
8b731da2 53The Unicode::UCD module offers a simple interface to the Unicode
54Character Database.
561c79ed 55
56=cut
57
10a6ecd2 58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201 62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed 65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67 76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed 79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201 89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed 91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
eb0cc9e3 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
d1be9408 115The C<block> property is the same as returned by charinfo(). It is
32c16050 116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 123
124=cut
125
10a6ecd2 126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
ac5ea531 138# Lingua::KO::Hangul::Util not part of the standard distribution
139# but it will be used if available.
140
141eval { require Lingua::KO::Hangul::Util };
142my $hasHangulUtil = ! $@;
143if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
145}
9087a70b 146
147sub hangul_decomp { # internal: called from charinfo
ac5ea531 148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
152 }
153 return;
154}
155
156sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 158}
159
9087a70b 160sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 162}
163
164my @CharinfoRanges = (
165# block name
166# [ first, last, coderef to name, coderef to decompose ],
167# CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
169# CJK Ideographs
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
171# Hangul Syllables
ac5ea531 172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 173# Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175# Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
177# Low Surrogates
178 [ 0xDC00, 0xDFFF, undef, undef ],
179# The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181# CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183# Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185# Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
187);
188
561c79ed 189sub charinfo {
10a6ecd2 190 my $arg = shift;
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
e63dbbf9 194 my $hexk = sprintf("%06X", $code);
a6fa416b 195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
74f8133e 197 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 198 $rcode = $hexk;
e63dbbf9 199 $rcode =~ s/^0+//;
200 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 204 last;
205 }
206 }
551b6b6f 207 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 208 if (defined $UNICODEFH) {
e63dbbf9 209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 211 my $line = <$UNICODEFH>;
561c79ed 212 chomp $line;
213 my %prop;
214 @prop{qw(
215 code name category
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
219 upper lower title
220 )} = split(/;/, $line, -1);
e63dbbf9 221 $hexk =~ s/^0+//;
222 $hexk = sprintf("%04X", hex($hexk));
561c79ed 223 if ($prop{code} eq $hexk) {
a196fbfd 224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
a6fa416b 226 if(defined $rname){
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
230 }
b08cd201 231 return \%prop;
561c79ed 232 }
233 }
234 }
235 return;
236}
237
e882dd67 238sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
240
241 return if $lo > $hi;
242
243 my $mid = int(($lo+$hi) / 2);
244
245 if ($table->[$mid]->[0] < $code) {
10a6ecd2 246 if ($table->[$mid]->[1] >= $code) {
e882dd67 247 return $table->[$mid]->[2];
248 } else {
249 _search($table, $mid + 1, $hi, $code);
250 }
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
253 } else {
254 return $table->[$mid]->[2];
255 }
256}
257
10a6ecd2 258sub charinrange {
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
264}
265
354a27bf 266=head2 charblock
561c79ed 267
55d7b906 268 use Unicode::UCD 'charblock';
561c79ed 269
270 my $charblock = charblock(0x41);
10a6ecd2 271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
274
78bf21c2 275 my $range = charblock('Armenian');
10a6ecd2 276
78bf21c2 277With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 278belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 279positions within all blocks are defined.
10a6ecd2 280
78bf21c2 281See also L</Blocks versus Scripts>.
282
eb0cc9e3 283If supplied with an argument that can't be a code point, charblock() tries
284to do the opposite and interpret the argument as a character block. The
285return value is a I<range>: an anonymous list of lists that contain
286I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287code point is in a range using the L</charinrange> function. If the
288argument is not a known charater block, C<undef> is returned.
561c79ed 289
561c79ed 290=cut
291
292my @BLOCKS;
10a6ecd2 293my %BLOCKS;
561c79ed 294
10a6ecd2 295sub _charblocks {
561c79ed 296 unless (@BLOCKS) {
10a6ecd2 297 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
298 while (<$BLOCKSFH>) {
2796c109 299 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 300 my ($lo, $hi) = (hex($1), hex($2));
301 my $subrange = [ $lo, $hi, $3 ];
302 push @BLOCKS, $subrange;
303 push @{$BLOCKS{$3}}, $subrange;
561c79ed 304 }
305 }
10a6ecd2 306 close($BLOCKSFH);
561c79ed 307 }
308 }
10a6ecd2 309}
310
311sub charblock {
312 my $arg = shift;
313
314 _charblocks() unless @BLOCKS;
315
316 my $code = _getcode($arg);
561c79ed 317
10a6ecd2 318 if (defined $code) {
319 _search(\@BLOCKS, 0, $#BLOCKS, $code);
320 } else {
321 if (exists $BLOCKS{$arg}) {
322 return $BLOCKS{$arg};
323 } else {
324 return;
325 }
326 }
e882dd67 327}
328
329=head2 charscript
330
55d7b906 331 use Unicode::UCD 'charscript';
e882dd67 332
333 my $charscript = charscript(0x41);
10a6ecd2 334 my $charscript = charscript(1234);
335 my $charscript = charscript("U+263a");
e882dd67 336
78bf21c2 337 my $range = charscript('Thai');
10a6ecd2 338
78bf21c2 339With a B<code point argument> charscript() returns the I<script> the
b08cd201 340character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 341
78bf21c2 342See also L</Blocks versus Scripts>.
343
eb0cc9e3 344If supplied with an argument that can't be a code point, charscript() tries
345to do the opposite and interpret the argument as a character script. The
346return value is a I<range>: an anonymous list of lists that contain
347I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
348code point is in a range using the L</charinrange> function. If the
349argument is not a known charater script, C<undef> is returned.
e882dd67 350
e882dd67 351=cut
352
353my @SCRIPTS;
10a6ecd2 354my %SCRIPTS;
e882dd67 355
10a6ecd2 356sub _charscripts {
e882dd67 357 unless (@SCRIPTS) {
10a6ecd2 358 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
359 while (<$SCRIPTSFH>) {
e882dd67 360 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 361 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
362 my $script = lc($3);
363 $script =~ s/\b(\w)/uc($1)/ge;
364 my $subrange = [ $lo, $hi, $script ];
365 push @SCRIPTS, $subrange;
366 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 367 }
368 }
10a6ecd2 369 close($SCRIPTSFH);
e882dd67 370 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
371 }
372 }
10a6ecd2 373}
374
375sub charscript {
376 my $arg = shift;
377
378 _charscripts() unless @SCRIPTS;
e882dd67 379
10a6ecd2 380 my $code = _getcode($arg);
381
382 if (defined $code) {
383 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
384 } else {
385 if (exists $SCRIPTS{$arg}) {
386 return $SCRIPTS{$arg};
387 } else {
388 return;
389 }
390 }
391}
392
393=head2 charblocks
394
55d7b906 395 use Unicode::UCD 'charblocks';
10a6ecd2 396
b08cd201 397 my $charblocks = charblocks();
10a6ecd2 398
b08cd201 399charblocks() returns a reference to a hash with the known block names
400as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 401
78bf21c2 402See also L</Blocks versus Scripts>.
403
10a6ecd2 404=cut
405
406sub charblocks {
b08cd201 407 _charblocks() unless %BLOCKS;
408 return \%BLOCKS;
10a6ecd2 409}
410
411=head2 charscripts
412
55d7b906 413 use Unicode::UCD 'charscripts';
10a6ecd2 414
415 my %charscripts = charscripts();
416
417charscripts() returns a hash with the known script names as the keys,
418and the code point ranges (see L</charscript>) as the values.
419
78bf21c2 420See also L</Blocks versus Scripts>.
421
10a6ecd2 422=cut
423
424sub charscripts {
b08cd201 425 _charscripts() unless %SCRIPTS;
426 return \%SCRIPTS;
561c79ed 427}
428
10a6ecd2 429=head2 Blocks versus Scripts
ad9cab37 430
10a6ecd2 431The difference between a block and a script is that scripts are closer
432to the linguistic notion of a set of characters required to present
433languages, while block is more of an artifact of the Unicode character
eb0cc9e3 434numbering and separation into blocks of (mostly) 256 characters.
3aa957f9 435
436For example the Latin B<script> is spread over several B<blocks>, such
437as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
438C<Latin Extended-B>. On the other hand, the Latin script does not
439contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 440the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 441or the punctuation.
ad9cab37 442
3aa957f9 443For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 444
445For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
446
3aa957f9 447=head2 Matching Scripts and Blocks
448
eb0cc9e3 449Scripts are matched with the regular-expression construct
450C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
451while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
452any of the 256 code points in the Tibetan block).
10a6ecd2 453
b08cd201 454=head2 Code Point Arguments
455
92e830a9 456A I<code point argument> is either a decimal or a hexadecimal scalar
457designating a Unicode character, or C<U+> followed by hexadecimals
78bf21c2 458designating a Unicode character. Note that Unicode is B<not> limited
459to 16 bits (the number of Unicode characters is open-ended, in theory
460unlimited): you may have more than 4 hexdigits.
b08cd201 461
10a6ecd2 462=head2 charinrange
463
464In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
465can also test whether a code point is in the I<range> as returned by
466L</charblock> and L</charscript> or as the values of the hash returned
e618509d 467by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 468
55d7b906 469 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 470
471 $range = charscript('Hiragana');
e145285f 472 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 473
474=cut
475
b08cd201 476=head2 compexcl
477
55d7b906 478 use Unicode::UCD 'compexcl';
b08cd201 479
480 my $compexcl = compexcl("09dc");
481
482The compexcl() returns the composition exclusion (that is, if the
9046a8ae 483character should not be produced during a precomposition) of the
484character specified by a B<code point argument>.
b08cd201 485
486If there is a composition exclusion for the character, true is
487returned. Otherwise, false is returned.
488
489=cut
490
491my %COMPEXCL;
492
493sub _compexcl {
494 unless (%COMPEXCL) {
551b6b6f 495 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
b08cd201 496 while (<$COMPEXCLFH>) {
822ebcc8 497 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201 498 my $code = hex($1);
499 $COMPEXCL{$code} = undef;
500 }
501 }
502 close($COMPEXCLFH);
503 }
504 }
505}
506
507sub compexcl {
508 my $arg = shift;
509 my $code = _getcode($arg);
74f8133e 510 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
511 unless defined $code;
b08cd201 512
513 _compexcl() unless %COMPEXCL;
514
515 return exists $COMPEXCL{$code};
516}
517
518=head2 casefold
519
55d7b906 520 use Unicode::UCD 'casefold';
b08cd201 521
522 my %casefold = casefold("09dc");
523
524The casefold() returns the locale-independent case folding of the
525character specified by a B<code point argument>.
526
527If there is a case folding for that character, a reference to a hash
528with the following fields is returned:
529
530 key
531
532 code code point with at least four hexdigits
533 status "C", "F", "S", or "I"
534 mapping one or more codes separated by spaces
535
536The meaning of the I<status> is as follows:
537
538 C common case folding, common mappings shared
539 by both simple and full mappings
540 F full case folding, mappings that cause strings
541 to grow in length. Multiple characters are separated
542 by spaces
543 S simple case folding, mappings to single characters
544 where different from F
545 I special case for dotted uppercase I and
546 dotless lowercase i
547 - If this mapping is included, the result is
548 case-insensitive, but dotless and dotted I's
549 are not distinguished
550 - If this mapping is excluded, the result is not
551 fully case-insensitive, but dotless and dotted
552 I's are distinguished
553
554If there is no case folding for that character, C<undef> is returned.
555
556For more information about case mappings see
557http://www.unicode.org/unicode/reports/tr21/
558
559=cut
560
561my %CASEFOLD;
562
563sub _casefold {
564 unless (%CASEFOLD) {
551b6b6f 565 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
b08cd201 566 while (<$CASEFOLDFH>) {
567 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
568 my $code = hex($1);
569 $CASEFOLD{$code} = { code => $1,
570 status => $2,
571 mapping => $3 };
572 }
573 }
574 close($CASEFOLDFH);
575 }
576 }
577}
578
579sub casefold {
580 my $arg = shift;
581 my $code = _getcode($arg);
74f8133e 582 croak __PACKAGE__, "::casefold: unknown code '$arg'"
583 unless defined $code;
b08cd201 584
585 _casefold() unless %CASEFOLD;
586
587 return $CASEFOLD{$code};
588}
589
590=head2 casespec
591
55d7b906 592 use Unicode::UCD 'casespec';
b08cd201 593
594 my %casespec = casespec("09dc");
595
596The casespec() returns the potentially locale-dependent case mapping
597of the character specified by a B<code point argument>. The mapping
598may change the length of the string (which the basic Unicode case
599mappings as returned by charinfo() never do).
600
601If there is a case folding for that character, a reference to a hash
602with the following fields is returned:
603
604 key
605
606 code code point with at least four hexdigits
607 lower lowercase
608 title titlecase
609 upper uppercase
610 condition condition list (may be undef)
611
612The C<condition> is optional. Where present, it consists of one or
613more I<locales> or I<contexts>, separated by spaces (other than as
614used to separate elements, spaces are to be ignored). A condition
615list overrides the normal behavior if all of the listed conditions are
616true. Case distinctions in the condition list are not significant.
617Conditions preceded by "NON_" represent the negation of the condition
618
f499c386 619Note that when there are multiple case folding definitions for a
620single code point because of different locales, the value returned by
621casespec() is a hash reference which has the locales as the keys and
622hash references as described above as the values.
623
b08cd201 624A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 625followed by a "_" and a 2-letter ISO language code (possibly followed
626by a "_" and a variant code). You can find the lists of those codes,
627see L<Locale::Country> and L<Locale::Language>.
b08cd201 628
629A I<context> is one of the following choices:
630
631 FINAL The letter is not followed by a letter of
632 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
633 MODERN The mapping is only used for modern text
e618509d 634 AFTER_i The last base character was "i" (U+0069)
b08cd201 635
636For more information about case mappings see
637http://www.unicode.org/unicode/reports/tr21/
638
639=cut
640
641my %CASESPEC;
642
643sub _casespec {
644 unless (%CASESPEC) {
551b6b6f 645 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
b08cd201 646 while (<$CASESPECFH>) {
647 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 648 my ($hexcode, $lower, $title, $upper, $condition) =
649 ($1, $2, $3, $4, $5);
650 my $code = hex($hexcode);
651 if (exists $CASESPEC{$code}) {
652 if (exists $CASESPEC{$code}->{code}) {
653 my ($oldlower,
654 $oldtitle,
655 $oldupper,
656 $oldcondition) =
657 @{$CASESPEC{$code}}{qw(lower
658 title
659 upper
660 condition)};
822ebcc8 661 if (defined $oldcondition) {
662 my ($oldlocale) =
f499c386 663 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386 664 delete $CASESPEC{$code};
665 $CASESPEC{$code}->{$oldlocale} =
666 { code => $hexcode,
667 lower => $oldlower,
668 title => $oldtitle,
669 upper => $oldupper,
670 condition => $oldcondition };
f499c386 671 }
672 }
673 my ($locale) =
674 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
675 $CASESPEC{$code}->{$locale} =
676 { code => $hexcode,
677 lower => $lower,
678 title => $title,
679 upper => $upper,
680 condition => $condition };
681 } else {
682 $CASESPEC{$code} =
683 { code => $hexcode,
684 lower => $lower,
685 title => $title,
686 upper => $upper,
687 condition => $condition };
688 }
b08cd201 689 }
690 }
691 close($CASESPECFH);
692 }
693 }
694}
695
696sub casespec {
697 my $arg = shift;
698 my $code = _getcode($arg);
74f8133e 699 croak __PACKAGE__, "::casespec: unknown code '$arg'"
700 unless defined $code;
b08cd201 701
702 _casespec() unless %CASESPEC;
703
704 return $CASESPEC{$code};
705}
706
55d7b906 707=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 708
55d7b906 709Unicode::UCD::UnicodeVersion() returns the version of the Unicode
710Character Database, in other words, the version of the Unicode
78bf21c2 711standard the database implements. The version is a string
712of numbers delimited by dots (C<'.'>).
10a6ecd2 713
714=cut
715
716my $UNICODEVERSION;
717
718sub UnicodeVersion {
719 unless (defined $UNICODEVERSION) {
720 openunicode(\$VERSIONFH, "version");
721 chomp($UNICODEVERSION = <$VERSIONFH>);
722 close($VERSIONFH);
723 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
724 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
725 }
726 return $UNICODEVERSION;
727}
3aa957f9 728
729=head2 Implementation Note
32c16050 730
ad9cab37 731The first use of charinfo() opens a read-only filehandle to the Unicode
732Character Database (the database is included in the Perl distribution).
78bf21c2 733The filehandle is then kept open for further queries. In other words,
734if you are wondering where one of your filehandles went, that's where.
32c16050 735
8b731da2 736=head1 BUGS
737
738Does not yet support EBCDIC platforms.
739
561c79ed 740=head1 AUTHOR
741
742Jarkko Hietaniemi
743
744=cut
745
7461;