Allow several arguments to display().
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2 12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201 15 charinrange
16 compexcl
17 casefold casespec);
561c79ed 18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed 24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67 31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f 37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f 40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f 43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f 47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed 51=head1 DESCRIPTION
52
8b731da2 53The Unicode::UCD module offers a simple interface to the Unicode
54Character Database.
561c79ed 55
56=cut
57
10a6ecd2 58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201 62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed 65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67 76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed 79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201 89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed 91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
e882dd67 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
d1be9408 115The C<block> property is the same as returned by charinfo(). It is
32c16050 116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 123
124=cut
125
10a6ecd2 126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
ac5ea531 138# Lingua::KO::Hangul::Util not part of the standard distribution
139# but it will be used if available.
140
141eval { require Lingua::KO::Hangul::Util };
142my $hasHangulUtil = ! $@;
143if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
145}
9087a70b 146
147sub hangul_decomp { # internal: called from charinfo
ac5ea531 148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
152 }
153 return;
154}
155
156sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 158}
159
9087a70b 160sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 162}
163
164my @CharinfoRanges = (
165# block name
166# [ first, last, coderef to name, coderef to decompose ],
167# CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
169# CJK Ideographs
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
171# Hangul Syllables
ac5ea531 172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 173# Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175# Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
177# Low Surrogates
178 [ 0xDC00, 0xDFFF, undef, undef ],
179# The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181# CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183# Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185# Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
187);
188
561c79ed 189sub charinfo {
10a6ecd2 190 my $arg = shift;
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
e63dbbf9 194 my $hexk = sprintf("%06X", $code);
a6fa416b 195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
74f8133e 197 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 198 $rcode = $hexk;
e63dbbf9 199 $rcode =~ s/^0+//;
200 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 204 last;
205 }
206 }
74f8133e 207 openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2 208 if (defined $UNICODEFH) {
e63dbbf9 209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 211 my $line = <$UNICODEFH>;
561c79ed 212 chomp $line;
213 my %prop;
214 @prop{qw(
215 code name category
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
219 upper lower title
220 )} = split(/;/, $line, -1);
e63dbbf9 221 $hexk =~ s/^0+//;
222 $hexk = sprintf("%04X", hex($hexk));
561c79ed 223 if ($prop{code} eq $hexk) {
a196fbfd 224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
a6fa416b 226 if(defined $rname){
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
230 }
b08cd201 231 return \%prop;
561c79ed 232 }
233 }
234 }
235 return;
236}
237
e882dd67 238sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
240
241 return if $lo > $hi;
242
243 my $mid = int(($lo+$hi) / 2);
244
245 if ($table->[$mid]->[0] < $code) {
10a6ecd2 246 if ($table->[$mid]->[1] >= $code) {
e882dd67 247 return $table->[$mid]->[2];
248 } else {
249 _search($table, $mid + 1, $hi, $code);
250 }
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
253 } else {
254 return $table->[$mid]->[2];
255 }
256}
257
10a6ecd2 258sub charinrange {
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
264}
265
354a27bf 266=head2 charblock
561c79ed 267
55d7b906 268 use Unicode::UCD 'charblock';
561c79ed 269
270 my $charblock = charblock(0x41);
10a6ecd2 271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
274
78bf21c2 275 my $range = charblock('Armenian');
10a6ecd2 276
78bf21c2 277With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 278belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 279positions within all blocks are defined.
10a6ecd2 280
78bf21c2 281See also L</Blocks versus Scripts>.
282
10a6ecd2 283If supplied with an argument that can't be a code point, charblock()
284tries to do the opposite and interpret the argument as a character
285block. The return value is a I<range>: an anonymous list that
286contains anonymous lists, which in turn contain I<start-of-range>,
287I<end-of-range> code point pairs. You can test whether a code point
288is in a range using the L</charinrange> function. If the argument is
289not a known charater block, C<undef> is returned.
561c79ed 290
561c79ed 291=cut
292
293my @BLOCKS;
10a6ecd2 294my %BLOCKS;
561c79ed 295
10a6ecd2 296sub _charblocks {
561c79ed 297 unless (@BLOCKS) {
10a6ecd2 298 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
299 while (<$BLOCKSFH>) {
2796c109 300 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 301 my ($lo, $hi) = (hex($1), hex($2));
302 my $subrange = [ $lo, $hi, $3 ];
303 push @BLOCKS, $subrange;
304 push @{$BLOCKS{$3}}, $subrange;
561c79ed 305 }
306 }
10a6ecd2 307 close($BLOCKSFH);
561c79ed 308 }
309 }
10a6ecd2 310}
311
312sub charblock {
313 my $arg = shift;
314
315 _charblocks() unless @BLOCKS;
316
317 my $code = _getcode($arg);
561c79ed 318
10a6ecd2 319 if (defined $code) {
320 _search(\@BLOCKS, 0, $#BLOCKS, $code);
321 } else {
322 if (exists $BLOCKS{$arg}) {
323 return $BLOCKS{$arg};
324 } else {
325 return;
326 }
327 }
e882dd67 328}
329
330=head2 charscript
331
55d7b906 332 use Unicode::UCD 'charscript';
e882dd67 333
334 my $charscript = charscript(0x41);
10a6ecd2 335 my $charscript = charscript(1234);
336 my $charscript = charscript("U+263a");
e882dd67 337
78bf21c2 338 my $range = charscript('Thai');
10a6ecd2 339
78bf21c2 340With a B<code point argument> charscript() returns the I<script> the
b08cd201 341character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 342
78bf21c2 343See also L</Blocks versus Scripts>.
344
10a6ecd2 345If supplied with an argument that can't be a code point, charscript()
346tries to do the opposite and interpret the argument as a character
347script. The return value is a I<range>: an anonymous list that
348contains anonymous lists, which in turn contain I<start-of-range>,
349I<end-of-range> code point pairs. You can test whether a code point
350is in a range using the L</charinrange> function. If the argument is
351not a known charater script, C<undef> is returned.
e882dd67 352
e882dd67 353=cut
354
355my @SCRIPTS;
10a6ecd2 356my %SCRIPTS;
e882dd67 357
10a6ecd2 358sub _charscripts {
e882dd67 359 unless (@SCRIPTS) {
10a6ecd2 360 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
361 while (<$SCRIPTSFH>) {
e882dd67 362 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 363 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
364 my $script = lc($3);
365 $script =~ s/\b(\w)/uc($1)/ge;
366 my $subrange = [ $lo, $hi, $script ];
367 push @SCRIPTS, $subrange;
368 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 369 }
370 }
10a6ecd2 371 close($SCRIPTSFH);
e882dd67 372 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
373 }
374 }
10a6ecd2 375}
376
377sub charscript {
378 my $arg = shift;
379
380 _charscripts() unless @SCRIPTS;
e882dd67 381
10a6ecd2 382 my $code = _getcode($arg);
383
384 if (defined $code) {
385 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
386 } else {
387 if (exists $SCRIPTS{$arg}) {
388 return $SCRIPTS{$arg};
389 } else {
390 return;
391 }
392 }
393}
394
395=head2 charblocks
396
55d7b906 397 use Unicode::UCD 'charblocks';
10a6ecd2 398
b08cd201 399 my $charblocks = charblocks();
10a6ecd2 400
b08cd201 401charblocks() returns a reference to a hash with the known block names
402as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 403
78bf21c2 404See also L</Blocks versus Scripts>.
405
10a6ecd2 406=cut
407
408sub charblocks {
b08cd201 409 _charblocks() unless %BLOCKS;
410 return \%BLOCKS;
10a6ecd2 411}
412
413=head2 charscripts
414
55d7b906 415 use Unicode::UCD 'charscripts';
10a6ecd2 416
417 my %charscripts = charscripts();
418
419charscripts() returns a hash with the known script names as the keys,
420and the code point ranges (see L</charscript>) as the values.
421
78bf21c2 422See also L</Blocks versus Scripts>.
423
10a6ecd2 424=cut
425
426sub charscripts {
b08cd201 427 _charscripts() unless %SCRIPTS;
428 return \%SCRIPTS;
561c79ed 429}
430
10a6ecd2 431=head2 Blocks versus Scripts
ad9cab37 432
10a6ecd2 433The difference between a block and a script is that scripts are closer
434to the linguistic notion of a set of characters required to present
435languages, while block is more of an artifact of the Unicode character
436numbering and separation into blocks of 256 characters.
3aa957f9 437
438For example the Latin B<script> is spread over several B<blocks>, such
439as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
440C<Latin Extended-B>. On the other hand, the Latin script does not
441contain all the characters of the C<Basic Latin> block (also known as
442the ASCII): it includes only the letters, not for example the digits
443or the punctuation.
ad9cab37 444
3aa957f9 445For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 446
447For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
448
3aa957f9 449=head2 Matching Scripts and Blocks
450
451Both scripts and blocks can be matched using the regular expression
452construct C<\p{In...}> and its negation C<\P{In...}>.
453
454The name of the script or the block comes after the C<In>, for example
455C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2 456removed from the names for the C<\p{In...}>, for example
457C<LatinExtendedA> instead of C<Latin Extended-A>.
458
78bf21c2 459There are a few cases where there is both a script and a block by the
460same name, in these cases the block version has C<Block> appended to
461its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
462the block.
10a6ecd2 463
b08cd201 464=head2 Code Point Arguments
465
78bf21c2 466A <code point argument> is either a decimal or a hexadecimal scalar
467designating a Unicode character, or "U+" followed by hexadecimals
468designating a Unicode character. Note that Unicode is B<not> limited
469to 16 bits (the number of Unicode characters is open-ended, in theory
470unlimited): you may have more than 4 hexdigits.
b08cd201 471
10a6ecd2 472=head2 charinrange
473
474In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
475can also test whether a code point is in the I<range> as returned by
476L</charblock> and L</charscript> or as the values of the hash returned
e618509d 477by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 478
55d7b906 479 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 480
481 $range = charscript('Hiragana');
e145285f 482 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 483
484=cut
485
b08cd201 486=head2 compexcl
487
55d7b906 488 use Unicode::UCD 'compexcl';
b08cd201 489
490 my $compexcl = compexcl("09dc");
491
492The compexcl() returns the composition exclusion (that is, if the
9046a8ae 493character should not be produced during a precomposition) of the
494character specified by a B<code point argument>.
b08cd201 495
496If there is a composition exclusion for the character, true is
497returned. Otherwise, false is returned.
498
499=cut
500
501my %COMPEXCL;
502
503sub _compexcl {
504 unless (%COMPEXCL) {
505 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
506 while (<$COMPEXCLFH>) {
507 if (/^([0-9A-F]+) \# /) {
508 my $code = hex($1);
509 $COMPEXCL{$code} = undef;
510 }
511 }
512 close($COMPEXCLFH);
513 }
514 }
515}
516
517sub compexcl {
518 my $arg = shift;
519 my $code = _getcode($arg);
74f8133e 520 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
521 unless defined $code;
b08cd201 522
523 _compexcl() unless %COMPEXCL;
524
525 return exists $COMPEXCL{$code};
526}
527
528=head2 casefold
529
55d7b906 530 use Unicode::UCD 'casefold';
b08cd201 531
532 my %casefold = casefold("09dc");
533
534The casefold() returns the locale-independent case folding of the
535character specified by a B<code point argument>.
536
537If there is a case folding for that character, a reference to a hash
538with the following fields is returned:
539
540 key
541
542 code code point with at least four hexdigits
543 status "C", "F", "S", or "I"
544 mapping one or more codes separated by spaces
545
546The meaning of the I<status> is as follows:
547
548 C common case folding, common mappings shared
549 by both simple and full mappings
550 F full case folding, mappings that cause strings
551 to grow in length. Multiple characters are separated
552 by spaces
553 S simple case folding, mappings to single characters
554 where different from F
555 I special case for dotted uppercase I and
556 dotless lowercase i
557 - If this mapping is included, the result is
558 case-insensitive, but dotless and dotted I's
559 are not distinguished
560 - If this mapping is excluded, the result is not
561 fully case-insensitive, but dotless and dotted
562 I's are distinguished
563
564If there is no case folding for that character, C<undef> is returned.
565
566For more information about case mappings see
567http://www.unicode.org/unicode/reports/tr21/
568
569=cut
570
571my %CASEFOLD;
572
573sub _casefold {
574 unless (%CASEFOLD) {
575 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
576 while (<$CASEFOLDFH>) {
577 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
578 my $code = hex($1);
579 $CASEFOLD{$code} = { code => $1,
580 status => $2,
581 mapping => $3 };
582 }
583 }
584 close($CASEFOLDFH);
585 }
586 }
587}
588
589sub casefold {
590 my $arg = shift;
591 my $code = _getcode($arg);
74f8133e 592 croak __PACKAGE__, "::casefold: unknown code '$arg'"
593 unless defined $code;
b08cd201 594
595 _casefold() unless %CASEFOLD;
596
597 return $CASEFOLD{$code};
598}
599
600=head2 casespec
601
55d7b906 602 use Unicode::UCD 'casespec';
b08cd201 603
604 my %casespec = casespec("09dc");
605
606The casespec() returns the potentially locale-dependent case mapping
607of the character specified by a B<code point argument>. The mapping
608may change the length of the string (which the basic Unicode case
609mappings as returned by charinfo() never do).
610
611If there is a case folding for that character, a reference to a hash
612with the following fields is returned:
613
614 key
615
616 code code point with at least four hexdigits
617 lower lowercase
618 title titlecase
619 upper uppercase
620 condition condition list (may be undef)
621
622The C<condition> is optional. Where present, it consists of one or
623more I<locales> or I<contexts>, separated by spaces (other than as
624used to separate elements, spaces are to be ignored). A condition
625list overrides the normal behavior if all of the listed conditions are
626true. Case distinctions in the condition list are not significant.
627Conditions preceded by "NON_" represent the negation of the condition
628
f499c386 629Note that when there are multiple case folding definitions for a
630single code point because of different locales, the value returned by
631casespec() is a hash reference which has the locales as the keys and
632hash references as described above as the values.
633
b08cd201 634A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 635followed by a "_" and a 2-letter ISO language code (possibly followed
636by a "_" and a variant code). You can find the lists of those codes,
637see L<Locale::Country> and L<Locale::Language>.
b08cd201 638
639A I<context> is one of the following choices:
640
641 FINAL The letter is not followed by a letter of
642 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
643 MODERN The mapping is only used for modern text
e618509d 644 AFTER_i The last base character was "i" (U+0069)
b08cd201 645
646For more information about case mappings see
647http://www.unicode.org/unicode/reports/tr21/
648
649=cut
650
651my %CASESPEC;
652
653sub _casespec {
654 unless (%CASESPEC) {
655 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
656 while (<$CASESPECFH>) {
657 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 658 my ($hexcode, $lower, $title, $upper, $condition) =
659 ($1, $2, $3, $4, $5);
660 my $code = hex($hexcode);
661 if (exists $CASESPEC{$code}) {
662 if (exists $CASESPEC{$code}->{code}) {
663 my ($oldlower,
664 $oldtitle,
665 $oldupper,
666 $oldcondition) =
667 @{$CASESPEC{$code}}{qw(lower
668 title
669 upper
670 condition)};
671 my ($oldlocale) =
672 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
673 if (defined $oldlocale) {
674 delete $CASESPEC{$code};
675 $CASESPEC{$code}->{$oldlocale} =
676 { code => $hexcode,
677 lower => $oldlower,
678 title => $oldtitle,
679 upper => $oldupper,
680 condition => $oldcondition };
681 } else {
682 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
683 }
684 }
685 my ($locale) =
686 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
687 $CASESPEC{$code}->{$locale} =
688 { code => $hexcode,
689 lower => $lower,
690 title => $title,
691 upper => $upper,
692 condition => $condition };
693 } else {
694 $CASESPEC{$code} =
695 { code => $hexcode,
696 lower => $lower,
697 title => $title,
698 upper => $upper,
699 condition => $condition };
700 }
b08cd201 701 }
702 }
703 close($CASESPECFH);
704 }
705 }
706}
707
708sub casespec {
709 my $arg = shift;
710 my $code = _getcode($arg);
74f8133e 711 croak __PACKAGE__, "::casespec: unknown code '$arg'"
712 unless defined $code;
b08cd201 713
714 _casespec() unless %CASESPEC;
715
716 return $CASESPEC{$code};
717}
718
55d7b906 719=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 720
55d7b906 721Unicode::UCD::UnicodeVersion() returns the version of the Unicode
722Character Database, in other words, the version of the Unicode
78bf21c2 723standard the database implements. The version is a string
724of numbers delimited by dots (C<'.'>).
10a6ecd2 725
726=cut
727
728my $UNICODEVERSION;
729
730sub UnicodeVersion {
731 unless (defined $UNICODEVERSION) {
732 openunicode(\$VERSIONFH, "version");
733 chomp($UNICODEVERSION = <$VERSIONFH>);
734 close($VERSIONFH);
735 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
736 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
737 }
738 return $UNICODEVERSION;
739}
3aa957f9 740
741=head2 Implementation Note
32c16050 742
ad9cab37 743The first use of charinfo() opens a read-only filehandle to the Unicode
744Character Database (the database is included in the Perl distribution).
78bf21c2 745The filehandle is then kept open for further queries. In other words,
746if you are wondering where one of your filehandles went, that's where.
32c16050 747
8b731da2 748=head1 BUGS
749
750Does not yet support EBCDIC platforms.
751
561c79ed 752=head1 AUTHOR
753
754Jarkko Hietaniemi
755
756=cut
757
7581;