Explain the 'Wide character in print' a bit more.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed 7
741297c1 8use Storable qw(dclone);
9
561c79ed 10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2 14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
18 compexcl
19 casefold casespec);
561c79ed 20
21use Carp;
22
23=head1 NAME
24
55d7b906 25Unicode::UCD - Unicode character database
561c79ed 26
27=head1 SYNOPSIS
28
55d7b906 29 use Unicode::UCD 'charinfo';
b08cd201 30 my $charinfo = charinfo($codepoint);
561c79ed 31
55d7b906 32 use Unicode::UCD 'charblock';
e882dd67 33 my $charblock = charblock($codepoint);
34
55d7b906 35 use Unicode::UCD 'charscript';
65044554 36 my $charscript = charscript($codepoint);
561c79ed 37
55d7b906 38 use Unicode::UCD 'charblocks';
e145285f 39 my $charblocks = charblocks();
40
55d7b906 41 use Unicode::UCD 'charscripts';
e145285f 42 my %charscripts = charscripts();
43
55d7b906 44 use Unicode::UCD qw(charscript charinrange);
e145285f 45 my $range = charscript($script);
46 print "looks like $script\n" if charinrange($range, $codepoint);
47
55d7b906 48 use Unicode::UCD 'compexcl';
e145285f 49 my $compexcl = compexcl($codepoint);
50
55d7b906 51 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 52
561c79ed 53=head1 DESCRIPTION
54
8b731da2 55The Unicode::UCD module offers a simple interface to the Unicode
56Character Database.
561c79ed 57
58=cut
59
10a6ecd2 60my $UNICODEFH;
61my $BLOCKSFH;
62my $SCRIPTSFH;
63my $VERSIONFH;
b08cd201 64my $COMPEXCLFH;
65my $CASEFOLDFH;
66my $CASESPECFH;
561c79ed 67
68sub openunicode {
69 my ($rfh, @path) = @_;
70 my $f;
71 unless (defined $$rfh) {
72 for my $d (@INC) {
73 use File::Spec;
55d7b906 74 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 75 last if open($$rfh, $f);
e882dd67 76 undef $f;
561c79ed 77 }
e882dd67 78 croak __PACKAGE__, ": failed to find ",
79 File::Spec->catfile(@path), " in @INC"
80 unless defined $f;
561c79ed 81 }
82 return $f;
83}
84
85=head2 charinfo
86
55d7b906 87 use Unicode::UCD 'charinfo';
561c79ed 88
b08cd201 89 my $charinfo = charinfo(0x41);
561c79ed 90
b08cd201 91charinfo() returns a reference to a hash that has the following fields
92as defined by the Unicode standard:
561c79ed 93
94 key
95
96 code code point with at least four hexdigits
97 name name of the character IN UPPER CASE
98 category general category of the character
99 combining classes used in the Canonical Ordering Algorithm
100 bidi bidirectional category
101 decomposition character decomposition mapping
102 decimal if decimal digit this is the integer numeric value
103 digit if digit this is the numeric value
104 numeric if numeric is the integer or rational numeric value
105 mirrored if mirrored in bidirectional text
106 unicode10 Unicode 1.0 name if existed and different
107 comment ISO 10646 comment field
108 upper uppercase equivalent mapping
109 lower lowercase equivalent mapping
110 title titlecase equivalent mapping
e882dd67 111
561c79ed 112 block block the character belongs to (used in \p{In...})
eb0cc9e3 113 script script the character belongs to
561c79ed 114
b08cd201 115If no match is found, a reference to an empty hash is returned.
561c79ed 116
d1be9408 117The C<block> property is the same as returned by charinfo(). It is
32c16050 118not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 119Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
120(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 121
122Note that you cannot do (de)composition and casing based solely on the
123above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 124you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 125
126=cut
127
10a6ecd2 128sub _getcode {
129 my $arg = shift;
130
dc0a4417 131 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 132 return $arg;
dc0a4417 133 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2 134 return hex($1);
135 }
136
137 return;
138}
139
ac5ea531 140# Lingua::KO::Hangul::Util not part of the standard distribution
141# but it will be used if available.
142
143eval { require Lingua::KO::Hangul::Util };
144my $hasHangulUtil = ! $@;
145if ($hasHangulUtil) {
146 Lingua::KO::Hangul::Util->import();
147}
9087a70b 148
149sub hangul_decomp { # internal: called from charinfo
ac5ea531 150 if ($hasHangulUtil) {
151 my @tmp = decomposeHangul(shift);
152 return sprintf("%04X %04X", @tmp) if @tmp == 2;
153 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
154 }
155 return;
156}
157
158sub hangul_charname { # internal: called from charinfo
159 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 160}
161
9087a70b 162sub han_charname { # internal: called from charinfo
163 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 164}
165
166my @CharinfoRanges = (
167# block name
168# [ first, last, coderef to name, coderef to decompose ],
169# CJK Ideographs Extension A
170 [ 0x3400, 0x4DB5, \&han_charname, undef ],
171# CJK Ideographs
172 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
173# Hangul Syllables
ac5ea531 174 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 175# Non-Private Use High Surrogates
176 [ 0xD800, 0xDB7F, undef, undef ],
177# Private Use High Surrogates
178 [ 0xDB80, 0xDBFF, undef, undef ],
179# Low Surrogates
180 [ 0xDC00, 0xDFFF, undef, undef ],
181# The Private Use Area
182 [ 0xE000, 0xF8FF, undef, undef ],
183# CJK Ideographs Extension B
184 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
185# Plane 15 Private Use Area
186 [ 0xF0000, 0xFFFFD, undef, undef ],
187# Plane 16 Private Use Area
188 [ 0x100000, 0x10FFFD, undef, undef ],
189);
190
561c79ed 191sub charinfo {
10a6ecd2 192 my $arg = shift;
193 my $code = _getcode($arg);
194 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
195 unless defined $code;
e63dbbf9 196 my $hexk = sprintf("%06X", $code);
a6fa416b 197 my($rcode,$rname,$rdec);
198 foreach my $range (@CharinfoRanges){
74f8133e 199 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 200 $rcode = $hexk;
e63dbbf9 201 $rcode =~ s/^0+//;
202 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 203 $rname = $range->[2] ? $range->[2]->($code) : '';
204 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 205 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 206 last;
207 }
208 }
551b6b6f 209 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 210 if (defined $UNICODEFH) {
e63dbbf9 211 use Search::Dict 1.02;
212 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 213 my $line = <$UNICODEFH>;
561c79ed 214 chomp $line;
215 my %prop;
216 @prop{qw(
217 code name category
218 combining bidi decomposition
219 decimal digit numeric
220 mirrored unicode10 comment
221 upper lower title
222 )} = split(/;/, $line, -1);
e63dbbf9 223 $hexk =~ s/^0+//;
224 $hexk = sprintf("%04X", hex($hexk));
561c79ed 225 if ($prop{code} eq $hexk) {
a196fbfd 226 $prop{block} = charblock($code);
227 $prop{script} = charscript($code);
a6fa416b 228 if(defined $rname){
229 $prop{code} = $rcode;
230 $prop{name} = $rname;
231 $prop{decomposition} = $rdec;
232 }
b08cd201 233 return \%prop;
561c79ed 234 }
235 }
236 }
237 return;
238}
239
e882dd67 240sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
241 my ($table, $lo, $hi, $code) = @_;
242
243 return if $lo > $hi;
244
245 my $mid = int(($lo+$hi) / 2);
246
247 if ($table->[$mid]->[0] < $code) {
10a6ecd2 248 if ($table->[$mid]->[1] >= $code) {
e882dd67 249 return $table->[$mid]->[2];
250 } else {
251 _search($table, $mid + 1, $hi, $code);
252 }
253 } elsif ($table->[$mid]->[0] > $code) {
254 _search($table, $lo, $mid - 1, $code);
255 } else {
256 return $table->[$mid]->[2];
257 }
258}
259
10a6ecd2 260sub charinrange {
261 my ($range, $arg) = @_;
262 my $code = _getcode($arg);
263 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
264 unless defined $code;
265 _search($range, 0, $#$range, $code);
266}
267
354a27bf 268=head2 charblock
561c79ed 269
55d7b906 270 use Unicode::UCD 'charblock';
561c79ed 271
272 my $charblock = charblock(0x41);
10a6ecd2 273 my $charblock = charblock(1234);
274 my $charblock = charblock("0x263a");
275 my $charblock = charblock("U+263a");
276
78bf21c2 277 my $range = charblock('Armenian');
10a6ecd2 278
78bf21c2 279With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 280belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 281positions within all blocks are defined.
10a6ecd2 282
78bf21c2 283See also L</Blocks versus Scripts>.
284
eb0cc9e3 285If supplied with an argument that can't be a code point, charblock() tries
286to do the opposite and interpret the argument as a character block. The
287return value is a I<range>: an anonymous list of lists that contain
288I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
289code point is in a range using the L</charinrange> function. If the
290argument is not a known charater block, C<undef> is returned.
561c79ed 291
561c79ed 292=cut
293
294my @BLOCKS;
10a6ecd2 295my %BLOCKS;
561c79ed 296
10a6ecd2 297sub _charblocks {
561c79ed 298 unless (@BLOCKS) {
10a6ecd2 299 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 300 local $_;
10a6ecd2 301 while (<$BLOCKSFH>) {
2796c109 302 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 303 my ($lo, $hi) = (hex($1), hex($2));
304 my $subrange = [ $lo, $hi, $3 ];
305 push @BLOCKS, $subrange;
306 push @{$BLOCKS{$3}}, $subrange;
561c79ed 307 }
308 }
10a6ecd2 309 close($BLOCKSFH);
561c79ed 310 }
311 }
10a6ecd2 312}
313
314sub charblock {
315 my $arg = shift;
316
317 _charblocks() unless @BLOCKS;
318
319 my $code = _getcode($arg);
561c79ed 320
10a6ecd2 321 if (defined $code) {
322 _search(\@BLOCKS, 0, $#BLOCKS, $code);
323 } else {
324 if (exists $BLOCKS{$arg}) {
741297c1 325 return dclone $BLOCKS{$arg};
10a6ecd2 326 } else {
327 return;
328 }
329 }
e882dd67 330}
331
332=head2 charscript
333
55d7b906 334 use Unicode::UCD 'charscript';
e882dd67 335
336 my $charscript = charscript(0x41);
10a6ecd2 337 my $charscript = charscript(1234);
338 my $charscript = charscript("U+263a");
e882dd67 339
78bf21c2 340 my $range = charscript('Thai');
10a6ecd2 341
78bf21c2 342With a B<code point argument> charscript() returns the I<script> the
b08cd201 343character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 344
78bf21c2 345See also L</Blocks versus Scripts>.
346
eb0cc9e3 347If supplied with an argument that can't be a code point, charscript() tries
348to do the opposite and interpret the argument as a character script. The
349return value is a I<range>: an anonymous list of lists that contain
350I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
351code point is in a range using the L</charinrange> function. If the
352argument is not a known charater script, C<undef> is returned.
e882dd67 353
e882dd67 354=cut
355
356my @SCRIPTS;
10a6ecd2 357my %SCRIPTS;
e882dd67 358
10a6ecd2 359sub _charscripts {
e882dd67 360 unless (@SCRIPTS) {
10a6ecd2 361 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 362 local $_;
10a6ecd2 363 while (<$SCRIPTSFH>) {
e882dd67 364 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 365 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
366 my $script = lc($3);
367 $script =~ s/\b(\w)/uc($1)/ge;
368 my $subrange = [ $lo, $hi, $script ];
369 push @SCRIPTS, $subrange;
370 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 371 }
372 }
10a6ecd2 373 close($SCRIPTSFH);
e882dd67 374 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
375 }
376 }
10a6ecd2 377}
378
379sub charscript {
380 my $arg = shift;
381
382 _charscripts() unless @SCRIPTS;
e882dd67 383
10a6ecd2 384 my $code = _getcode($arg);
385
386 if (defined $code) {
387 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
388 } else {
389 if (exists $SCRIPTS{$arg}) {
741297c1 390 return dclone $SCRIPTS{$arg};
10a6ecd2 391 } else {
392 return;
393 }
394 }
395}
396
397=head2 charblocks
398
55d7b906 399 use Unicode::UCD 'charblocks';
10a6ecd2 400
b08cd201 401 my $charblocks = charblocks();
10a6ecd2 402
b08cd201 403charblocks() returns a reference to a hash with the known block names
404as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 405
78bf21c2 406See also L</Blocks versus Scripts>.
407
10a6ecd2 408=cut
409
410sub charblocks {
b08cd201 411 _charblocks() unless %BLOCKS;
741297c1 412 return dclone \%BLOCKS;
10a6ecd2 413}
414
415=head2 charscripts
416
55d7b906 417 use Unicode::UCD 'charscripts';
10a6ecd2 418
419 my %charscripts = charscripts();
420
421charscripts() returns a hash with the known script names as the keys,
422and the code point ranges (see L</charscript>) as the values.
423
78bf21c2 424See also L</Blocks versus Scripts>.
425
10a6ecd2 426=cut
427
428sub charscripts {
b08cd201 429 _charscripts() unless %SCRIPTS;
741297c1 430 return dclone \%SCRIPTS;
561c79ed 431}
432
10a6ecd2 433=head2 Blocks versus Scripts
ad9cab37 434
10a6ecd2 435The difference between a block and a script is that scripts are closer
436to the linguistic notion of a set of characters required to present
437languages, while block is more of an artifact of the Unicode character
eb0cc9e3 438numbering and separation into blocks of (mostly) 256 characters.
3aa957f9 439
440For example the Latin B<script> is spread over several B<blocks>, such
441as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
442C<Latin Extended-B>. On the other hand, the Latin script does not
443contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 444the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 445or the punctuation.
ad9cab37 446
3aa957f9 447For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 448
449For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
450
3aa957f9 451=head2 Matching Scripts and Blocks
452
eb0cc9e3 453Scripts are matched with the regular-expression construct
454C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
455while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
456any of the 256 code points in the Tibetan block).
10a6ecd2 457
b08cd201 458=head2 Code Point Arguments
459
92e830a9 460A I<code point argument> is either a decimal or a hexadecimal scalar
461designating a Unicode character, or C<U+> followed by hexadecimals
dc0a4417 462designating a Unicode character. In other words, if you want a code
463point to be interpreted as a hexadecimal number, you must prefix it
43adb1d9 464with either C<0x> or C<U+>, because a string like e.g. C<123> will
dc0a4417 465be interpreted as a decimal code point. Also note that Unicode is
466B<not> limited to 16 bits (the number of Unicode characters is
467open-ended, in theory unlimited): you may have more than 4 hexdigits.
b08cd201 468
10a6ecd2 469=head2 charinrange
470
471In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
472can also test whether a code point is in the I<range> as returned by
473L</charblock> and L</charscript> or as the values of the hash returned
e618509d 474by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 475
55d7b906 476 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 477
478 $range = charscript('Hiragana');
e145285f 479 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 480
481=cut
482
b08cd201 483=head2 compexcl
484
55d7b906 485 use Unicode::UCD 'compexcl';
b08cd201 486
487 my $compexcl = compexcl("09dc");
488
489The compexcl() returns the composition exclusion (that is, if the
9046a8ae 490character should not be produced during a precomposition) of the
491character specified by a B<code point argument>.
b08cd201 492
493If there is a composition exclusion for the character, true is
494returned. Otherwise, false is returned.
495
496=cut
497
498my %COMPEXCL;
499
500sub _compexcl {
501 unless (%COMPEXCL) {
551b6b6f 502 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 503 local $_;
b08cd201 504 while (<$COMPEXCLFH>) {
822ebcc8 505 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201 506 my $code = hex($1);
507 $COMPEXCL{$code} = undef;
508 }
509 }
510 close($COMPEXCLFH);
511 }
512 }
513}
514
515sub compexcl {
516 my $arg = shift;
517 my $code = _getcode($arg);
74f8133e 518 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
519 unless defined $code;
b08cd201 520
521 _compexcl() unless %COMPEXCL;
522
523 return exists $COMPEXCL{$code};
524}
525
526=head2 casefold
527
55d7b906 528 use Unicode::UCD 'casefold';
b08cd201 529
82c0b05b 530 my $casefold = casefold("00DF");
b08cd201 531
532The casefold() returns the locale-independent case folding of the
533character specified by a B<code point argument>.
534
535If there is a case folding for that character, a reference to a hash
536with the following fields is returned:
537
538 key
539
540 code code point with at least four hexdigits
541 status "C", "F", "S", or "I"
542 mapping one or more codes separated by spaces
543
544The meaning of the I<status> is as follows:
545
546 C common case folding, common mappings shared
547 by both simple and full mappings
548 F full case folding, mappings that cause strings
549 to grow in length. Multiple characters are separated
550 by spaces
551 S simple case folding, mappings to single characters
552 where different from F
553 I special case for dotted uppercase I and
554 dotless lowercase i
555 - If this mapping is included, the result is
556 case-insensitive, but dotless and dotted I's
557 are not distinguished
558 - If this mapping is excluded, the result is not
559 fully case-insensitive, but dotless and dotted
560 I's are distinguished
561
562If there is no case folding for that character, C<undef> is returned.
563
564For more information about case mappings see
565http://www.unicode.org/unicode/reports/tr21/
566
567=cut
568
569my %CASEFOLD;
570
571sub _casefold {
572 unless (%CASEFOLD) {
551b6b6f 573 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 574 local $_;
b08cd201 575 while (<$CASEFOLDFH>) {
576 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
577 my $code = hex($1);
578 $CASEFOLD{$code} = { code => $1,
579 status => $2,
580 mapping => $3 };
581 }
582 }
583 close($CASEFOLDFH);
584 }
585 }
586}
587
588sub casefold {
589 my $arg = shift;
590 my $code = _getcode($arg);
74f8133e 591 croak __PACKAGE__, "::casefold: unknown code '$arg'"
592 unless defined $code;
b08cd201 593
594 _casefold() unless %CASEFOLD;
595
596 return $CASEFOLD{$code};
597}
598
599=head2 casespec
600
55d7b906 601 use Unicode::UCD 'casespec';
b08cd201 602
82c0b05b 603 my $casespec = casespec("FB00");
b08cd201 604
605The casespec() returns the potentially locale-dependent case mapping
606of the character specified by a B<code point argument>. The mapping
607may change the length of the string (which the basic Unicode case
608mappings as returned by charinfo() never do).
609
610If there is a case folding for that character, a reference to a hash
611with the following fields is returned:
612
613 key
614
615 code code point with at least four hexdigits
616 lower lowercase
617 title titlecase
618 upper uppercase
619 condition condition list (may be undef)
620
621The C<condition> is optional. Where present, it consists of one or
622more I<locales> or I<contexts>, separated by spaces (other than as
623used to separate elements, spaces are to be ignored). A condition
624list overrides the normal behavior if all of the listed conditions are
625true. Case distinctions in the condition list are not significant.
82c0b05b 626Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 627
f499c386 628Note that when there are multiple case folding definitions for a
629single code point because of different locales, the value returned by
630casespec() is a hash reference which has the locales as the keys and
631hash references as described above as the values.
632
b08cd201 633A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 634followed by a "_" and a 2-letter ISO language code (possibly followed
635by a "_" and a variant code). You can find the lists of those codes,
636see L<Locale::Country> and L<Locale::Language>.
b08cd201 637
638A I<context> is one of the following choices:
639
640 FINAL The letter is not followed by a letter of
641 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
642 MODERN The mapping is only used for modern text
e618509d 643 AFTER_i The last base character was "i" (U+0069)
b08cd201 644
645For more information about case mappings see
646http://www.unicode.org/unicode/reports/tr21/
647
648=cut
649
650my %CASESPEC;
651
652sub _casespec {
653 unless (%CASESPEC) {
551b6b6f 654 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 655 local $_;
b08cd201 656 while (<$CASESPECFH>) {
657 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 658 my ($hexcode, $lower, $title, $upper, $condition) =
659 ($1, $2, $3, $4, $5);
660 my $code = hex($hexcode);
661 if (exists $CASESPEC{$code}) {
662 if (exists $CASESPEC{$code}->{code}) {
663 my ($oldlower,
664 $oldtitle,
665 $oldupper,
666 $oldcondition) =
667 @{$CASESPEC{$code}}{qw(lower
668 title
669 upper
670 condition)};
822ebcc8 671 if (defined $oldcondition) {
672 my ($oldlocale) =
f499c386 673 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386 674 delete $CASESPEC{$code};
675 $CASESPEC{$code}->{$oldlocale} =
676 { code => $hexcode,
677 lower => $oldlower,
678 title => $oldtitle,
679 upper => $oldupper,
680 condition => $oldcondition };
f499c386 681 }
682 }
683 my ($locale) =
684 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
685 $CASESPEC{$code}->{$locale} =
686 { code => $hexcode,
687 lower => $lower,
688 title => $title,
689 upper => $upper,
690 condition => $condition };
691 } else {
692 $CASESPEC{$code} =
693 { code => $hexcode,
694 lower => $lower,
695 title => $title,
696 upper => $upper,
697 condition => $condition };
698 }
b08cd201 699 }
700 }
701 close($CASESPECFH);
702 }
703 }
704}
705
706sub casespec {
707 my $arg = shift;
708 my $code = _getcode($arg);
74f8133e 709 croak __PACKAGE__, "::casespec: unknown code '$arg'"
710 unless defined $code;
b08cd201 711
712 _casespec() unless %CASESPEC;
713
741297c1 714 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201 715}
716
55d7b906 717=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 718
55d7b906 719Unicode::UCD::UnicodeVersion() returns the version of the Unicode
720Character Database, in other words, the version of the Unicode
78bf21c2 721standard the database implements. The version is a string
722of numbers delimited by dots (C<'.'>).
10a6ecd2 723
724=cut
725
726my $UNICODEVERSION;
727
728sub UnicodeVersion {
729 unless (defined $UNICODEVERSION) {
730 openunicode(\$VERSIONFH, "version");
731 chomp($UNICODEVERSION = <$VERSIONFH>);
732 close($VERSIONFH);
733 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
734 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
735 }
736 return $UNICODEVERSION;
737}
3aa957f9 738
739=head2 Implementation Note
32c16050 740
ad9cab37 741The first use of charinfo() opens a read-only filehandle to the Unicode
742Character Database (the database is included in the Perl distribution).
78bf21c2 743The filehandle is then kept open for further queries. In other words,
744if you are wondering where one of your filehandles went, that's where.
32c16050 745
8b731da2 746=head1 BUGS
747
748Does not yet support EBCDIC platforms.
749
561c79ed 750=head1 AUTHOR
751
752Jarkko Hietaniemi
753
754=cut
755
7561;