More PerlIO documentation.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
2af1ab88 6our $VERSION = '0.21';
561c79ed 7
741297c1 8use Storable qw(dclone);
9
561c79ed 10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2 14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
18 compexcl
19 casefold casespec);
561c79ed 20
21use Carp;
22
23=head1 NAME
24
55d7b906 25Unicode::UCD - Unicode character database
561c79ed 26
27=head1 SYNOPSIS
28
55d7b906 29 use Unicode::UCD 'charinfo';
b08cd201 30 my $charinfo = charinfo($codepoint);
561c79ed 31
55d7b906 32 use Unicode::UCD 'charblock';
e882dd67 33 my $charblock = charblock($codepoint);
34
55d7b906 35 use Unicode::UCD 'charscript';
65044554 36 my $charscript = charscript($codepoint);
561c79ed 37
55d7b906 38 use Unicode::UCD 'charblocks';
e145285f 39 my $charblocks = charblocks();
40
55d7b906 41 use Unicode::UCD 'charscripts';
e145285f 42 my %charscripts = charscripts();
43
55d7b906 44 use Unicode::UCD qw(charscript charinrange);
e145285f 45 my $range = charscript($script);
46 print "looks like $script\n" if charinrange($range, $codepoint);
47
55d7b906 48 use Unicode::UCD 'compexcl';
e145285f 49 my $compexcl = compexcl($codepoint);
50
55d7b906 51 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 52
561c79ed 53=head1 DESCRIPTION
54
8b731da2 55The Unicode::UCD module offers a simple interface to the Unicode
56Character Database.
561c79ed 57
58=cut
59
10a6ecd2 60my $UNICODEFH;
61my $BLOCKSFH;
62my $SCRIPTSFH;
63my $VERSIONFH;
b08cd201 64my $COMPEXCLFH;
65my $CASEFOLDFH;
66my $CASESPECFH;
561c79ed 67
68sub openunicode {
69 my ($rfh, @path) = @_;
70 my $f;
71 unless (defined $$rfh) {
72 for my $d (@INC) {
73 use File::Spec;
55d7b906 74 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 75 last if open($$rfh, $f);
e882dd67 76 undef $f;
561c79ed 77 }
e882dd67 78 croak __PACKAGE__, ": failed to find ",
79 File::Spec->catfile(@path), " in @INC"
80 unless defined $f;
561c79ed 81 }
82 return $f;
83}
84
85=head2 charinfo
86
55d7b906 87 use Unicode::UCD 'charinfo';
561c79ed 88
b08cd201 89 my $charinfo = charinfo(0x41);
561c79ed 90
b08cd201 91charinfo() returns a reference to a hash that has the following fields
92as defined by the Unicode standard:
561c79ed 93
94 key
95
96 code code point with at least four hexdigits
97 name name of the character IN UPPER CASE
98 category general category of the character
99 combining classes used in the Canonical Ordering Algorithm
100 bidi bidirectional category
101 decomposition character decomposition mapping
102 decimal if decimal digit this is the integer numeric value
103 digit if digit this is the numeric value
104 numeric if numeric is the integer or rational numeric value
105 mirrored if mirrored in bidirectional text
106 unicode10 Unicode 1.0 name if existed and different
107 comment ISO 10646 comment field
108 upper uppercase equivalent mapping
109 lower lowercase equivalent mapping
110 title titlecase equivalent mapping
e882dd67 111
561c79ed 112 block block the character belongs to (used in \p{In...})
eb0cc9e3 113 script script the character belongs to
561c79ed 114
b08cd201 115If no match is found, a reference to an empty hash is returned.
561c79ed 116
d1be9408 117The C<block> property is the same as returned by charinfo(). It is
32c16050 118not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 119Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
120(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 121
122Note that you cannot do (de)composition and casing based solely on the
123above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 124you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 125
126=cut
127
10a6ecd2 128sub _getcode {
129 my $arg = shift;
130
dc0a4417 131 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 132 return $arg;
dc0a4417 133 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2 134 return hex($1);
135 }
136
137 return;
138}
139
ac5ea531 140# Lingua::KO::Hangul::Util not part of the standard distribution
141# but it will be used if available.
142
143eval { require Lingua::KO::Hangul::Util };
144my $hasHangulUtil = ! $@;
145if ($hasHangulUtil) {
146 Lingua::KO::Hangul::Util->import();
147}
9087a70b 148
149sub hangul_decomp { # internal: called from charinfo
ac5ea531 150 if ($hasHangulUtil) {
151 my @tmp = decomposeHangul(shift);
152 return sprintf("%04X %04X", @tmp) if @tmp == 2;
153 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
154 }
155 return;
156}
157
158sub hangul_charname { # internal: called from charinfo
159 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b 160}
161
9087a70b 162sub han_charname { # internal: called from charinfo
163 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b 164}
165
166my @CharinfoRanges = (
167# block name
168# [ first, last, coderef to name, coderef to decompose ],
169# CJK Ideographs Extension A
170 [ 0x3400, 0x4DB5, \&han_charname, undef ],
171# CJK Ideographs
172 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
173# Hangul Syllables
ac5ea531 174 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b 175# Non-Private Use High Surrogates
176 [ 0xD800, 0xDB7F, undef, undef ],
177# Private Use High Surrogates
178 [ 0xDB80, 0xDBFF, undef, undef ],
179# Low Surrogates
180 [ 0xDC00, 0xDFFF, undef, undef ],
181# The Private Use Area
182 [ 0xE000, 0xF8FF, undef, undef ],
183# CJK Ideographs Extension B
184 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
185# Plane 15 Private Use Area
186 [ 0xF0000, 0xFFFFD, undef, undef ],
187# Plane 16 Private Use Area
188 [ 0x100000, 0x10FFFD, undef, undef ],
189);
190
561c79ed 191sub charinfo {
10a6ecd2 192 my $arg = shift;
193 my $code = _getcode($arg);
194 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
195 unless defined $code;
e63dbbf9 196 my $hexk = sprintf("%06X", $code);
a6fa416b 197 my($rcode,$rname,$rdec);
198 foreach my $range (@CharinfoRanges){
74f8133e 199 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 200 $rcode = $hexk;
e63dbbf9 201 $rcode =~ s/^0+//;
202 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 203 $rname = $range->[2] ? $range->[2]->($code) : '';
204 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 205 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 206 last;
207 }
208 }
551b6b6f 209 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 210 if (defined $UNICODEFH) {
e63dbbf9 211 use Search::Dict 1.02;
212 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 213 my $line = <$UNICODEFH>;
c5a29f40 214 return unless defined $line;
561c79ed 215 chomp $line;
216 my %prop;
217 @prop{qw(
218 code name category
219 combining bidi decomposition
220 decimal digit numeric
221 mirrored unicode10 comment
222 upper lower title
223 )} = split(/;/, $line, -1);
e63dbbf9 224 $hexk =~ s/^0+//;
225 $hexk = sprintf("%04X", hex($hexk));
561c79ed 226 if ($prop{code} eq $hexk) {
a196fbfd 227 $prop{block} = charblock($code);
228 $prop{script} = charscript($code);
a6fa416b 229 if(defined $rname){
230 $prop{code} = $rcode;
231 $prop{name} = $rname;
232 $prop{decomposition} = $rdec;
233 }
b08cd201 234 return \%prop;
561c79ed 235 }
236 }
237 }
238 return;
239}
240
e882dd67 241sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
242 my ($table, $lo, $hi, $code) = @_;
243
244 return if $lo > $hi;
245
246 my $mid = int(($lo+$hi) / 2);
247
248 if ($table->[$mid]->[0] < $code) {
10a6ecd2 249 if ($table->[$mid]->[1] >= $code) {
e882dd67 250 return $table->[$mid]->[2];
251 } else {
252 _search($table, $mid + 1, $hi, $code);
253 }
254 } elsif ($table->[$mid]->[0] > $code) {
255 _search($table, $lo, $mid - 1, $code);
256 } else {
257 return $table->[$mid]->[2];
258 }
259}
260
10a6ecd2 261sub charinrange {
262 my ($range, $arg) = @_;
263 my $code = _getcode($arg);
264 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
265 unless defined $code;
266 _search($range, 0, $#$range, $code);
267}
268
354a27bf 269=head2 charblock
561c79ed 270
55d7b906 271 use Unicode::UCD 'charblock';
561c79ed 272
273 my $charblock = charblock(0x41);
10a6ecd2 274 my $charblock = charblock(1234);
275 my $charblock = charblock("0x263a");
276 my $charblock = charblock("U+263a");
277
78bf21c2 278 my $range = charblock('Armenian');
10a6ecd2 279
78bf21c2 280With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 281belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 282positions within all blocks are defined.
10a6ecd2 283
78bf21c2 284See also L</Blocks versus Scripts>.
285
eb0cc9e3 286If supplied with an argument that can't be a code point, charblock() tries
287to do the opposite and interpret the argument as a character block. The
288return value is a I<range>: an anonymous list of lists that contain
289I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
290code point is in a range using the L</charinrange> function. If the
291argument is not a known charater block, C<undef> is returned.
561c79ed 292
561c79ed 293=cut
294
295my @BLOCKS;
10a6ecd2 296my %BLOCKS;
561c79ed 297
10a6ecd2 298sub _charblocks {
561c79ed 299 unless (@BLOCKS) {
10a6ecd2 300 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 301 local $_;
10a6ecd2 302 while (<$BLOCKSFH>) {
2796c109 303 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 304 my ($lo, $hi) = (hex($1), hex($2));
305 my $subrange = [ $lo, $hi, $3 ];
306 push @BLOCKS, $subrange;
307 push @{$BLOCKS{$3}}, $subrange;
561c79ed 308 }
309 }
10a6ecd2 310 close($BLOCKSFH);
561c79ed 311 }
312 }
10a6ecd2 313}
314
315sub charblock {
316 my $arg = shift;
317
318 _charblocks() unless @BLOCKS;
319
320 my $code = _getcode($arg);
561c79ed 321
10a6ecd2 322 if (defined $code) {
323 _search(\@BLOCKS, 0, $#BLOCKS, $code);
324 } else {
325 if (exists $BLOCKS{$arg}) {
741297c1 326 return dclone $BLOCKS{$arg};
10a6ecd2 327 } else {
328 return;
329 }
330 }
e882dd67 331}
332
333=head2 charscript
334
55d7b906 335 use Unicode::UCD 'charscript';
e882dd67 336
337 my $charscript = charscript(0x41);
10a6ecd2 338 my $charscript = charscript(1234);
339 my $charscript = charscript("U+263a");
e882dd67 340
78bf21c2 341 my $range = charscript('Thai');
10a6ecd2 342
78bf21c2 343With a B<code point argument> charscript() returns the I<script> the
b08cd201 344character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 345
78bf21c2 346See also L</Blocks versus Scripts>.
347
eb0cc9e3 348If supplied with an argument that can't be a code point, charscript() tries
349to do the opposite and interpret the argument as a character script. The
350return value is a I<range>: an anonymous list of lists that contain
351I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
352code point is in a range using the L</charinrange> function. If the
353argument is not a known charater script, C<undef> is returned.
e882dd67 354
e882dd67 355=cut
356
357my @SCRIPTS;
10a6ecd2 358my %SCRIPTS;
e882dd67 359
10a6ecd2 360sub _charscripts {
e882dd67 361 unless (@SCRIPTS) {
10a6ecd2 362 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 363 local $_;
10a6ecd2 364 while (<$SCRIPTSFH>) {
e882dd67 365 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 366 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
367 my $script = lc($3);
368 $script =~ s/\b(\w)/uc($1)/ge;
369 my $subrange = [ $lo, $hi, $script ];
370 push @SCRIPTS, $subrange;
371 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 372 }
373 }
10a6ecd2 374 close($SCRIPTSFH);
e882dd67 375 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
376 }
377 }
10a6ecd2 378}
379
380sub charscript {
381 my $arg = shift;
382
383 _charscripts() unless @SCRIPTS;
e882dd67 384
10a6ecd2 385 my $code = _getcode($arg);
386
387 if (defined $code) {
388 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
389 } else {
390 if (exists $SCRIPTS{$arg}) {
741297c1 391 return dclone $SCRIPTS{$arg};
10a6ecd2 392 } else {
393 return;
394 }
395 }
396}
397
398=head2 charblocks
399
55d7b906 400 use Unicode::UCD 'charblocks';
10a6ecd2 401
b08cd201 402 my $charblocks = charblocks();
10a6ecd2 403
b08cd201 404charblocks() returns a reference to a hash with the known block names
405as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 406
78bf21c2 407See also L</Blocks versus Scripts>.
408
10a6ecd2 409=cut
410
411sub charblocks {
b08cd201 412 _charblocks() unless %BLOCKS;
741297c1 413 return dclone \%BLOCKS;
10a6ecd2 414}
415
416=head2 charscripts
417
55d7b906 418 use Unicode::UCD 'charscripts';
10a6ecd2 419
420 my %charscripts = charscripts();
421
422charscripts() returns a hash with the known script names as the keys,
423and the code point ranges (see L</charscript>) as the values.
424
78bf21c2 425See also L</Blocks versus Scripts>.
426
10a6ecd2 427=cut
428
429sub charscripts {
b08cd201 430 _charscripts() unless %SCRIPTS;
741297c1 431 return dclone \%SCRIPTS;
561c79ed 432}
433
10a6ecd2 434=head2 Blocks versus Scripts
ad9cab37 435
10a6ecd2 436The difference between a block and a script is that scripts are closer
437to the linguistic notion of a set of characters required to present
438languages, while block is more of an artifact of the Unicode character
eb0cc9e3 439numbering and separation into blocks of (mostly) 256 characters.
3aa957f9 440
441For example the Latin B<script> is spread over several B<blocks>, such
442as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
443C<Latin Extended-B>. On the other hand, the Latin script does not
444contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 445the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 446or the punctuation.
ad9cab37 447
3aa957f9 448For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 449
450For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
451
3aa957f9 452=head2 Matching Scripts and Blocks
453
eb0cc9e3 454Scripts are matched with the regular-expression construct
455C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
456while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
457any of the 256 code points in the Tibetan block).
10a6ecd2 458
b08cd201 459=head2 Code Point Arguments
460
92e830a9 461A I<code point argument> is either a decimal or a hexadecimal scalar
462designating a Unicode character, or C<U+> followed by hexadecimals
dc0a4417 463designating a Unicode character. In other words, if you want a code
464point to be interpreted as a hexadecimal number, you must prefix it
43adb1d9 465with either C<0x> or C<U+>, because a string like e.g. C<123> will
dc0a4417 466be interpreted as a decimal code point. Also note that Unicode is
467B<not> limited to 16 bits (the number of Unicode characters is
468open-ended, in theory unlimited): you may have more than 4 hexdigits.
b08cd201 469
10a6ecd2 470=head2 charinrange
471
472In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
473can also test whether a code point is in the I<range> as returned by
474L</charblock> and L</charscript> or as the values of the hash returned
e618509d 475by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 476
55d7b906 477 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 478
479 $range = charscript('Hiragana');
e145285f 480 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 481
482=cut
483
b08cd201 484=head2 compexcl
485
55d7b906 486 use Unicode::UCD 'compexcl';
b08cd201 487
488 my $compexcl = compexcl("09dc");
489
490The compexcl() returns the composition exclusion (that is, if the
9046a8ae 491character should not be produced during a precomposition) of the
492character specified by a B<code point argument>.
b08cd201 493
494If there is a composition exclusion for the character, true is
495returned. Otherwise, false is returned.
496
497=cut
498
499my %COMPEXCL;
500
501sub _compexcl {
502 unless (%COMPEXCL) {
551b6b6f 503 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 504 local $_;
b08cd201 505 while (<$COMPEXCLFH>) {
822ebcc8 506 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201 507 my $code = hex($1);
508 $COMPEXCL{$code} = undef;
509 }
510 }
511 close($COMPEXCLFH);
512 }
513 }
514}
515
516sub compexcl {
517 my $arg = shift;
518 my $code = _getcode($arg);
74f8133e 519 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
520 unless defined $code;
b08cd201 521
522 _compexcl() unless %COMPEXCL;
523
524 return exists $COMPEXCL{$code};
525}
526
527=head2 casefold
528
55d7b906 529 use Unicode::UCD 'casefold';
b08cd201 530
82c0b05b 531 my $casefold = casefold("00DF");
b08cd201 532
533The casefold() returns the locale-independent case folding of the
534character specified by a B<code point argument>.
535
536If there is a case folding for that character, a reference to a hash
537with the following fields is returned:
538
539 key
540
541 code code point with at least four hexdigits
542 status "C", "F", "S", or "I"
543 mapping one or more codes separated by spaces
544
545The meaning of the I<status> is as follows:
546
547 C common case folding, common mappings shared
548 by both simple and full mappings
549 F full case folding, mappings that cause strings
550 to grow in length. Multiple characters are separated
551 by spaces
552 S simple case folding, mappings to single characters
553 where different from F
554 I special case for dotted uppercase I and
555 dotless lowercase i
556 - If this mapping is included, the result is
557 case-insensitive, but dotless and dotted I's
558 are not distinguished
559 - If this mapping is excluded, the result is not
560 fully case-insensitive, but dotless and dotted
561 I's are distinguished
562
563If there is no case folding for that character, C<undef> is returned.
564
565For more information about case mappings see
566http://www.unicode.org/unicode/reports/tr21/
567
568=cut
569
570my %CASEFOLD;
571
572sub _casefold {
573 unless (%CASEFOLD) {
551b6b6f 574 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 575 local $_;
b08cd201 576 while (<$CASEFOLDFH>) {
577 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
578 my $code = hex($1);
579 $CASEFOLD{$code} = { code => $1,
580 status => $2,
581 mapping => $3 };
582 }
583 }
584 close($CASEFOLDFH);
585 }
586 }
587}
588
589sub casefold {
590 my $arg = shift;
591 my $code = _getcode($arg);
74f8133e 592 croak __PACKAGE__, "::casefold: unknown code '$arg'"
593 unless defined $code;
b08cd201 594
595 _casefold() unless %CASEFOLD;
596
597 return $CASEFOLD{$code};
598}
599
600=head2 casespec
601
55d7b906 602 use Unicode::UCD 'casespec';
b08cd201 603
82c0b05b 604 my $casespec = casespec("FB00");
b08cd201 605
606The casespec() returns the potentially locale-dependent case mapping
607of the character specified by a B<code point argument>. The mapping
608may change the length of the string (which the basic Unicode case
609mappings as returned by charinfo() never do).
610
611If there is a case folding for that character, a reference to a hash
612with the following fields is returned:
613
614 key
615
616 code code point with at least four hexdigits
617 lower lowercase
618 title titlecase
619 upper uppercase
620 condition condition list (may be undef)
621
622The C<condition> is optional. Where present, it consists of one or
623more I<locales> or I<contexts>, separated by spaces (other than as
624used to separate elements, spaces are to be ignored). A condition
625list overrides the normal behavior if all of the listed conditions are
626true. Case distinctions in the condition list are not significant.
82c0b05b 627Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 628
f499c386 629Note that when there are multiple case folding definitions for a
630single code point because of different locales, the value returned by
631casespec() is a hash reference which has the locales as the keys and
632hash references as described above as the values.
633
b08cd201 634A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 635followed by a "_" and a 2-letter ISO language code (possibly followed
636by a "_" and a variant code). You can find the lists of those codes,
637see L<Locale::Country> and L<Locale::Language>.
b08cd201 638
639A I<context> is one of the following choices:
640
641 FINAL The letter is not followed by a letter of
642 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
643 MODERN The mapping is only used for modern text
e618509d 644 AFTER_i The last base character was "i" (U+0069)
b08cd201 645
646For more information about case mappings see
647http://www.unicode.org/unicode/reports/tr21/
648
649=cut
650
651my %CASESPEC;
652
653sub _casespec {
654 unless (%CASESPEC) {
551b6b6f 655 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 656 local $_;
b08cd201 657 while (<$CASESPECFH>) {
658 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 659 my ($hexcode, $lower, $title, $upper, $condition) =
660 ($1, $2, $3, $4, $5);
661 my $code = hex($hexcode);
662 if (exists $CASESPEC{$code}) {
663 if (exists $CASESPEC{$code}->{code}) {
664 my ($oldlower,
665 $oldtitle,
666 $oldupper,
667 $oldcondition) =
668 @{$CASESPEC{$code}}{qw(lower
669 title
670 upper
671 condition)};
822ebcc8 672 if (defined $oldcondition) {
673 my ($oldlocale) =
f499c386 674 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386 675 delete $CASESPEC{$code};
676 $CASESPEC{$code}->{$oldlocale} =
677 { code => $hexcode,
678 lower => $oldlower,
679 title => $oldtitle,
680 upper => $oldupper,
681 condition => $oldcondition };
f499c386 682 }
683 }
684 my ($locale) =
685 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
686 $CASESPEC{$code}->{$locale} =
687 { code => $hexcode,
688 lower => $lower,
689 title => $title,
690 upper => $upper,
691 condition => $condition };
692 } else {
693 $CASESPEC{$code} =
694 { code => $hexcode,
695 lower => $lower,
696 title => $title,
697 upper => $upper,
698 condition => $condition };
699 }
b08cd201 700 }
701 }
702 close($CASESPECFH);
703 }
704 }
705}
706
707sub casespec {
708 my $arg = shift;
709 my $code = _getcode($arg);
74f8133e 710 croak __PACKAGE__, "::casespec: unknown code '$arg'"
711 unless defined $code;
b08cd201 712
713 _casespec() unless %CASESPEC;
714
741297c1 715 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201 716}
717
55d7b906 718=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 719
55d7b906 720Unicode::UCD::UnicodeVersion() returns the version of the Unicode
721Character Database, in other words, the version of the Unicode
78bf21c2 722standard the database implements. The version is a string
723of numbers delimited by dots (C<'.'>).
10a6ecd2 724
725=cut
726
727my $UNICODEVERSION;
728
729sub UnicodeVersion {
730 unless (defined $UNICODEVERSION) {
731 openunicode(\$VERSIONFH, "version");
732 chomp($UNICODEVERSION = <$VERSIONFH>);
733 close($VERSIONFH);
734 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
735 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
736 }
737 return $UNICODEVERSION;
738}
3aa957f9 739
740=head2 Implementation Note
32c16050 741
ad9cab37 742The first use of charinfo() opens a read-only filehandle to the Unicode
743Character Database (the database is included in the Perl distribution).
78bf21c2 744The filehandle is then kept open for further queries. In other words,
745if you are wondering where one of your filehandles went, that's where.
32c16050 746
8b731da2 747=head1 BUGS
748
749Does not yet support EBCDIC platforms.
750
561c79ed 751=head1 AUTHOR
752
753Jarkko Hietaniemi
754
755=cut
756
7571;