Special casing had become a little bit more complex in Unicode 3.1.1.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed 2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed 7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2 12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201 15 charinrange
16 compexcl
17 casefold casespec);
561c79ed 18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed 24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67 31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f 37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f 40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f 43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f 47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed 51=head1 DESCRIPTION
52
55d7b906 53The Unicode::UCD module offers a simple interface to the Unicode Character
561c79ed 54Database.
55
56=cut
57
10a6ecd2 58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201 62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed 65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67 76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed 79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201 89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed 91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
e882dd67 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
32c16050 115The C<block> property is the same as as returned by charinfo(). It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2 117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050 119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed 123
124=cut
125
10a6ecd2 126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
a6fa416b 138sub han_charname {
139 my $arg = shift;
140 my $code = _getcode($arg);
74f8133e 141 croak __PACKAGE__, "::han_charname: unknown code '$arg'"
a6fa416b 142 unless defined $code;
143 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
144 unless 0x3400 <= $code && $code <= 0x4DB5
145 || 0x4E00 <= $code && $code <= 0x9FA5
146 || 0x20000 <= $code && $code <= 0x2A6D6;
147 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
148}
149
150my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
151 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
152 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
153 );
154
155my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
156 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
157 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
158 "YU", "EU", "YI", "I",
159 );
160
161my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
162 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
163 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
164 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
165 );
166
167my %HangulConst = (
168 SBase => 0xAC00,
169 LBase => 0x1100,
170 VBase => 0x1161,
171 TBase => 0x11A7,
172 LCount => 19, # scalar @JamoL
173 VCount => 21, # scalar @JamoV
174 TCount => 28, # scalar @JamoT
175 NCount => 588, # VCount * TCount
176 SCount => 11172, # LCount * NCount
177 Final => 0xD7A3, # SBase -1 + SCount
178 );
179
180sub hangul_charname {
181 my $arg = shift;
182 my $code = _getcode($arg);
74f8133e 183 croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
a6fa416b 184 unless defined $code;
185 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
186 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
187 my $SIndex = $code - $HangulConst{SBase};
188 my $LIndex = int( $SIndex / $HangulConst{NCount});
189 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
190 my $TIndex = $SIndex % $HangulConst{TCount};
191 return join('',
192 "HANGUL SYLLABLE ",
193 $JamoL[$LIndex],
194 $JamoV[$VIndex],
195 $JamoT[$TIndex],
196 );
197}
198
199sub hangul_decomp {
200 my $arg = shift;
201 my $code = _getcode($arg);
74f8133e 202 croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
a6fa416b 203 unless defined $code;
204 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
205 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
206 my $SIndex = $code - $HangulConst{SBase};
207 my $LIndex = int( $SIndex / $HangulConst{NCount});
208 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
209 my $TIndex = $SIndex % $HangulConst{TCount};
210
211 return join(" ",
212 sprintf("%04X", $HangulConst{LBase} + $LIndex),
213 sprintf("%04X", $HangulConst{VBase} + $VIndex),
214 $TIndex ?
215 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
216 );
217}
218
219my @CharinfoRanges = (
220# block name
221# [ first, last, coderef to name, coderef to decompose ],
222# CJK Ideographs Extension A
223 [ 0x3400, 0x4DB5, \&han_charname, undef ],
224# CJK Ideographs
225 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
226# Hangul Syllables
227 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
228# Non-Private Use High Surrogates
229 [ 0xD800, 0xDB7F, undef, undef ],
230# Private Use High Surrogates
231 [ 0xDB80, 0xDBFF, undef, undef ],
232# Low Surrogates
233 [ 0xDC00, 0xDFFF, undef, undef ],
234# The Private Use Area
235 [ 0xE000, 0xF8FF, undef, undef ],
236# CJK Ideographs Extension B
237 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
238# Plane 15 Private Use Area
239 [ 0xF0000, 0xFFFFD, undef, undef ],
240# Plane 16 Private Use Area
241 [ 0x100000, 0x10FFFD, undef, undef ],
242);
243
561c79ed 244sub charinfo {
10a6ecd2 245 my $arg = shift;
246 my $code = _getcode($arg);
247 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
248 unless defined $code;
e63dbbf9 249 my $hexk = sprintf("%06X", $code);
a6fa416b 250 my($rcode,$rname,$rdec);
251 foreach my $range (@CharinfoRanges){
74f8133e 252 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 253 $rcode = $hexk;
e63dbbf9 254 $rcode =~ s/^0+//;
255 $rcode = sprintf("%04X", hex($rcode));
a6fa416b 256 $rname = $range->[2] ? $range->[2]->($code) : '';
257 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 258 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b 259 last;
260 }
261 }
74f8133e 262 openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2 263 if (defined $UNICODEFH) {
e63dbbf9 264 use Search::Dict 1.02;
265 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 266 my $line = <$UNICODEFH>;
561c79ed 267 chomp $line;
268 my %prop;
269 @prop{qw(
270 code name category
271 combining bidi decomposition
272 decimal digit numeric
273 mirrored unicode10 comment
274 upper lower title
275 )} = split(/;/, $line, -1);
e63dbbf9 276 $hexk =~ s/^0+//;
277 $hexk = sprintf("%04X", hex($hexk));
561c79ed 278 if ($prop{code} eq $hexk) {
a196fbfd 279 $prop{block} = charblock($code);
280 $prop{script} = charscript($code);
a6fa416b 281 if(defined $rname){
282 $prop{code} = $rcode;
283 $prop{name} = $rname;
284 $prop{decomposition} = $rdec;
285 }
b08cd201 286 return \%prop;
561c79ed 287 }
288 }
289 }
290 return;
291}
292
e882dd67 293sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
294 my ($table, $lo, $hi, $code) = @_;
295
296 return if $lo > $hi;
297
298 my $mid = int(($lo+$hi) / 2);
299
300 if ($table->[$mid]->[0] < $code) {
10a6ecd2 301 if ($table->[$mid]->[1] >= $code) {
e882dd67 302 return $table->[$mid]->[2];
303 } else {
304 _search($table, $mid + 1, $hi, $code);
305 }
306 } elsif ($table->[$mid]->[0] > $code) {
307 _search($table, $lo, $mid - 1, $code);
308 } else {
309 return $table->[$mid]->[2];
310 }
311}
312
10a6ecd2 313sub charinrange {
314 my ($range, $arg) = @_;
315 my $code = _getcode($arg);
316 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
317 unless defined $code;
318 _search($range, 0, $#$range, $code);
319}
320
354a27bf 321=head2 charblock
561c79ed 322
55d7b906 323 use Unicode::UCD 'charblock';
561c79ed 324
325 my $charblock = charblock(0x41);
10a6ecd2 326 my $charblock = charblock(1234);
327 my $charblock = charblock("0x263a");
328 my $charblock = charblock("U+263a");
329
78bf21c2 330 my $range = charblock('Armenian');
10a6ecd2 331
78bf21c2 332With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 333belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 334positions within all blocks are defined.
10a6ecd2 335
78bf21c2 336See also L</Blocks versus Scripts>.
337
10a6ecd2 338If supplied with an argument that can't be a code point, charblock()
339tries to do the opposite and interpret the argument as a character
340block. The return value is a I<range>: an anonymous list that
341contains anonymous lists, which in turn contain I<start-of-range>,
342I<end-of-range> code point pairs. You can test whether a code point
343is in a range using the L</charinrange> function. If the argument is
344not a known charater block, C<undef> is returned.
561c79ed 345
561c79ed 346=cut
347
348my @BLOCKS;
10a6ecd2 349my %BLOCKS;
561c79ed 350
10a6ecd2 351sub _charblocks {
561c79ed 352 unless (@BLOCKS) {
10a6ecd2 353 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
354 while (<$BLOCKSFH>) {
2796c109 355 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2 356 my ($lo, $hi) = (hex($1), hex($2));
357 my $subrange = [ $lo, $hi, $3 ];
358 push @BLOCKS, $subrange;
359 push @{$BLOCKS{$3}}, $subrange;
561c79ed 360 }
361 }
10a6ecd2 362 close($BLOCKSFH);
561c79ed 363 }
364 }
10a6ecd2 365}
366
367sub charblock {
368 my $arg = shift;
369
370 _charblocks() unless @BLOCKS;
371
372 my $code = _getcode($arg);
561c79ed 373
10a6ecd2 374 if (defined $code) {
375 _search(\@BLOCKS, 0, $#BLOCKS, $code);
376 } else {
377 if (exists $BLOCKS{$arg}) {
378 return $BLOCKS{$arg};
379 } else {
380 return;
381 }
382 }
e882dd67 383}
384
385=head2 charscript
386
55d7b906 387 use Unicode::UCD 'charscript';
e882dd67 388
389 my $charscript = charscript(0x41);
10a6ecd2 390 my $charscript = charscript(1234);
391 my $charscript = charscript("U+263a");
e882dd67 392
78bf21c2 393 my $range = charscript('Thai');
10a6ecd2 394
78bf21c2 395With a B<code point argument> charscript() returns the I<script> the
b08cd201 396character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 397
78bf21c2 398See also L</Blocks versus Scripts>.
399
10a6ecd2 400If supplied with an argument that can't be a code point, charscript()
401tries to do the opposite and interpret the argument as a character
402script. The return value is a I<range>: an anonymous list that
403contains anonymous lists, which in turn contain I<start-of-range>,
404I<end-of-range> code point pairs. You can test whether a code point
405is in a range using the L</charinrange> function. If the argument is
406not a known charater script, C<undef> is returned.
e882dd67 407
e882dd67 408=cut
409
410my @SCRIPTS;
10a6ecd2 411my %SCRIPTS;
e882dd67 412
10a6ecd2 413sub _charscripts {
e882dd67 414 unless (@SCRIPTS) {
10a6ecd2 415 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
416 while (<$SCRIPTSFH>) {
e882dd67 417 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2 418 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
419 my $script = lc($3);
420 $script =~ s/\b(\w)/uc($1)/ge;
421 my $subrange = [ $lo, $hi, $script ];
422 push @SCRIPTS, $subrange;
423 push @{$SCRIPTS{$script}}, $subrange;
e882dd67 424 }
425 }
10a6ecd2 426 close($SCRIPTSFH);
e882dd67 427 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
428 }
429 }
10a6ecd2 430}
431
432sub charscript {
433 my $arg = shift;
434
435 _charscripts() unless @SCRIPTS;
e882dd67 436
10a6ecd2 437 my $code = _getcode($arg);
438
439 if (defined $code) {
440 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
441 } else {
442 if (exists $SCRIPTS{$arg}) {
443 return $SCRIPTS{$arg};
444 } else {
445 return;
446 }
447 }
448}
449
450=head2 charblocks
451
55d7b906 452 use Unicode::UCD 'charblocks';
10a6ecd2 453
b08cd201 454 my $charblocks = charblocks();
10a6ecd2 455
b08cd201 456charblocks() returns a reference to a hash with the known block names
457as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 458
78bf21c2 459See also L</Blocks versus Scripts>.
460
10a6ecd2 461=cut
462
463sub charblocks {
b08cd201 464 _charblocks() unless %BLOCKS;
465 return \%BLOCKS;
10a6ecd2 466}
467
468=head2 charscripts
469
55d7b906 470 use Unicode::UCD 'charscripts';
10a6ecd2 471
472 my %charscripts = charscripts();
473
474charscripts() returns a hash with the known script names as the keys,
475and the code point ranges (see L</charscript>) as the values.
476
78bf21c2 477See also L</Blocks versus Scripts>.
478
10a6ecd2 479=cut
480
481sub charscripts {
b08cd201 482 _charscripts() unless %SCRIPTS;
483 return \%SCRIPTS;
561c79ed 484}
485
10a6ecd2 486=head2 Blocks versus Scripts
ad9cab37 487
10a6ecd2 488The difference between a block and a script is that scripts are closer
489to the linguistic notion of a set of characters required to present
490languages, while block is more of an artifact of the Unicode character
491numbering and separation into blocks of 256 characters.
3aa957f9 492
493For example the Latin B<script> is spread over several B<blocks>, such
494as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
495C<Latin Extended-B>. On the other hand, the Latin script does not
496contain all the characters of the C<Basic Latin> block (also known as
497the ASCII): it includes only the letters, not for example the digits
498or the punctuation.
ad9cab37 499
3aa957f9 500For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37 501
502For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
503
3aa957f9 504=head2 Matching Scripts and Blocks
505
506Both scripts and blocks can be matched using the regular expression
507construct C<\p{In...}> and its negation C<\P{In...}>.
508
509The name of the script or the block comes after the C<In>, for example
510C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2 511removed from the names for the C<\p{In...}>, for example
512C<LatinExtendedA> instead of C<Latin Extended-A>.
513
78bf21c2 514There are a few cases where there is both a script and a block by the
515same name, in these cases the block version has C<Block> appended to
516its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
517the block.
10a6ecd2 518
b08cd201 519=head2 Code Point Arguments
520
78bf21c2 521A <code point argument> is either a decimal or a hexadecimal scalar
522designating a Unicode character, or "U+" followed by hexadecimals
523designating a Unicode character. Note that Unicode is B<not> limited
524to 16 bits (the number of Unicode characters is open-ended, in theory
525unlimited): you may have more than 4 hexdigits.
b08cd201 526
10a6ecd2 527=head2 charinrange
528
529In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
530can also test whether a code point is in the I<range> as returned by
531L</charblock> and L</charscript> or as the values of the hash returned
e618509d 532by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 533
55d7b906 534 use Unicode::UCD qw(charscript charinrange);
10a6ecd2 535
536 $range = charscript('Hiragana');
e145285f 537 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2 538
539=cut
540
b08cd201 541=head2 compexcl
542
55d7b906 543 use Unicode::UCD 'compexcl';
b08cd201 544
545 my $compexcl = compexcl("09dc");
546
547The compexcl() returns the composition exclusion (that is, if the
9046a8ae 548character should not be produced during a precomposition) of the
549character specified by a B<code point argument>.
b08cd201 550
551If there is a composition exclusion for the character, true is
552returned. Otherwise, false is returned.
553
554=cut
555
556my %COMPEXCL;
557
558sub _compexcl {
559 unless (%COMPEXCL) {
560 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
561 while (<$COMPEXCLFH>) {
562 if (/^([0-9A-F]+) \# /) {
563 my $code = hex($1);
564 $COMPEXCL{$code} = undef;
565 }
566 }
567 close($COMPEXCLFH);
568 }
569 }
570}
571
572sub compexcl {
573 my $arg = shift;
574 my $code = _getcode($arg);
74f8133e 575 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
576 unless defined $code;
b08cd201 577
578 _compexcl() unless %COMPEXCL;
579
580 return exists $COMPEXCL{$code};
581}
582
583=head2 casefold
584
55d7b906 585 use Unicode::UCD 'casefold';
b08cd201 586
587 my %casefold = casefold("09dc");
588
589The casefold() returns the locale-independent case folding of the
590character specified by a B<code point argument>.
591
592If there is a case folding for that character, a reference to a hash
593with the following fields is returned:
594
595 key
596
597 code code point with at least four hexdigits
598 status "C", "F", "S", or "I"
599 mapping one or more codes separated by spaces
600
601The meaning of the I<status> is as follows:
602
603 C common case folding, common mappings shared
604 by both simple and full mappings
605 F full case folding, mappings that cause strings
606 to grow in length. Multiple characters are separated
607 by spaces
608 S simple case folding, mappings to single characters
609 where different from F
610 I special case for dotted uppercase I and
611 dotless lowercase i
612 - If this mapping is included, the result is
613 case-insensitive, but dotless and dotted I's
614 are not distinguished
615 - If this mapping is excluded, the result is not
616 fully case-insensitive, but dotless and dotted
617 I's are distinguished
618
619If there is no case folding for that character, C<undef> is returned.
620
621For more information about case mappings see
622http://www.unicode.org/unicode/reports/tr21/
623
624=cut
625
626my %CASEFOLD;
627
628sub _casefold {
629 unless (%CASEFOLD) {
630 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
631 while (<$CASEFOLDFH>) {
632 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
633 my $code = hex($1);
634 $CASEFOLD{$code} = { code => $1,
635 status => $2,
636 mapping => $3 };
637 }
638 }
639 close($CASEFOLDFH);
640 }
641 }
642}
643
644sub casefold {
645 my $arg = shift;
646 my $code = _getcode($arg);
74f8133e 647 croak __PACKAGE__, "::casefold: unknown code '$arg'"
648 unless defined $code;
b08cd201 649
650 _casefold() unless %CASEFOLD;
651
652 return $CASEFOLD{$code};
653}
654
655=head2 casespec
656
55d7b906 657 use Unicode::UCD 'casespec';
b08cd201 658
659 my %casespec = casespec("09dc");
660
661The casespec() returns the potentially locale-dependent case mapping
662of the character specified by a B<code point argument>. The mapping
663may change the length of the string (which the basic Unicode case
664mappings as returned by charinfo() never do).
665
666If there is a case folding for that character, a reference to a hash
667with the following fields is returned:
668
669 key
670
671 code code point with at least four hexdigits
672 lower lowercase
673 title titlecase
674 upper uppercase
675 condition condition list (may be undef)
676
677The C<condition> is optional. Where present, it consists of one or
678more I<locales> or I<contexts>, separated by spaces (other than as
679used to separate elements, spaces are to be ignored). A condition
680list overrides the normal behavior if all of the listed conditions are
681true. Case distinctions in the condition list are not significant.
682Conditions preceded by "NON_" represent the negation of the condition
683
f499c386 684Note that when there are multiple case folding definitions for a
685single code point because of different locales, the value returned by
686casespec() is a hash reference which has the locales as the keys and
687hash references as described above as the values.
688
b08cd201 689A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d 690followed by a "_" and a 2-letter ISO language code (possibly followed
691by a "_" and a variant code). You can find the lists of those codes,
692see L<Locale::Country> and L<Locale::Language>.
b08cd201 693
694A I<context> is one of the following choices:
695
696 FINAL The letter is not followed by a letter of
697 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
698 MODERN The mapping is only used for modern text
e618509d 699 AFTER_i The last base character was "i" (U+0069)
b08cd201 700
701For more information about case mappings see
702http://www.unicode.org/unicode/reports/tr21/
703
704=cut
705
706my %CASESPEC;
707
708sub _casespec {
709 unless (%CASESPEC) {
710 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
711 while (<$CASESPECFH>) {
712 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386 713 my ($hexcode, $lower, $title, $upper, $condition) =
714 ($1, $2, $3, $4, $5);
715 my $code = hex($hexcode);
716 if (exists $CASESPEC{$code}) {
717 if (exists $CASESPEC{$code}->{code}) {
718 my ($oldlower,
719 $oldtitle,
720 $oldupper,
721 $oldcondition) =
722 @{$CASESPEC{$code}}{qw(lower
723 title
724 upper
725 condition)};
726 my ($oldlocale) =
727 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
728 if (defined $oldlocale) {
729 delete $CASESPEC{$code};
730 $CASESPEC{$code}->{$oldlocale} =
731 { code => $hexcode,
732 lower => $oldlower,
733 title => $oldtitle,
734 upper => $oldupper,
735 condition => $oldcondition };
736 } else {
737 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
738 }
739 }
740 my ($locale) =
741 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
742 $CASESPEC{$code}->{$locale} =
743 { code => $hexcode,
744 lower => $lower,
745 title => $title,
746 upper => $upper,
747 condition => $condition };
748 } else {
749 $CASESPEC{$code} =
750 { code => $hexcode,
751 lower => $lower,
752 title => $title,
753 upper => $upper,
754 condition => $condition };
755 }
b08cd201 756 }
757 }
758 close($CASESPECFH);
759 }
760 }
761}
762
763sub casespec {
764 my $arg = shift;
765 my $code = _getcode($arg);
74f8133e 766 croak __PACKAGE__, "::casespec: unknown code '$arg'"
767 unless defined $code;
b08cd201 768
769 _casespec() unless %CASESPEC;
770
771 return $CASESPEC{$code};
772}
773
55d7b906 774=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 775
55d7b906 776Unicode::UCD::UnicodeVersion() returns the version of the Unicode
777Character Database, in other words, the version of the Unicode
78bf21c2 778standard the database implements. The version is a string
779of numbers delimited by dots (C<'.'>).
10a6ecd2 780
781=cut
782
783my $UNICODEVERSION;
784
785sub UnicodeVersion {
786 unless (defined $UNICODEVERSION) {
787 openunicode(\$VERSIONFH, "version");
788 chomp($UNICODEVERSION = <$VERSIONFH>);
789 close($VERSIONFH);
790 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
791 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
792 }
793 return $UNICODEVERSION;
794}
3aa957f9 795
796=head2 Implementation Note
32c16050 797
ad9cab37 798The first use of charinfo() opens a read-only filehandle to the Unicode
799Character Database (the database is included in the Perl distribution).
78bf21c2 800The filehandle is then kept open for further queries. In other words,
801if you are wondering where one of your filehandles went, that's where.
32c16050 802
561c79ed 803=head1 AUTHOR
804
805Jarkko Hietaniemi
806
807=cut
808
8091;