X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FUnicode%2FUCD.pm;h=96dee9a8164c03ecfbc0b8f3b4c98b2e9f36ab80;hb=cd340a5d1de2dbc931e802b4aaed98e9e4d75f51;hp=6c20d4066d8526cc63c0f4d40330f3cf62fb5c73;hpb=2796c109dc2c56e2241410992d78bd8e0cccd71f;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 6c20d40..96dee9a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -3,12 +3,18 @@ package Unicode::UCD;
 use strict;
 use warnings;
 
-our $VERSION = '3.1.0';
+our $VERSION = '0.2';
 
 require Exporter;
 
 our @ISA = qw(Exporter);
-our @EXPORT_OK = qw(charinfo charblock charscript);
+
+our @EXPORT_OK = qw(charinfo
+		    charblock charscript
+		    charblocks charscripts
+		    charinrange
+		    compexcl
+		    casefold casespec);
 
 use Carp;
 
@@ -18,11 +24,8 @@ Unicode::UCD - Unicode character database
 
 =head1 SYNOPSIS
 
-    use Unicode::UCD 3.1.0;
-    # requires that level of the Unicode character database
-
     use Unicode::UCD 'charinfo';
-    my %charinfo   = charinfo($codepoint);
+    my $charinfo   = charinfo($codepoint);
 
     use Unicode::UCD 'charblock';
     my $charblock  = charblock($codepoint);
@@ -30,16 +33,35 @@ Unicode::UCD - Unicode character database
     use Unicode::UCD 'charscript';
     my $charscript = charblock($codepoint);
 
+    use Unicode::UCD 'charblocks';
+    my $charblocks = charblocks();
+
+    use Unicode::UCD 'charscripts';
+    my %charscripts = charscripts();
+
+    use Unicode::UCD qw(charscript charinrange);
+    my $range = charscript($script);
+    print "looks like $script\n" if charinrange($range, $codepoint);
+
+    use Unicode::UCD 'compexcl';
+    my $compexcl = compexcl($codepoint);
+
+    my $unicode_version = Unicode::UCD::UnicodeVersion();
+
 =head1 DESCRIPTION
 
-The Unicode module offers a simple interface to the Unicode Character
-Database.
+The Unicode::UCD module offers a simple interface to the Unicode
+Character Database.
 
 =cut
 
-my $UNICODE;
-my $BLOCKS;
-my $SCRIPTS;
+my $UNICODEFH;
+my $BLOCKSFH;
+my $SCRIPTSFH;
+my $VERSIONFH;
+my $COMPEXCLFH;
+my $CASEFOLDFH;
+my $CASESPECFH;
 
 sub openunicode {
     my ($rfh, @path) = @_;
@@ -47,7 +69,7 @@ sub openunicode {
     unless (defined $$rfh) {
 	for my $d (@INC) {
 	    use File::Spec;
-	    $f = File::Spec->catfile($d, "unicode", @path);
+	    $f = File::Spec->catfile($d, "unicore", @path);
 	    last if open($$rfh, $f);
 	    undef $f;
 	}
@@ -62,10 +84,10 @@ sub openunicode {
 
     use Unicode::UCD 'charinfo';
 
-    my %charinfo = charinfo(0x41);
+    my $charinfo = charinfo(0x41);
 
-charinfo() returns a hash that has the following fields as defined
-by the Unicode standard:
+charinfo() returns a reference to a hash that has the following fields
+as defined by the Unicode standard:
 
     key
 
@@ -86,32 +108,107 @@ by the Unicode standard:
     title            titlecase equivalent mapping
 
     block            block the character belongs to (used in \p{In...})
-    script           script the character belongs to 
+    script           script the character belongs to
 
-If no match is found, an empty hash is returned.
+If no match is found, a reference to an empty hash is returned.
 
-The C<block> property is the same as as returned by charinfo().  It is
+The C<block> property is the same as returned by charinfo().  It is
 not defined in the Unicode Character Database proper (Chapter 4 of the
-Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
-of TUS3).  Similarly for the C<script> property.
+Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
+(Chapter 14 of TUS3).  Similarly for the C<script> property.
 
 Note that you cannot do (de)composition and casing based solely on the
 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
-you will need also the I<Composition Exclusions>, I<Case Folding>, and
-I<SpecialCasing> tables, available as files F<CompExcl.txt>,
-F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
+you will need also the compexcl(), casefold(), and casespec() functions.
 
 =cut
 
+sub _getcode {
+    my $arg = shift;
+
+    if ($arg =~ /^\d+$/) {
+	return $arg;
+    } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
+	return hex($1);
+    }
+
+    return;
+}
+
+# Lingua::KO::Hangul::Util not part of the standard distribution
+# but it will be used if available.
+
+eval { require Lingua::KO::Hangul::Util };
+my $hasHangulUtil = ! $@;
+if ($hasHangulUtil) {
+    Lingua::KO::Hangul::Util->import();
+}
+
+sub hangul_decomp { # internal: called from charinfo
+    if ($hasHangulUtil) {
+	my @tmp = decomposeHangul(shift);
+	return sprintf("%04X %04X",      @tmp) if @tmp == 2;
+	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
+    }
+    return;
+}
+
+sub hangul_charname { # internal: called from charinfo
+    return sprintf("HANGUL SYLLABLE-%04X", shift);
+}
+
+sub han_charname { # internal: called from charinfo
+    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
+}
+
+my @CharinfoRanges = (
+# block name
+# [ first, last, coderef to name, coderef to decompose ],
+# CJK Ideographs Extension A
+  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
+# CJK Ideographs
+  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
+# Hangul Syllables
+  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
+# Non-Private Use High Surrogates
+  [ 0xD800,   0xDB7F,   undef,   undef  ],
+# Private Use High Surrogates
+  [ 0xDB80,   0xDBFF,   undef,   undef  ],
+# Low Surrogates
+  [ 0xDC00,   0xDFFF,   undef,   undef  ],
+# The Private Use Area
+  [ 0xE000,   0xF8FF,   undef,   undef  ],
+# CJK Ideographs Extension B
+  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
+# Plane 15 Private Use Area
+  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
+# Plane 16 Private Use Area
+  [ 0x100000, 0x10FFFD, undef,   undef  ],
+);
+
 sub charinfo {
-    my $code = shift;
-    my $hexk = sprintf("%04X", $code);
-
-    openunicode(\$UNICODE, "Unicode.txt");
-    if (defined $UNICODE) {
-	use Search::Dict;
-	if (look($UNICODE, "$hexk;") >= 0) {
-	    my $line = <$UNICODE>;
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+	unless defined $code;
+    my $hexk = sprintf("%06X", $code);
+    my($rcode,$rname,$rdec);
+    foreach my $range (@CharinfoRanges){
+      if ($range->[0] <= $code && $code <= $range->[1]) {
+        $rcode = $hexk;
+	$rcode =~ s/^0+//;
+	$rcode =  sprintf("%04X", hex($rcode));
+        $rname = $range->[2] ? $range->[2]->($code) : '';
+        $rdec  = $range->[3] ? $range->[3]->($code) : '';
+        $hexk  = sprintf("%06X", $range->[0]); # replace by the first
+        last;
+      }
+    }
+    openunicode(\$UNICODEFH, "UnicodeData.txt");
+    if (defined $UNICODEFH) {
+	use Search::Dict 1.02;
+	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
+	    my $line = <$UNICODEFH>;
 	    chomp $line;
 	    my %prop;
 	    @prop{qw(
@@ -121,10 +218,17 @@ sub charinfo {
 		     mirrored unicode10 comment
 		     upper lower title
 		    )} = split(/;/, $line, -1);
+	    $hexk =~ s/^0+//;
+	    $hexk =  sprintf("%04X", hex($hexk));
 	    if ($prop{code} eq $hexk) {
 		$prop{block}  = charblock($code);
 		$prop{script} = charscript($code);
-		return %prop;
+		if(defined $rname){
+                    $prop{code} = $rcode;
+                    $prop{name} = $rname;
+                    $prop{decomposition} = $rdec;
+                }
+		return \%prop;
 	    }
 	}
     }
@@ -139,7 +243,7 @@ sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
     my $mid = int(($lo+$hi) / 2);
 
     if ($table->[$mid]->[0] < $code) {
-	if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
+	if ($table->[$mid]->[1] >= $code) {
 	    return $table->[$mid]->[2];
 	} else {
 	    _search($table, $mid + 1, $hi, $code);
@@ -151,40 +255,75 @@ sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
     }
 }
 
+sub charinrange {
+    my ($range, $arg) = @_;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
+	unless defined $code;
+    _search($range, 0, $#$range, $code);
+}
+
 =head2 charblock
 
     use Unicode::UCD 'charblock';
 
     my $charblock = charblock(0x41);
+    my $charblock = charblock(1234);
+    my $charblock = charblock("0x263a");
+    my $charblock = charblock("U+263a");
+
+    my $range     = charblock('Armenian');
+
+With a B<code point argument> charblock() returns the I<block> the character
+belongs to, e.g.  C<Basic Latin>.  Note that not all the character
+positions within all blocks are defined.
 
-charblock() returns the block the character belongs to, e.g.
-C<Basic Latin>.  Note that not all the character positions within all
-blocks are defined.
+See also L</Blocks versus Scripts>.
 
-The name is the same name that is used in the C<\p{In...}> construct,
-for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
-away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
-instead of C<Latin Extended-A>.
+If supplied with an argument that can't be a code point, charblock() tries
+to do the opposite and interpret the argument as a character block. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known charater block, C<undef> is returned.
 
 =cut
 
 my @BLOCKS;
+my %BLOCKS;
 
-sub charblock {
-    my $code = shift;
-
+sub _charblocks {
     unless (@BLOCKS) {
-	if (openunicode(\$BLOCKS, "Blocks.txt")) {
-	    while (<$BLOCKS>) {
+	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+	    while (<$BLOCKSFH>) {
 		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
-		    push @BLOCKS, [ hex($1), hex($2), $3 ];
+		    my ($lo, $hi) = (hex($1), hex($2));
+		    my $subrange = [ $lo, $hi, $3 ];
+		    push @BLOCKS, $subrange;
+		    push @{$BLOCKS{$3}}, $subrange;
 		}
 	    }
-	    close($BLOCKS);
+	    close($BLOCKSFH);
 	}
     }
+}
+
+sub charblock {
+    my $arg = shift;
 
-    _search(\@BLOCKS, 0, $#BLOCKS, $code);
+    _charblocks() unless @BLOCKS;
+
+    my $code = _getcode($arg);
+
+    if (defined $code) {
+	_search(\@BLOCKS, 0, $#BLOCKS, $code);
+    } else {
+	if (exists $BLOCKS{$arg}) {
+	    return $BLOCKS{$arg};
+	} else {
+	    return;
+	}
+    }
 }
 
 =head2 charscript
@@ -192,94 +331,411 @@ sub charblock {
     use Unicode::UCD 'charscript';
 
     my $charscript = charscript(0x41);
+    my $charscript = charscript(1234);
+    my $charscript = charscript("U+263a");
+
+    my $range      = charscript('Thai');
 
-charscript() returns the script the character belongs to, e.g.
-C<Latin>, C<Greek>, C<Han>.
+With a B<code point argument> charscript() returns the I<script> the
+character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 
-Unfortunately, currently (Perl 5.8.0) there is no regular expression
-notation for matching scripts as there is for blocks (C<\p{In...}>.
+See also L</Blocks versus Scripts>.
+
+If supplied with an argument that can't be a code point, charscript() tries
+to do the opposite and interpret the argument as a character script. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known charater script, C<undef> is returned.
 
 =cut
 
 my @SCRIPTS;
+my %SCRIPTS;
 
-sub charscript {
-    my $code = shift;
-
+sub _charscripts {
     unless (@SCRIPTS) {
-	if (openunicode(\$SCRIPTS, "Scripts.txt")) {
-	    while (<$SCRIPTS>) {
+	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
+	    while (<$SCRIPTSFH>) {
 		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
-		    push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
+		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
+		    my $script = lc($3);
+		    $script =~ s/\b(\w)/uc($1)/ge;
+		    my $subrange = [ $lo, $hi, $script ];
+		    push @SCRIPTS, $subrange;
+		    push @{$SCRIPTS{$script}}, $subrange;
 		}
 	    }
-	    close($SCRIPTS);
+	    close($SCRIPTSFH);
 	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 	}
     }
+}
+
+sub charscript {
+    my $arg = shift;
+
+    _charscripts() unless @SCRIPTS;
+
+    my $code = _getcode($arg);
+
+    if (defined $code) {
+	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
+    } else {
+	if (exists $SCRIPTS{$arg}) {
+	    return $SCRIPTS{$arg};
+	} else {
+	    return;
+	}
+    }
+}
+
+=head2 charblocks
+
+    use Unicode::UCD 'charblocks';
 
-    _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
+    my $charblocks = charblocks();
+
+charblocks() returns a reference to a hash with the known block names
+as the keys, and the code point ranges (see L</charblock>) as the values.
+
+See also L</Blocks versus Scripts>.
+
+=cut
+
+sub charblocks {
+    _charblocks() unless %BLOCKS;
+    return \%BLOCKS;
 }
 
-=head2 charblock versus charscript
+=head2 charscripts
 
-The difference between a character block and a script is that scripts
-are closer to the linguistic notion of a set of characters required to
-present languages, while block is more of an artifact of the Unicode
-character numbering.  For example the Latin B<script> is spread over
-several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>,
-C<Latin Extended-A>, and C<Latin Extended-B>.  On the other hand, the
-Latin script does not contain all the characters of the C<Basic Latin>
-block (also known as the ASCII): it includes only the letters, not for
-example the digits or the punctuation.
+    use Unicode::UCD 'charscripts';
 
-For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt
+    my %charscripts = charscripts();
+
+charscripts() returns a hash with the known script names as the keys,
+and the code point ranges (see L</charscript>) as the values.
+
+See also L</Blocks versus Scripts>.
+
+=cut
+
+sub charscripts {
+    _charscripts() unless %SCRIPTS;
+    return \%SCRIPTS;
+}
+
+=head2 Blocks versus Scripts
+
+The difference between a block and a script is that scripts are closer
+to the linguistic notion of a set of characters required to present
+languages, while block is more of an artifact of the Unicode character
+numbering and separation into blocks of (mostly) 256 characters.
+
+For example the Latin B<script> is spread over several B<blocks>, such
+as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
+C<Latin Extended-B>.  On the other hand, the Latin script does not
+contain all the characters of the C<Basic Latin> block (also known as
+the ASCII): it includes only the letters, and not, for example, the digits
+or the punctuation.
+
+For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 
 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 
-Note also that the script names are all in uppercase, e.g. C<HEBREW>,
-while the block names are Capitalized and with intermixed spaces,
-e.g. C<Yi Syllables>.
-
-Greek
-Cyrillic
-Armenian
-Hebrew
-Arabic
-Syriac
-Thaana
-Devanagari
-Bengali
-Gurmukhi
-Gujarati
-Oriya
-Tamil
-Telugu
-Kannada
-Malayalam
-Sinhala
-Thai
-Lao
-Tibetan
-Myanmar
-Georgian
-Ethiopic
-Cherokee
-Ogham
-Runic
-Khmer
-Hiragana
-Katakana
-Bopomofo
-OldItalic
-Gothic
-Deseret
-
-=head1 IMPLEMENTATION NOTE
+=head2 Matching Scripts and Blocks
+
+Scripts are matched with the regular-expression construct
+C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
+while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
+any of the 256 code points in the Tibetan block).
+
+=head2 Code Point Arguments
+
+A I<code point argument> is either a decimal or a hexadecimal scalar
+designating a Unicode character, or C<U+> followed by hexadecimals
+designating a Unicode character.  Note that Unicode is B<not> limited
+to 16 bits (the number of Unicode characters is open-ended, in theory
+unlimited): you may have more than 4 hexdigits.
+
+=head2 charinrange
+
+In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
+can also test whether a code point is in the I<range> as returned by
+L</charblock> and L</charscript> or as the values of the hash returned
+by L</charblocks> and L</charscripts> by using charinrange():
+
+    use Unicode::UCD qw(charscript charinrange);
+
+    $range = charscript('Hiragana');
+    print "looks like hiragana\n" if charinrange($range, $codepoint);
+
+=cut
+
+=head2 compexcl
+
+    use Unicode::UCD 'compexcl';
+
+    my $compexcl = compexcl("09dc");
+
+The compexcl() returns the composition exclusion (that is, if the
+character should not be produced during a precomposition) of the 
+character specified by a B<code point argument>.
+
+If there is a composition exclusion for the character, true is
+returned.  Otherwise, false is returned.
+
+=cut
+
+my %COMPEXCL;
+
+sub _compexcl {
+    unless (%COMPEXCL) {
+	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
+	    while (<$COMPEXCLFH>) {
+		if (/^([0-9A-F]+)\s+\#\s+/) {
+		    my $code = hex($1);
+		    $COMPEXCL{$code} = undef;
+		}
+	    }
+	    close($COMPEXCLFH);
+	}
+    }
+}
+
+sub compexcl {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
+	unless defined $code;
+
+    _compexcl() unless %COMPEXCL;
+
+    return exists $COMPEXCL{$code};
+}
+
+=head2 casefold
+
+    use Unicode::UCD 'casefold';
+
+    my %casefold = casefold("09dc");
+
+The casefold() returns the locale-independent case folding of the
+character specified by a B<code point argument>.
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+    key
+
+    code             code point with at least four hexdigits
+    status           "C", "F", "S", or "I"
+    mapping          one or more codes separated by spaces
+
+The meaning of the I<status> is as follows:
+
+   C                 common case folding, common mappings shared
+                     by both simple and full mappings
+   F                 full case folding, mappings that cause strings
+                     to grow in length. Multiple characters are separated
+                     by spaces
+   S                 simple case folding, mappings to single characters
+                     where different from F
+   I                 special case for dotted uppercase I and
+                     dotless lowercase i
+                     - If this mapping is included, the result is
+                       case-insensitive, but dotless and dotted I's
+                       are not distinguished
+                     - If this mapping is excluded, the result is not
+                       fully case-insensitive, but dotless and dotted
+                       I's are distinguished
+
+If there is no case folding for that character, C<undef> is returned.
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASEFOLD;
+
+sub _casefold {
+    unless (%CASEFOLD) {
+	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
+	    while (<$CASEFOLDFH>) {
+		if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
+		    my $code = hex($1);
+		    $CASEFOLD{$code} = { code    => $1,
+					 status  => $2,
+					 mapping => $3 };
+		}
+	    }
+	    close($CASEFOLDFH);
+	}
+    }
+}
+
+sub casefold {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::casefold: unknown code '$arg'"
+	unless defined $code;
+
+    _casefold() unless %CASEFOLD;
+
+    return $CASEFOLD{$code};
+}
+
+=head2 casespec
+
+    use Unicode::UCD 'casespec';
+
+    my %casespec = casespec("09dc");
+
+The casespec() returns the potentially locale-dependent case mapping
+of the character specified by a B<code point argument>.  The mapping
+may change the length of the string (which the basic Unicode case
+mappings as returned by charinfo() never do).
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+    key
+
+    code             code point with at least four hexdigits
+    lower            lowercase
+    title            titlecase
+    upper            uppercase
+    condition        condition list (may be undef)
+
+The C<condition> is optional.  Where present, it consists of one or
+more I<locales> or I<contexts>, separated by spaces (other than as
+used to separate elements, spaces are to be ignored).  A condition
+list overrides the normal behavior if all of the listed conditions are
+true.  Case distinctions in the condition list are not significant.
+Conditions preceded by "NON_" represent the negation of the condition
+
+Note that when there are multiple case folding definitions for a
+single code point because of different locales, the value returned by
+casespec() is a hash reference which has the locales as the keys and
+hash references as described above as the values.
+
+A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
+followed by a "_" and a 2-letter ISO language code (possibly followed
+by a "_" and a variant code).  You can find the lists of those codes,
+see L<Locale::Country> and L<Locale::Language>.
+
+A I<context> is one of the following choices:
+
+    FINAL            The letter is not followed by a letter of
+                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
+    MODERN           The mapping is only used for modern text
+    AFTER_i          The last base character was "i" (U+0069)
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASESPEC;
+
+sub _casespec {
+    unless (%CASESPEC) {
+	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+	    while (<$CASESPECFH>) {
+		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
+		    my ($hexcode, $lower, $title, $upper, $condition) =
+			($1, $2, $3, $4, $5);
+		    my $code = hex($hexcode);
+		    if (exists $CASESPEC{$code}) {
+			if (exists $CASESPEC{$code}->{code}) {
+			    my ($oldlower,
+				$oldtitle,
+				$oldupper,
+				$oldcondition) =
+				    @{$CASESPEC{$code}}{qw(lower
+							   title
+							   upper
+							   condition)};
+			    if (defined $oldcondition) {
+				my ($oldlocale) =
+				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
+				delete $CASESPEC{$code};
+				$CASESPEC{$code}->{$oldlocale} =
+				{ code      => $hexcode,
+				  lower     => $oldlower,
+				  title     => $oldtitle,
+				  upper     => $oldupper,
+				  condition => $oldcondition };
+			    }
+			}
+			my ($locale) =
+			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
+			$CASESPEC{$code}->{$locale} =
+			{ code      => $hexcode,
+			  lower     => $lower,
+			  title     => $title,
+			  upper     => $upper,
+			  condition => $condition };
+		    } else {
+			$CASESPEC{$code} =
+			{ code      => $hexcode,
+			  lower     => $lower,
+			  title     => $title,
+			  upper     => $upper,
+			  condition => $condition };
+		    }
+		}
+	    }
+	    close($CASESPECFH);
+	}
+    }
+}
+
+sub casespec {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::casespec: unknown code '$arg'"
+	unless defined $code;
+
+    _casespec() unless %CASESPEC;
+
+    return $CASESPEC{$code};
+}
+
+=head2 Unicode::UCD::UnicodeVersion
+
+Unicode::UCD::UnicodeVersion() returns the version of the Unicode
+Character Database, in other words, the version of the Unicode
+standard the database implements.  The version is a string
+of numbers delimited by dots (C<'.'>).
+
+=cut
+
+my $UNICODEVERSION;
+
+sub UnicodeVersion {
+    unless (defined $UNICODEVERSION) {
+	openunicode(\$VERSIONFH, "version");
+	chomp($UNICODEVERSION = <$VERSIONFH>);
+	close($VERSIONFH);
+	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
+	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
+    }
+    return $UNICODEVERSION;
+}
+
+=head2 Implementation Note
 
 The first use of charinfo() opens a read-only filehandle to the Unicode
 Character Database (the database is included in the Perl distribution).
-The filehandle is then kept open for further queries.
+The filehandle is then kept open for further queries.  In other words,
+if you are wondering where one of your filehandles went, that's where.
+
+=head1 BUGS
+
+Does not yet support EBCDIC platforms.
 
 =head1 AUTHOR