[p5sagit/p5-mst-13.2.git] / lib / UnicodeCD.pm

package UnicodeCD;

use strict;
use warnings;

our $VERSION = '0.1';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo
		    charblock charscript
		    charblocks charscripts
		    charinrange
		    compexcl
		    casefold casespec);

use Carp;

=head1 NAME

UnicodeCD - Unicode character database

=head1 SYNOPSIS

    use UnicodeCD 'charinfo';
    my $charinfo   = charinfo($codepoint);

    use UnicodeCD 'charblock';
    my $charblock  = charblock($codepoint);

    use UnicodeCD 'charscript';
    my $charscript = charblock($codepoint);

    use UnicodeCD 'charblocks';
    my $charblocks = charblocks();

    use UnicodeCD 'charscripts';
    my %charscripts = charscripts();

    use UnicodeCD qw(charscript charinrange);
    my $range = charscript($script);
    print "looks like $script\n" if charinrange($range, $codepoint);

    use UnicodeCD 'compexcl';
    my $compexcl = compexcl($codepoint);

    my $unicode_version = UnicodeCD::UnicodeVersion();

=head1 DESCRIPTION

The UnicodeCD module offers a simple interface to the Unicode Character
Database.

=cut

my $UNICODEFH;
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;
my $COMPEXCLFH;
my $CASEFOLDFH;
my $CASESPECFH;

sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
	for my $d (@INC) {
	    use File::Spec;
	    $f = File::Spec->catfile($d, "unicode", @path);
	    last if open($$rfh, $f);
	    undef $f;
	}
	croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
	    unless defined $f;
    }
    return $f;
}

=head2 charinfo

    use UnicodeCD 'charinfo';

    my $charinfo = charinfo(0x41);

charinfo() returns a reference to a hash that has the following fields
as defined by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to 

If no match is found, a reference to an empty hash is returned.

The C<block> property is the same as as returned by charinfo().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
of TUS3).  Similarly for the C<script> property.

Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
you will need also the compexcl(), casefold(), and casespec() functions.

=cut

sub _getcode {
    my $arg = shift;

    if ($arg =~ /^\d+$/) {
	return $arg;
    } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
	return hex($1);
    }

    return;
}

sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	unless defined $code;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODEFH, "Unicode.txt");
    if (defined $UNICODEFH) {
	use Search::Dict;
	if (look($UNICODEFH, "$hexk;") >= 0) {
	    my $line = <$UNICODEFH>;
	    chomp $line;
	    my %prop;
	    @prop{qw(
		     code name category
		     combining bidi decomposition
		     decimal digit numeric
		     mirrored unicode10 comment
		     upper lower title
		    )} = split(/;/, $line, -1);
	    if ($prop{code} eq $hexk) {
		$prop{block}  = charblock($code);
		$prop{script} = charscript($code);
		return \%prop;
	    }
	}
    }
    return;
}

sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
    my ($table, $lo, $hi, $code) = @_;

    return if $lo > $hi;

    my $mid = int(($lo+$hi) / 2);

    if ($table->[$mid]->[0] < $code) {
	if ($table->[$mid]->[1] >= $code) {
	    return $table->[$mid]->[2];
	} else {
	    _search($table, $mid + 1, $hi, $code);
	}
    } elsif ($table->[$mid]->[0] > $code) {
	_search($table, $lo, $mid - 1, $code);
    } else {
	return $table->[$mid]->[2];
    }
}

sub charinrange {
    my ($range, $arg) = @_;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	unless defined $code;
    _search($range, 0, $#$range, $code);
}

=head2 charblock

    use UnicodeCD 'charblock';

    my $charblock = charblock(0x41);
    my $charblock = charblock(1234);
    my $charblock = charblock("0x263a");
    my $charblock = charblock("U+263a");

    my $ranges    = charblock('Armenian');

With a B<code point argument> charblock() returns the block the character
belongs to, e.g.  C<Basic Latin>.  Note that not all the character
positions within all blocks are defined.

If supplied with an argument that can't be a code point, charblock()
tries to do the opposite and interpret the argument as a character
block.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known charater block, C<undef> is returned.

=cut

my @BLOCKS;
my %BLOCKS;

sub _charblocks {
    unless (@BLOCKS) {
	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
	    while (<$BLOCKSFH>) {
		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
		    my ($lo, $hi) = (hex($1), hex($2));
		    my $subrange = [ $lo, $hi, $3 ];
		    push @BLOCKS, $subrange;
		    push @{$BLOCKS{$3}}, $subrange;
		}
	    }
	    close($BLOCKSFH);
	}
    }
}

sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@BLOCKS, 0, $#BLOCKS, $code);
    } else {
	if (exists $BLOCKS{$arg}) {
	    return $BLOCKS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charscript

    use UnicodeCD 'charscript';

    my $charscript = charscript(0x41);
    my $charscript = charscript(1234);
    my $charscript = charscript("U+263a");

    my $ranges     = charscript('Thai');

With a B<code point argument> charscript() returns the script the
character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.

If supplied with an argument that can't be a code point, charscript()
tries to do the opposite and interpret the argument as a character
script.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known charater script, C<undef> is returned.

=cut

my @SCRIPTS;
my %SCRIPTS;

sub _charscripts {
    unless (@SCRIPTS) {
	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
	    while (<$SCRIPTSFH>) {
		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
		    my $script = lc($3);
		    $script =~ s/\b(\w)/uc($1)/ge;
		    my $subrange = [ $lo, $hi, $script ];
		    push @SCRIPTS, $subrange;
		    push @{$SCRIPTS{$script}}, $subrange;
		}
	    }
	    close($SCRIPTSFH);
	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	}
    }
}

sub charscript {
    my $arg = shift;

    _charscripts() unless @SCRIPTS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
    } else {
	if (exists $SCRIPTS{$arg}) {
	    return $SCRIPTS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charblocks

    use UnicodeCD 'charblocks';

    my $charblocks = charblocks();

charblocks() returns a reference to a hash with the known block names
as the keys, and the code point ranges (see L</charblock>) as the values.

=cut

sub charblocks {
    _charblocks() unless %BLOCKS;
    return \%BLOCKS;
}

=head2 charscripts

    use UnicodeCD 'charscripts';

    my %charscripts = charscripts();

charscripts() returns a hash with the known script names as the keys,
and the code point ranges (see L</charscript>) as the values.

=cut

sub charscripts {
    _charscripts() unless %SCRIPTS;
    return \%SCRIPTS;
}

=head2 Blocks versus Scripts

The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
numbering and separation into blocks of 256 characters.

For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
C<Latin Extended-B>.  On the other hand, the Latin script does not
contain all the characters of the C<Basic Latin> block (also known as
the ASCII): it includes only the letters, not for example the digits
or the punctuation.

For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt

For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/

=head2 Matching Scripts and Blocks

Both scripts and blocks can be matched using the regular expression
construct C<\p{In...}> and its negation C<\P{In...}>.

The name of the script or the block comes after the C<In>, for example
C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
removed from the names for the C<\p{In...}>, for example
C<LatinExtendedA> instead of C<Latin Extended-A>.

There are a few cases where there exists both a script and a block by
the same name, in these cases the block version has C<Block> appended:
C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.

=head2 Code Point Arguments

A <code point argument> is either a decimal or a hexadecimal scalar,
or "U+" followed by hexadecimals.

=head2 charinrange

In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
can also test whether a code point is in the I<range> as returned by
L</charblock> and L</charscript> or as the values of the hash returned
by L</charblocks> and L</charscripts> by using charinrange():

    use UnicodeCD qw(charscript charinrange);

    $range = charscript('Hiragana');
    print "looks like hiragana\n" if charinrange($range, $codepoint);

=cut

=head2 compexcl

    use UnicodeCD 'compexcl';

    my $compexcl = compexcl("09dc");

The compexcl() returns the composition exclusion (that is, if the
character should not be produced during a precomposition) of the 
character specified by a B<code point argument>.

If there is a composition exclusion for the character, true is
returned.  Otherwise, false is returned.

=cut

my %COMPEXCL;

sub _compexcl {
    unless (%COMPEXCL) {
	if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
	    while (<$COMPEXCLFH>) {
		if (/^([0-9A-F]+) \# /) {
		    my $code = hex($1);
		    $COMPEXCL{$code} = undef;
		}
	    }
	    close($COMPEXCLFH);
	}
    }
}

sub compexcl {
    my $arg  = shift;
    my $code = _getcode($arg);

    _compexcl() unless %COMPEXCL;

    return exists $COMPEXCL{$code};
}

=head2 casefold

    use UnicodeCD 'casefold';

    my %casefold = casefold("09dc");

The casefold() returns the locale-independent case folding of the
character specified by a B<code point argument>.

If there is a case folding for that character, a reference to a hash
with the following fields is returned:

    key

    code             code point with at least four hexdigits
    status           "C", "F", "S", or "I"
    mapping          one or more codes separated by spaces

The meaning of the I<status> is as follows:

   C                 common case folding, common mappings shared
                     by both simple and full mappings
   F                 full case folding, mappings that cause strings
                     to grow in length. Multiple characters are separated
                     by spaces
   S                 simple case folding, mappings to single characters
                     where different from F
   I                 special case for dotted uppercase I and
                     dotless lowercase i
                     - If this mapping is included, the result is
                       case-insensitive, but dotless and dotted I's
                       are not distinguished
                     - If this mapping is excluded, the result is not
                       fully case-insensitive, but dotless and dotted
                       I's are distinguished

If there is no case folding for that character, C<undef> is returned.

For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/

=cut

my %CASEFOLD;

sub _casefold {
    unless (%CASEFOLD) {
	if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
	    while (<$CASEFOLDFH>) {
		if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
		    my $code = hex($1);
		    $CASEFOLD{$code} = { code    => $1,
					 status  => $2,
					 mapping => $3 };
		}
	    }
	    close($CASEFOLDFH);
	}
    }
}

sub casefold {
    my $arg  = shift;
    my $code = _getcode($arg);

    _casefold() unless %CASEFOLD;

    return $CASEFOLD{$code};
}

=head2 casespec

    use UnicodeCD 'casespec';

    my %casespec = casespec("09dc");

The casespec() returns the potentially locale-dependent case mapping
of the character specified by a B<code point argument>.  The mapping
may change the length of the string (which the basic Unicode case
mappings as returned by charinfo() never do).

If there is a case folding for that character, a reference to a hash
with the following fields is returned:

    key

    code             code point with at least four hexdigits
    lower            lowercase
    title            titlecase
    upper            uppercase
    condition        condition list (may be undef)

The C<condition> is optional.  Where present, it consists of one or
more I<locales> or I<contexts>, separated by spaces (other than as
used to separate elements, spaces are to be ignored).  A condition
list overrides the normal behavior if all of the listed conditions are
true.  Case distinctions in the condition list are not significant.
Conditions preceded by "NON_" represent the negation of the condition

A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
followed by a "_" and a 2-letter ISO language code (possibly followed
by a "_" and a variant code).  You can find the lists of those codes,
see L<Locale::Country> and L<Locale::Language>.

A I<context> is one of the following choices:

    FINAL            The letter is not followed by a letter of
                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
    MODERN           The mapping is only used for modern text
    AFTER_i          The last base character was "i" (U+0069)

For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/

=cut

my %CASESPEC;

sub _casespec {
    unless (%CASESPEC) {
	if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
	    while (<$CASESPECFH>) {
		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
		    my $code = hex($1);
		    $CASESPEC{$code} = { code      => $1,
					 lower     => $2,
					 title     => $3,
					 upper     => $4,
					 condition => $5 };
		}
	    }
	    close($CASESPECFH);
	}
    }
}

sub casespec {
    my $arg  = shift;
    my $code = _getcode($arg);

    _casespec() unless %CASESPEC;

    return $CASESPEC{$code};
}

=head2 UnicodeCD::UnicodeVersion

UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
Database, in other words, the version of the Unicode standard the
database implements.

=cut

my $UNICODEVERSION;

sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
	openunicode(\$VERSIONFH, "version");
	chomp($UNICODEVERSION = <$VERSIONFH>);
	close($VERSIONFH);
	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
    }
    return $UNICODEVERSION;
}

=head2 Implementation Note

The first use of charinfo() opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
1189d1e4	1	package UnicodeCD;
561c79ed	2
	3	use strict;
	4	use warnings;
	5
10a6ecd2	6	our $VERSION = '0.1';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
10a6ecd2	11	our @EXPORT_OK = qw(charinfo
	12	charblock charscript
	13	charblocks charscripts
b08cd201	14	charinrange
	15	compexcl
	16	casefold casespec);
561c79ed	17
	18	use Carp;
	19
	20	=head1 NAME
	21
1189d1e4	22	UnicodeCD - Unicode character database
561c79ed	23
	24	=head1 SYNOPSIS
	25
1189d1e4	26	use UnicodeCD 'charinfo';
b08cd201	27	my $charinfo = charinfo($codepoint);
561c79ed	28
1189d1e4	29	use UnicodeCD 'charblock';
e882dd67	30	my $charblock = charblock($codepoint);
e882dd67	31
1189d1e4	32	use UnicodeCD 'charscript';
e882dd67	33	my $charscript = charblock($codepoint);
561c79ed	34
e145285f	35	use UnicodeCD 'charblocks';
	36	my $charblocks = charblocks();
	37
	38	use UnicodeCD 'charscripts';
	39	my %charscripts = charscripts();
	40
	41	use UnicodeCD qw(charscript charinrange);
	42	my $range = charscript($script);
	43	print "looks like $script\n" if charinrange($range, $codepoint);
	44
	45	use UnicodeCD 'compexcl';
	46	my $compexcl = compexcl($codepoint);
	47
	48	my $unicode_version = UnicodeCD::UnicodeVersion();
	49
561c79ed	50	=head1 DESCRIPTION
561c79ed	51
e145285f	52	The UnicodeCD module offers a simple interface to the Unicode Character
561c79ed	53	Database.
	54
	55	=cut
	56
10a6ecd2	57	my $UNICODEFH;
	58	my $BLOCKSFH;
	59	my $SCRIPTSFH;
	60	my $VERSIONFH;
b08cd201	61	my $COMPEXCLFH;
	62	my $CASEFOLDFH;
	63	my $CASESPECFH;
561c79ed	64
	65	sub openunicode {
	66	my ($rfh, @path) = @_;
	67	my $f;
	68	unless (defined $$rfh) {
	69	for my $d (@INC) {
	70	use File::Spec;
	71	$f = File::Spec->catfile($d, "unicode", @path);
32c16050	72	last if open($$rfh, $f);
e882dd67	73	undef $f;
561c79ed	74	}
e882dd67	75	croak __PACKAGE__, ": failed to find ",
	76	File::Spec->catfile(@path), " in @INC"
	77	unless defined $f;
561c79ed	78	}
	79	return $f;
	80	}
	81
	82	=head2 charinfo
	83
1189d1e4	84	use UnicodeCD 'charinfo';
561c79ed	85
b08cd201	86	my $charinfo = charinfo(0x41);
561c79ed	87
b08cd201	88	charinfo() returns a reference to a hash that has the following fields
b08cd201	89	as defined by the Unicode standard:
561c79ed	90
	91	key
	92
	93	code code point with at least four hexdigits
	94	name name of the character IN UPPER CASE
	95	category general category of the character
	96	combining classes used in the Canonical Ordering Algorithm
	97	bidi bidirectional category
	98	decomposition character decomposition mapping
	99	decimal if decimal digit this is the integer numeric value
	100	digit if digit this is the numeric value
	101	numeric if numeric is the integer or rational numeric value
	102	mirrored if mirrored in bidirectional text
	103	unicode10 Unicode 1.0 name if existed and different
	104	comment ISO 10646 comment field
	105	upper uppercase equivalent mapping
	106	lower lowercase equivalent mapping
	107	title titlecase equivalent mapping
e882dd67	108
561c79ed	109	block block the character belongs to (used in \p{In...})
e882dd67	110	script script the character belongs to
561c79ed	111
b08cd201	112	If no match is found, a reference to an empty hash is returned.
561c79ed	113
32c16050	114	The C<block> property is the same as as returned by charinfo(). It is
	115	not defined in the Unicode Character Database proper (Chapter 4 of the
	116	Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67	117	of TUS3). Similarly for the C<script> property.
32c16050	118
	119	Note that you cannot do (de)composition and casing based solely on the
	120	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201	121	you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed	122
	123	=cut
	124
10a6ecd2	125	sub _getcode {
	126	my $arg = shift;
	127
	128	if ($arg =~ /^\d+$/) {
	129	return $arg;
	130	} elsif ($arg =~ /^(?:U\+\|0x)?([[:xdigit:]]+)$/) {
	131	return hex($1);
	132	}
	133
	134	return;
	135	}
	136
561c79ed	137	sub charinfo {
10a6ecd2	138	my $arg = shift;
	139	my $code = _getcode($arg);
	140	croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	141	unless defined $code;
561c79ed	142	my $hexk = sprintf("%04X", $code);
561c79ed	143
10a6ecd2	144	openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2	145	if (defined $UNICODEFH) {
561c79ed	146	use Search::Dict;
10a6ecd2	147	if (look($UNICODEFH, "$hexk;") >= 0) {
10a6ecd2	148	my $line = <$UNICODEFH>;
561c79ed	149	chomp $line;
	150	my %prop;
	151	@prop{qw(
	152	code name category
	153	combining bidi decomposition
	154	decimal digit numeric
	155	mirrored unicode10 comment
	156	upper lower title
	157	)} = split(/;/, $line, -1);
	158	if ($prop{code} eq $hexk) {
a196fbfd	159	$prop{block} = charblock($code);
a196fbfd	160	$prop{script} = charscript($code);
b08cd201	161	return \%prop;
561c79ed	162	}
	163	}
	164	}
	165	return;
	166	}
	167
e882dd67	168	sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
	169	my ($table, $lo, $hi, $code) = @_;
	170
	171	return if $lo > $hi;
	172
	173	my $mid = int(($lo+$hi) / 2);
	174
	175	if ($table->[$mid]->[0] < $code) {
10a6ecd2	176	if ($table->[$mid]->[1] >= $code) {
e882dd67	177	return $table->[$mid]->[2];
	178	} else {
	179	_search($table, $mid + 1, $hi, $code);
	180	}
	181	} elsif ($table->[$mid]->[0] > $code) {
	182	_search($table, $lo, $mid - 1, $code);
	183	} else {
	184	return $table->[$mid]->[2];
	185	}
	186	}
	187
10a6ecd2	188	sub charinrange {
	189	my ($range, $arg) = @_;
	190	my $code = _getcode($arg);
	191	croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	192	unless defined $code;
	193	_search($range, 0, $#$range, $code);
	194	}
	195
354a27bf	196	=head2 charblock
561c79ed	197
1189d1e4	198	use UnicodeCD 'charblock';
561c79ed	199
561c79ed	200	my $charblock = charblock(0x41);
10a6ecd2	201	my $charblock = charblock(1234);
	202	my $charblock = charblock("0x263a");
	203	my $charblock = charblock("U+263a");
	204
	205	my $ranges = charblock('Armenian');
	206
	207	With a B<code point argument> charblock() returns the block the character
	208	belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201	209	positions within all blocks are defined.
10a6ecd2	210
	211	If supplied with an argument that can't be a code point, charblock()
	212	tries to do the opposite and interpret the argument as a character
	213	block. The return value is a I<range>: an anonymous list that
	214	contains anonymous lists, which in turn contain I<start-of-range>,
	215	I<end-of-range> code point pairs. You can test whether a code point
	216	is in a range using the L</charinrange> function. If the argument is
	217	not a known charater block, C<undef> is returned.
561c79ed	218
561c79ed	219	=cut
	220
	221	my @BLOCKS;
10a6ecd2	222	my %BLOCKS;
561c79ed	223
10a6ecd2	224	sub _charblocks {
561c79ed	225	unless (@BLOCKS) {
10a6ecd2	226	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
10a6ecd2	227	while (<$BLOCKSFH>) {
2796c109	228	if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2	229	my ($lo, $hi) = (hex($1), hex($2));
	230	my $subrange = [ $lo, $hi, $3 ];
	231	push @BLOCKS, $subrange;
	232	push @{$BLOCKS{$3}}, $subrange;
561c79ed	233	}
561c79ed	234	}
10a6ecd2	235	close($BLOCKSFH);
561c79ed	236	}
561c79ed	237	}
10a6ecd2	238	}
	239
	240	sub charblock {
	241	my $arg = shift;
	242
	243	_charblocks() unless @BLOCKS;
	244
	245	my $code = _getcode($arg);
561c79ed	246
10a6ecd2	247	if (defined $code) {
	248	_search(\@BLOCKS, 0, $#BLOCKS, $code);
	249	} else {
	250	if (exists $BLOCKS{$arg}) {
	251	return $BLOCKS{$arg};
	252	} else {
	253	return;
	254	}
	255	}
e882dd67	256	}
	257
	258	=head2 charscript
	259
1189d1e4	260	use UnicodeCD 'charscript';
e882dd67	261
e882dd67	262	my $charscript = charscript(0x41);
10a6ecd2	263	my $charscript = charscript(1234);
10a6ecd2	264	my $charscript = charscript("U+263a");
e882dd67	265
10a6ecd2	266	my $ranges = charscript('Thai');
	267
	268	With a B<code point argument> charscript() returns the script the
b08cd201	269	character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2	270
	271	If supplied with an argument that can't be a code point, charscript()
	272	tries to do the opposite and interpret the argument as a character
	273	script. The return value is a I<range>: an anonymous list that
	274	contains anonymous lists, which in turn contain I<start-of-range>,
	275	I<end-of-range> code point pairs. You can test whether a code point
	276	is in a range using the L</charinrange> function. If the argument is
	277	not a known charater script, C<undef> is returned.
e882dd67	278
e882dd67	279	=cut
	280
	281	my @SCRIPTS;
10a6ecd2	282	my %SCRIPTS;
e882dd67	283
10a6ecd2	284	sub _charscripts {
e882dd67	285	unless (@SCRIPTS) {
10a6ecd2	286	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
10a6ecd2	287	while (<$SCRIPTSFH>) {
e882dd67	288	if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2	289	my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
	290	my $script = lc($3);
	291	$script =~ s/\b(\w)/uc($1)/ge;
	292	my $subrange = [ $lo, $hi, $script ];
	293	push @SCRIPTS, $subrange;
	294	push @{$SCRIPTS{$script}}, $subrange;
e882dd67	295	}
e882dd67	296	}
10a6ecd2	297	close($SCRIPTSFH);
e882dd67	298	@SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	299	}
	300	}
10a6ecd2	301	}
	302
	303	sub charscript {
	304	my $arg = shift;
	305
	306	_charscripts() unless @SCRIPTS;
e882dd67	307
10a6ecd2	308	my $code = _getcode($arg);
	309
	310	if (defined $code) {
	311	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
	312	} else {
	313	if (exists $SCRIPTS{$arg}) {
	314	return $SCRIPTS{$arg};
	315	} else {
	316	return;
	317	}
	318	}
	319	}
	320
	321	=head2 charblocks
	322
1189d1e4	323	use UnicodeCD 'charblocks';
10a6ecd2	324
b08cd201	325	my $charblocks = charblocks();
10a6ecd2	326
b08cd201	327	charblocks() returns a reference to a hash with the known block names
b08cd201	328	as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2	329
	330	=cut
	331
	332	sub charblocks {
b08cd201	333	_charblocks() unless %BLOCKS;
b08cd201	334	return \%BLOCKS;
10a6ecd2	335	}
	336
	337	=head2 charscripts
	338
1189d1e4	339	use UnicodeCD 'charscripts';
10a6ecd2	340
	341	my %charscripts = charscripts();
	342
	343	charscripts() returns a hash with the known script names as the keys,
	344	and the code point ranges (see L</charscript>) as the values.
	345
	346	=cut
	347
	348	sub charscripts {
b08cd201	349	_charscripts() unless %SCRIPTS;
b08cd201	350	return \%SCRIPTS;
561c79ed	351	}
561c79ed	352
10a6ecd2	353	=head2 Blocks versus Scripts
ad9cab37	354
10a6ecd2	355	The difference between a block and a script is that scripts are closer
	356	to the linguistic notion of a set of characters required to present
	357	languages, while block is more of an artifact of the Unicode character
	358	numbering and separation into blocks of 256 characters.
3aa957f9	359
	360	For example the Latin B<script> is spread over several B<blocks>, such
	361	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
	362	C<Latin Extended-B>. On the other hand, the Latin script does not
	363	contain all the characters of the C<Basic Latin> block (also known as
	364	the ASCII): it includes only the letters, not for example the digits
	365	or the punctuation.
ad9cab37	366
3aa957f9	367	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37	368
	369	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
	370
3aa957f9	371	=head2 Matching Scripts and Blocks
	372
	373	Both scripts and blocks can be matched using the regular expression
	374	construct C<\p{In...}> and its negation C<\P{In...}>.
	375
	376	The name of the script or the block comes after the C<In>, for example
	377	C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2	378	removed from the names for the C<\p{In...}>, for example
	379	C<LatinExtendedA> instead of C<Latin Extended-A>.
	380
	381	There are a few cases where there exists both a script and a block by
	382	the same name, in these cases the block version has C<Block> appended:
	383	C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
	384
b08cd201	385	=head2 Code Point Arguments
	386
	387	A <code point argument> is either a decimal or a hexadecimal scalar,
	388	or "U+" followed by hexadecimals.
	389
10a6ecd2	390	=head2 charinrange
	391
	392	In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
	393	can also test whether a code point is in the I<range> as returned by
	394	L</charblock> and L</charscript> or as the values of the hash returned
e618509d	395	by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2	396
1189d1e4	397	use UnicodeCD qw(charscript charinrange);
10a6ecd2	398
10a6ecd2	399	$range = charscript('Hiragana');
e145285f	400	print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2	401
	402	=cut
	403
b08cd201	404	=head2 compexcl
b08cd201	405
1189d1e4	406	use UnicodeCD 'compexcl';
b08cd201	407
	408	my $compexcl = compexcl("09dc");
	409
	410	The compexcl() returns the composition exclusion (that is, if the
9046a8ae	411	character should not be produced during a precomposition) of the
9046a8ae	412	character specified by a B<code point argument>.
b08cd201	413
	414	If there is a composition exclusion for the character, true is
	415	returned. Otherwise, false is returned.
	416
	417	=cut
	418
	419	my %COMPEXCL;
	420
	421	sub _compexcl {
	422	unless (%COMPEXCL) {
	423	if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
	424	while (<$COMPEXCLFH>) {
	425	if (/^([0-9A-F]+) \# /) {
	426	my $code = hex($1);
	427	$COMPEXCL{$code} = undef;
	428	}
	429	}
	430	close($COMPEXCLFH);
	431	}
	432	}
	433	}
	434
	435	sub compexcl {
	436	my $arg = shift;
	437	my $code = _getcode($arg);
	438
	439	_compexcl() unless %COMPEXCL;
	440
	441	return exists $COMPEXCL{$code};
	442	}
	443
	444	=head2 casefold
	445
1189d1e4	446	use UnicodeCD 'casefold';
b08cd201	447
	448	my %casefold = casefold("09dc");
	449
	450	The casefold() returns the locale-independent case folding of the
	451	character specified by a B<code point argument>.
	452
	453	If there is a case folding for that character, a reference to a hash
	454	with the following fields is returned:
	455
	456	key
	457
	458	code code point with at least four hexdigits
	459	status "C", "F", "S", or "I"
	460	mapping one or more codes separated by spaces
	461
	462	The meaning of the I<status> is as follows:
	463
	464	C common case folding, common mappings shared
	465	by both simple and full mappings
	466	F full case folding, mappings that cause strings
	467	to grow in length. Multiple characters are separated
	468	by spaces
	469	S simple case folding, mappings to single characters
	470	where different from F
	471	I special case for dotted uppercase I and
	472	dotless lowercase i
	473	- If this mapping is included, the result is
	474	case-insensitive, but dotless and dotted I's
	475	are not distinguished
	476	- If this mapping is excluded, the result is not
	477	fully case-insensitive, but dotless and dotted
	478	I's are distinguished
	479
	480	If there is no case folding for that character, C<undef> is returned.
	481
	482	For more information about case mappings see
	483	http://www.unicode.org/unicode/reports/tr21/
	484
	485	=cut
	486
	487	my %CASEFOLD;
	488
	489	sub _casefold {
	490	unless (%CASEFOLD) {
	491	if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
	492	while (<$CASEFOLDFH>) {
	493	if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
	494	my $code = hex($1);
	495	$CASEFOLD{$code} = { code => $1,
	496	status => $2,
	497	mapping => $3 };
	498	}
	499	}
	500	close($CASEFOLDFH);
	501	}
	502	}
	503	}
	504
	505	sub casefold {
	506	my $arg = shift;
	507	my $code = _getcode($arg);
	508
	509	_casefold() unless %CASEFOLD;
	510
511	return $CASEFOLD{$code};
512	}
513
514	=head2 casespec
515
1189d1e4	516	use UnicodeCD 'casespec';
b08cd201	517
	518	my %casespec = casespec("09dc");
	519
	520	The casespec() returns the potentially locale-dependent case mapping
	521	of the character specified by a B<code point argument>. The mapping
	522	may change the length of the string (which the basic Unicode case
	523	mappings as returned by charinfo() never do).
	524
	525	If there is a case folding for that character, a reference to a hash
	526	with the following fields is returned:
	527
	528	key
	529
	530	code code point with at least four hexdigits
	531	lower lowercase
	532	title titlecase
	533	upper uppercase
	534	condition condition list (may be undef)
	535
	536	The C<condition> is optional. Where present, it consists of one or
	537	more I<locales> or I<contexts>, separated by spaces (other than as
	538	used to separate elements, spaces are to be ignored). A condition
	539	list overrides the normal behavior if all of the listed conditions are
	540	true. Case distinctions in the condition list are not significant.
	541	Conditions preceded by "NON_" represent the negation of the condition
	542
	543	A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d	544	followed by a "_" and a 2-letter ISO language code (possibly followed
	545	by a "_" and a variant code). You can find the lists of those codes,
	546	see L<Locale::Country> and L<Locale::Language>.
b08cd201	547
	548	A I<context> is one of the following choices:
	549
	550	FINAL The letter is not followed by a letter of
	551	general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
	552	MODERN The mapping is only used for modern text
e618509d	553	AFTER_i The last base character was "i" (U+0069)
b08cd201	554
	555	For more information about case mappings see
	556	http://www.unicode.org/unicode/reports/tr21/
	557
	558	=cut
	559
	560	my %CASESPEC;
	561
	562	sub _casespec {
	563	unless (%CASESPEC) {
	564	if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
	565	while (<$CASESPECFH>) {
	566	if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; (\w+(?: \w+))?/) {
	567	my $code = hex($1);
	568	$CASESPEC{$code} = { code => $1,
	569	lower => $2,
	570	title => $3,
	571	upper => $4,
	572	condition => $5 };
	573	}
	574	}
	575	close($CASESPECFH);
	576	}
	577	}
	578	}
	579
	580	sub casespec {
	581	my $arg = shift;
	582	my $code = _getcode($arg);
	583
	584	_casespec() unless %CASESPEC;
	585
	586	return $CASESPEC{$code};
	587	}
	588
1189d1e4	589	=head2 UnicodeCD::UnicodeVersion
10a6ecd2	590
1189d1e4	591	UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
10a6ecd2	592	Database, in other words, the version of the Unicode standard the
	593	database implements.
	594
	595	=cut
	596
	597	my $UNICODEVERSION;
	598
	599	sub UnicodeVersion {
	600	unless (defined $UNICODEVERSION) {
	601	openunicode(\$VERSIONFH, "version");
	602	chomp($UNICODEVERSION = <$VERSIONFH>);
	603	close($VERSIONFH);
	604	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	605	unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
	606	}
	607	return $UNICODEVERSION;
	608	}
3aa957f9	609
3aa957f9	610	=head2 Implementation Note
32c16050	611
ad9cab37	612	The first use of charinfo() opens a read-only filehandle to the Unicode
	613	Character Database (the database is included in the Perl distribution).
	614	The filehandle is then kept open for further queries.
32c16050	615
561c79ed	616	=head1 AUTHOR
	617
	618	Jarkko Hietaniemi
	619
	620	=cut
	621
	622	1;