[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '0.1';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo
		    charblock charscript
		    charblocks charscripts
		    charinrange
		    compexcl
		    casefold casespec);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 'charinfo';
    my $charinfo   = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock  = charblock($codepoint);

    use Unicode::UCD 'charscript';
    my $charscript = charscript($codepoint);

=head1 DESCRIPTION

The Unicode module offers a simple interface to the Unicode Character
Database.

=cut

# File-scoped filehandle slots for the Unicode data files.  Each one is
# handed by reference to openunicode(), which opens it when it is still
# undefined.

# Handle for "Unicode.txt" (read by charinfo).
my $UNICODEFH;
# Handle for "Blocks.txt" (read by _charblocks).
my $BLOCKSFH;
# Handle for "Scripts.txt" (read by _charscripts).
my $SCRIPTSFH;
# Handle for "version" (read by UnicodeVersion).
my $VERSIONFH;
# Handle for "CompExcl.txt" (read by _compexcl).
my $COMPEXCLFH;
# Handle for "CaseFold.txt" (read by _casefold).
my $CASEFOLDFH;
# Handle for "SpecCase.txt" (read by _casespec).
my $CASESPECFH;

# Locate and open one of the Unicode data files for reading.
#
#   $rfh  - reference to the scalar that holds (or will hold) the filehandle
#   @path - path components of the file below a "unicode" directory
#
# Every directory in @INC is tried in order until "<dir>/unicode/<path>"
# can be opened.  Returns the full path of the file that was opened.  When
# $$rfh is already defined nothing is opened and undef is returned.
# Croaks if the file cannot be opened below any @INC directory.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            use File::Spec;
            $f = File::Spec->catfile($d, "unicode", @path);
            # Three-argument open: the path is never parsed for mode or
            # pipe characters, it is only ever read as a plain file.
            last if open($$rfh, '<', $f);
            undef $f;
        }
        unless (defined $f) {
            # A failed open() still autovivifies the handle slot; reset it
            # so that a later call searches again instead of silently
            # treating the file as already open.
            undef $$rfh;
            croak __PACKAGE__, ": failed to find ",
                  File::Spec->catfile(@path), " in @INC";
        }
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my $charinfo = charinfo(0x41);

charinfo() returns a reference to a hash that has the following fields
as defined by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to 

If no match is found, nothing is returned (C<undef> in scalar context).

The C<block> property is the same as returned by charblock().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
of TUS3).  Similarly for the C<script> property.

Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
you will need also the compexcl(), casefold(), and casespec() functions.

=cut

# Convert a "code point argument" into a plain number.
#
# Accepted forms are a string of decimal digits, or hexadecimal digits
# optionally prefixed with "U+" or "0x".  A string consisting only of
# decimal digits is always taken as decimal.
#
# Returns the numeric code point, or nothing (undef in scalar context)
# when the argument is undefined or is not in one of those forms.
sub _getcode {
    my $arg = shift;

    # An undefined argument can never be a code point; bail out before the
    # pattern matches would warn about it.
    return unless defined $arg;

    if ($arg =~ /^[0-9]+$/) {
        # Normalise to a real number so that "0065" and 65 produce the
        # same hash key in the lookup tables.
        return $arg + 0;
    } elsif ($arg =~ /^(?:U\+|0x)?([0-9A-Fa-f]+)$/) {
        return hex($1);
    }

    return;
}

# Look up the Unicode.txt record of one character.
#
#   $arg - a code point argument as understood by _getcode()
#
# Returns a reference to a hash holding the fields of the record plus the
# "block" and "script" entries obtained from charblock() and charscript().
# Returns nothing (undef in scalar context) when no record for the code
# point is found.  Croaks when $arg is not a recognisable code point.
sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
        unless defined $code;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODEFH, "Unicode.txt");
    return unless defined $UNICODEFH;

    # Both look() and the read below are line based; do not depend on
    # whatever record separator the caller happens to have set.
    local $/ = "\n";

    use Search::Dict;
    # NOTE(review): look() compares lines as strings; presumably the data
    # file is ordered so that this works for the keys used here -- verify
    # for code points that need more than four hex digits.
    return unless look($UNICODEFH, "$hexk;") >= 0;

    my $line = <$UNICODEFH>;
    # look() may leave the handle at end of file, in which case there is
    # no line to examine.
    return unless defined $line;
    chomp $line;

    my %prop;
    @prop{qw(
             code name category
             combining bidi decomposition
             decimal digit numeric
             mirrored unicode10 comment
             upper lower title
            )} = split(/;/, $line, -1);

    # look() only positions at the first line that is not less than the
    # key, so make sure it really is the record that was asked for.
    if ($prop{code} eq $hexk) {
        $prop{block}  = charblock($code);
        $prop{script} = charscript($code);
        return \%prop;
    }

    return;
}

# Binary search in a [[lo,hi,prop],[...],...] table.
#
#   $table    - reference to an array of [start, end, property] entries
#   $lo, $hi  - first and last index of the part of the table to search
#   $code     - the number to look for
#
# Returns the property of the entry whose start..end range contains
# $code, or nothing (undef in scalar context) if no entry does.
sub _search {
    my ($table, $lo, $hi, $code) = @_;

    until ($lo > $hi) {
        my $mid   = int(($lo + $hi) / 2);
        my $start = $table->[$mid][0];

        if ($start < $code && !($table->[$mid][1] >= $code)) {
            # The whole entry lies below $code: continue in the upper half.
            $lo = $mid + 1;
        } elsif ($start > $code) {
            # The entry starts above $code: continue in the lower half.
            $hi = $mid - 1;
        } else {
            # $code is the start of the entry or falls inside its range.
            return $table->[$mid][2];
        }
    }

    return;
}

# Test whether a code point falls inside a range table.
#
#   $range - reference to an array of [start, end, property] entries
#   $arg   - a code point argument as understood by _getcode()
#
# Returns whatever _search() finds for the code point in the whole table.
# Croaks when $arg is not a recognisable code point.
sub charinrange {
    my ($range, $arg) = @_;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::charinrange: unknown code '$arg'";

    my $last_index = $#$range;
    return _search($range, 0, $last_index, $code);
}

=head2 charblock

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);
    my $charblock = charblock(1234);
    my $charblock = charblock("0x263a");
    my $charblock = charblock("U+263a");

    my $ranges    = charblock('Armenian');

With a B<code point argument> charblock() returns the block the character
belongs to, e.g.  C<Basic Latin>.  Note that not all the character
positions within all blocks are defined.

If supplied with an argument that can't be a code point, charblock()
tries to do the opposite and interpret the argument as a character
block.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known character block, C<undef> is returned.

=cut

# @BLOCKS holds one [start, end, name] entry per line of Blocks.txt, in
# file order; %BLOCKS maps a block name to the list of its entries.
my @BLOCKS;
my %BLOCKS;

# Fill @BLOCKS and %BLOCKS from Blocks.txt unless @BLOCKS is already
# populated.  Lines that do not look like "XXXX..YYYY; Name" are skipped.
# NOTE(review): the entries are not sorted here although _search() does a
# binary search on @BLOCKS -- presumably the file is already in ascending
# order; verify.
sub _charblocks {
    return if @BLOCKS;
    return unless openunicode(\$BLOCKSFH, "Blocks.txt");

    # Read into a lexical and pin the record separator: the caller's $_
    # (which may be aliased to the caller's own data) and $/ must not
    # affect, or be affected by, the loading of the table.
    local $/ = "\n";
    while (my $line = <$BLOCKSFH>) {
        if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
            my ($lo, $hi, $name) = (hex($1), hex($2), $3);
            my $subrange = [ $lo, $hi, $name ];
            push @BLOCKS, $subrange;
            push @{$BLOCKS{$name}}, $subrange;
        }
    }
    close($BLOCKSFH);
    return;
}

# Map a code point to its block name, or a block name to its ranges.
#
#   $arg - a code point argument, or the name of a block
#
# For a code point the result of searching @BLOCKS is returned.  Any
# other argument is looked up as a key of %BLOCKS and the stored list of
# ranges is returned; nothing is returned for an unknown name.
sub charblock {
    my $arg = shift;

    @BLOCKS or _charblocks();

    my $code = _getcode($arg);
    return _search(\@BLOCKS, 0, $#BLOCKS, $code)
        if defined $code;

    return $BLOCKS{$arg}
        if exists $BLOCKS{$arg};

    return;
}

=head2 charscript

    use Unicode::UCD 'charscript';

    my $charscript = charscript(0x41);
    my $charscript = charscript(1234);
    my $charscript = charscript("U+263a");

    my $ranges     = charscript('Thai');

With a B<code point argument> charscript() returns the script the
character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.

If supplied with an argument that can't be a code point, charscript()
tries to do the opposite and interpret the argument as a character
script.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known character script, C<undef> is returned.

=cut

# @SCRIPTS holds one [start, end, name] entry per matching line of
# Scripts.txt, sorted by start; %SCRIPTS maps a script name to the list of
# its entries in file order.
my @SCRIPTS;
my %SCRIPTS;

# Fill @SCRIPTS and %SCRIPTS from Scripts.txt unless @SCRIPTS is already
# populated.  A line gives either a single code point or a
# "XXXX..YYYY" range, followed by ";" and the script name.  The name is
# lower-cased and then the first letter of every word is upper-cased.
sub _charscripts {
    return if @SCRIPTS;
    return unless openunicode(\$SCRIPTSFH, "Scripts.txt");

    # Read into a lexical and pin the record separator: the caller's $_
    # (which may be aliased to the caller's own data) and $/ must not
    # affect, or be affected by, the loading of the table.
    local $/ = "\n";
    while (my $line = <$SCRIPTSFH>) {
        if ($line =~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
            my $lo = hex($1);
            # A line without a ".." part describes a single code point.
            my $hi = defined $2 ? hex($2) : $lo;
            my $script = lc($3);
            $script =~ s/\b(\w)/uc($1)/ge;
            my $subrange = [ $lo, $hi, $script ];
            push @SCRIPTS, $subrange;
            push @{$SCRIPTS{$script}}, $subrange;
        }
    }
    close($SCRIPTSFH);

    # The binary search in _search() needs the table ordered by start.
    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
    return;
}

# Map a code point to its script name, or a script name to its ranges.
#
#   $arg - a code point argument, or the name of a script
#
# For a code point the result of searching @SCRIPTS is returned.  Any
# other argument is looked up as a key of %SCRIPTS and the stored list of
# ranges is returned; nothing is returned for an unknown name.
sub charscript {
    my $arg = shift;

    @SCRIPTS or _charscripts();

    my $code = _getcode($arg);
    return _search(\@SCRIPTS, 0, $#SCRIPTS, $code)
        if defined $code;

    return $SCRIPTS{$arg}
        if exists $SCRIPTS{$arg};

    return;
}

=head2 charblocks

    use Unicode::UCD 'charblocks';

    my $charblocks = charblocks();

charblocks() returns a reference to a hash with the known block names
as the keys, and the code point ranges (see L</charblock>) as the values.

=cut

# Return a reference to the block-name => ranges hash, loading the block
# table first if the hash is still empty.
sub charblocks {
    %BLOCKS or _charblocks();
    return \%BLOCKS;
}

=head2 charscripts

    use Unicode::UCD 'charscripts';

    my $charscripts = charscripts();

charscripts() returns a reference to a hash with the known script names
as the keys, and the code point ranges (see L</charscript>) as the values.

=cut

# Return a reference to the script-name => ranges hash, loading the
# script table first if the hash is still empty.
sub charscripts {
    %SCRIPTS or _charscripts();
    return \%SCRIPTS;
}

=head2 Blocks versus Scripts

The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
numbering and separation into blocks of 256 characters.

For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
C<Latin Extended-B>.  On the other hand, the Latin script does not
contain all the characters of the C<Basic Latin> block (also known as
the ASCII): it includes only the letters, not for example the digits
or the punctuation.

For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt

For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/

=head2 Matching Scripts and Blocks

Both scripts and blocks can be matched using the regular expression
construct C<\p{In...}> and its negation C<\P{In...}>.

The name of the script or the block comes after the C<In>, for example
C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
removed from the names for the C<\p{In...}>, for example
C<LatinExtendedA> instead of C<Latin Extended-A>.

There are a few cases where there exists both a script and a block by
the same name, in these cases the block version has C<Block> appended:
C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.

=head2 Code Point Arguments

A B<code point argument> is either a decimal or a hexadecimal scalar,
or "U+" followed by hexadecimals.

=head2 charinrange

In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
can also test whether a code point is in the I<range> as returned by
L</charblock> and L</charscript> or as the values of the hash returned
by L</charblocks> and L</charscripts> by using charinrange():

    use Unicode::UCD qw(charscript charinrange);

    $range = charscript('Hiragana');
    print "looks like hiragana\n" if charinrange($range, $code);

=cut

=head2 compexcl

    use Unicode::UCD 'compexcl';

    my $compexcl = compexcl("09dc");

The compexcl() returns the composition exclusion (that is, if the
character cannot be decomposed) of the character specified by a B<code
point argument>.

If there is a composition exclusion for the character, true is
returned.  Otherwise, false is returned.

=cut

# Set of code points (numeric keys, undef values) listed in CompExcl.txt.
my %COMPEXCL;

# Fill %COMPEXCL from CompExcl.txt unless it is already populated.  Only
# lines that start with a hexadecimal code followed by " # " are used.
sub _compexcl {
    return if %COMPEXCL;
    return unless openunicode(\$COMPEXCLFH, "CompExcl.txt");

    # Read into a lexical and pin the record separator: the caller's $_
    # (which may be aliased to the caller's own data) and $/ must not
    # affect, or be affected by, the loading of the table.
    local $/ = "\n";
    while (my $line = <$COMPEXCLFH>) {
        if ($line =~ /^([0-9A-F]+) \# /) {
            my $code = hex($1);
            # Only the existence of the key matters.
            $COMPEXCL{$code} = undef;
        }
    }
    close($COMPEXCLFH);
    return;
}

# Tell whether a character is listed in the composition exclusion table.
#
#   $arg - a code point argument as understood by _getcode()
#
# Returns true if the code point is a key of %COMPEXCL, false otherwise.
# Croaks when $arg is not a recognisable code point, in the same way as
# charinfo() and charinrange() do, instead of looking up an undefined key.
sub compexcl {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
        unless defined $code;

    _compexcl() unless %COMPEXCL;

    return exists $COMPEXCL{$code};
}

=head2 casefold

    use Unicode::UCD 'casefold';

    my $casefold = casefold("09dc");

The casefold() returns the locale-independent case folding of the
character specified by a B<code point argument>.

If there is a case folding for that character, a reference to a hash
with the following fields is returned:

    key

    code             code point with at least four hexdigits
    status           "C", "F", "S", or "I"
    mapping          one or more codes separated by spaces

The meaning of the I<status> is as follows:

   C                 common case folding, common mappings shared
                     by both simple and full mappings
   F                 full case folding, mappings that cause strings
                     to grow in length. Multiple characters are separated
                     by spaces
   S                 simple case folding, mappings to single characters
                     where different from F
   I                 special case for dotted uppercase I and
                     dotless lowercase i
                     - If this mapping is included, the result is
                       case-insensitive, but dotless and dotted I's
                       are not distinguished
                     - If this mapping is excluded, the result is not
                       fully case-insensitive, but dotless and dotted
                       I's are distinguished

If there is no case folding for that character, C<undef> is returned.

For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/

=cut

# Maps a numeric code point to a { code, status, mapping } hash taken from
# CaseFold.txt.
my %CASEFOLD;

# Fill %CASEFOLD from CaseFold.txt unless it is already populated.  Lines
# have the form "CODE; STATUS; MAPPING;" where STATUS is one of C, F, S
# or I and MAPPING is one or more space-separated hexadecimal codes.  If a
# code occurs on several lines the last one read is the one that is kept.
sub _casefold {
    return if %CASEFOLD;
    return unless openunicode(\$CASEFOLDFH, "CaseFold.txt");

    # Read into a lexical and pin the record separator: the caller's $_
    # (which may be aliased to the caller's own data) and $/ must not
    # affect, or be affected by, the loading of the table.
    local $/ = "\n";
    while (my $line = <$CASEFOLDFH>) {
        if (my ($hex, $status, $mapping) =
                $line =~ /^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
            $CASEFOLD{ hex($hex) } = { code    => $hex,
                                       status  => $status,
                                       mapping => $mapping };
        }
    }
    close($CASEFOLDFH);
    return;
}

# Return the case folding record of a character.
#
#   $arg - a code point argument as understood by _getcode()
#
# Returns the { code, status, mapping } hash reference stored in %CASEFOLD
# for the code point, or undef if there is none.  Croaks when $arg is not
# a recognisable code point, in the same way as charinfo() and
# charinrange() do, instead of looking up an undefined key.
sub casefold {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casefold: unknown code '$arg'"
        unless defined $code;

    _casefold() unless %CASEFOLD;

    return $CASEFOLD{$code};
}

=head2 casespec

    use Unicode::UCD 'casespec';

    my $casespec = casespec("09dc");

The casespec() returns the potentially locale-dependent case mapping
of the character specified by a B<code point argument>.  The mapping
may change the length of the string (which the basic Unicode case
mappings as returned by charinfo() never do).

If there is a special case mapping for that character, a reference to a
hash with the following fields is returned:

    key

    code             code point with at least four hexdigits
    lower            lowercase
    title            titlecase
    upper            uppercase
    condition        condition list (may be undef)

The C<condition> is optional.  Where present, it consists of one or
more I<locales> or I<contexts>, separated by spaces (other than as
used to separate elements, spaces are to be ignored).  A condition
list overrides the normal behavior if all of the listed conditions are
true.  Case distinctions in the condition list are not significant.
Conditions preceded by "NON_" represent the negation of the condition.

A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
followed by a "_" and a 2-letter ISO language code (possibly followed
by a "_" and a variant code).  You can find the list of those codes
in L<Locale::Country> and L<Locale::Language>.

A I<context> is one of the following choices:

    FINAL            The letter is not followed by a letter of
                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
    MODERN           The mapping is only used for modern text
    AFTER_i          The last base character was "i" 0069

For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/

=cut

# Maps a numeric code point to a { code, lower, title, upper, condition }
# hash taken from SpecCase.txt.
my %CASESPEC;

# Fill %CASESPEC from SpecCase.txt unless it is already populated.  Lines
# have the form "CODE; LOWER; TITLE; UPPER; CONDITION" where each of the
# last four fields may be empty, in which case the corresponding hash
# value is undef.  If a code occurs on several lines the last one read is
# the one that is kept.
sub _casespec {
    return if %CASESPEC;
    return unless openunicode(\$CASESPECFH, "SpecCase.txt");

    # Read into a lexical and pin the record separator: the caller's $_
    # (which may be aliased to the caller's own data) and $/ must not
    # affect, or be affected by, the loading of the table.
    local $/ = "\n";
    while (my $line = <$CASESPECFH>) {
        if (my ($hex, $lower, $title, $upper, $condition) =
                $line =~ /^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
            $CASESPEC{ hex($hex) } = { code      => $hex,
                                       lower     => $lower,
                                       title     => $title,
                                       upper     => $upper,
                                       condition => $condition };
        }
    }
    close($CASESPECFH);
    return;
}

# Return the special case mapping record of a character.
#
#   $arg - a code point argument as understood by _getcode()
#
# Returns the { code, lower, title, upper, condition } hash reference
# stored in %CASESPEC for the code point, or undef if there is none.
# Croaks when $arg is not a recognisable code point, in the same way as
# charinfo() and charinrange() do, instead of looking up an undefined key.
sub casespec {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casespec: unknown code '$arg'"
        unless defined $code;

    _casespec() unless %CASESPEC;

    return $CASESPEC{$code};
}

=head2 Unicode::UCD::UnicodeVersion

Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
Database, in other words, the version of the Unicode standard the
database implements.

=cut

# Cached version string; set only once a well-formed value has been read.
my $UNICODEVERSION;

# Return the version string stored in the first line of the "version"
# data file.  The value is read on the first call and cached afterwards.
# Croaks if the line does not look like a dotted number ("3.0", "3.0.1").
sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
        openunicode(\$VERSIONFH, "version");

        # Read exactly one newline-terminated line whatever the caller's
        # record separator is.
        local $/ = "\n";
        my $version = <$VERSIONFH>;
        close($VERSIONFH);
        # Forget the closed handle so that a later call reopens the file
        # instead of reading from a closed handle.
        undef $VERSIONFH;

        # An empty file yields no line at all.
        $version = '' unless defined $version;
        chomp $version;

        croak __PACKAGE__, "::VERSION: strange version '$version'"
            unless $version =~ /^\d+(?:\.\d+)+$/;

        # Cache only after validation, so a malformed value is never
        # handed out by a later call.
        $UNICODEVERSION = $version;
    }
    return $UNICODEVERSION;
}

=head2 Implementation Note

The first use of charinfo() opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
561c79ed	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
10a6ecd2	6	our $VERSION = '0.1';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
10a6ecd2	11	our @EXPORT_OK = qw(charinfo
	12	charblock charscript
	13	charblocks charscripts
b08cd201	14	charinrange
	15	compexcl
	16	casefold casespec);
561c79ed	17
	18	use Carp;
	19
	20	=head1 NAME
	21
00f2772c	22	Unicode::UCD - Unicode character database
561c79ed	23
	24	=head1 SYNOPSIS
	25
561c79ed	26	use Unicode::UCD 'charinfo';
b08cd201	27	my $charinfo = charinfo($codepoint);
561c79ed	28
561c79ed	29	use Unicode::UCD 'charblock';
e882dd67	30	my $charblock = charblock($codepoint);
	31
	32	use Unicode::UCD 'charscript';
	33	my $charscript = charblock($codepoint);
561c79ed	34
	35	=head1 DESCRIPTION
	36
	37	The Unicode module offers a simple interface to the Unicode Character
	38	Database.
	39
	40	=cut
	41
10a6ecd2	42	my $UNICODEFH;
	43	my $BLOCKSFH;
	44	my $SCRIPTSFH;
	45	my $VERSIONFH;
b08cd201	46	my $COMPEXCLFH;
	47	my $CASEFOLDFH;
	48	my $CASESPECFH;
561c79ed	49
	50	sub openunicode {
	51	my ($rfh, @path) = @_;
	52	my $f;
	53	unless (defined $$rfh) {
	54	for my $d (@INC) {
	55	use File::Spec;
	56	$f = File::Spec->catfile($d, "unicode", @path);
32c16050	57	last if open($$rfh, $f);
e882dd67	58	undef $f;
561c79ed	59	}
e882dd67	60	croak __PACKAGE__, ": failed to find ",
	61	File::Spec->catfile(@path), " in @INC"
	62	unless defined $f;
561c79ed	63	}
	64	return $f;
	65	}
	66
	67	=head2 charinfo
	68
	69	use Unicode::UCD 'charinfo';
	70
b08cd201	71	my $charinfo = charinfo(0x41);
561c79ed	72
b08cd201	73	charinfo() returns a reference to a hash that has the following fields
b08cd201	74	as defined by the Unicode standard:
561c79ed	75
	76	key
	77
	78	code code point with at least four hexdigits
	79	name name of the character IN UPPER CASE
	80	category general category of the character
	81	combining classes used in the Canonical Ordering Algorithm
	82	bidi bidirectional category
	83	decomposition character decomposition mapping
	84	decimal if decimal digit this is the integer numeric value
	85	digit if digit this is the numeric value
	86	numeric if numeric is the integer or rational numeric value
	87	mirrored if mirrored in bidirectional text
	88	unicode10 Unicode 1.0 name if existed and different
	89	comment ISO 10646 comment field
	90	upper uppercase equivalent mapping
	91	lower lowercase equivalent mapping
	92	title titlecase equivalent mapping
e882dd67	93
561c79ed	94	block block the character belongs to (used in \p{In...})
e882dd67	95	script script the character belongs to
561c79ed	96
b08cd201	97	If no match is found, a reference to an empty hash is returned.
561c79ed	98
32c16050	99	The C<block> property is the same as as returned by charinfo(). It is
	100	not defined in the Unicode Character Database proper (Chapter 4 of the
	101	Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67	102	of TUS3). Similarly for the C<script> property.
32c16050	103
	104	Note that you cannot do (de)composition and casing based solely on the
	105	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201	106	you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed	107
	108	=cut
	109
10a6ecd2	110	sub _getcode {
	111	my $arg = shift;
	112
	113	if ($arg =~ /^\d+$/) {
	114	return $arg;
	115	} elsif ($arg =~ /^(?:U\+\|0x)?([[:xdigit:]]+)$/) {
	116	return hex($1);
	117	}
	118
	119	return;
	120	}
	121
561c79ed	122	sub charinfo {
10a6ecd2	123	my $arg = shift;
	124	my $code = _getcode($arg);
	125	croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	126	unless defined $code;
561c79ed	127	my $hexk = sprintf("%04X", $code);
561c79ed	128
10a6ecd2	129	openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2	130	if (defined $UNICODEFH) {
561c79ed	131	use Search::Dict;
10a6ecd2	132	if (look($UNICODEFH, "$hexk;") >= 0) {
10a6ecd2	133	my $line = <$UNICODEFH>;
561c79ed	134	chomp $line;
	135	my %prop;
	136	@prop{qw(
	137	code name category
	138	combining bidi decomposition
	139	decimal digit numeric
	140	mirrored unicode10 comment
	141	upper lower title
	142	)} = split(/;/, $line, -1);
	143	if ($prop{code} eq $hexk) {
a196fbfd	144	$prop{block} = charblock($code);
a196fbfd	145	$prop{script} = charscript($code);
b08cd201	146	return \%prop;
561c79ed	147	}
	148	}
	149	}
	150	return;
	151	}
	152
e882dd67	153	sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
	154	my ($table, $lo, $hi, $code) = @_;
	155
	156	return if $lo > $hi;
	157
	158	my $mid = int(($lo+$hi) / 2);
	159
	160	if ($table->[$mid]->[0] < $code) {
10a6ecd2	161	if ($table->[$mid]->[1] >= $code) {
e882dd67	162	return $table->[$mid]->[2];
	163	} else {
	164	_search($table, $mid + 1, $hi, $code);
	165	}
	166	} elsif ($table->[$mid]->[0] > $code) {
	167	_search($table, $lo, $mid - 1, $code);
	168	} else {
	169	return $table->[$mid]->[2];
	170	}
	171	}
	172
10a6ecd2	173	sub charinrange {
	174	my ($range, $arg) = @_;
	175	my $code = _getcode($arg);
	176	croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	177	unless defined $code;
	178	_search($range, 0, $#$range, $code);
	179	}
	180
354a27bf	181	=head2 charblock
561c79ed	182
	183	use Unicode::UCD 'charblock';
	184
	185	my $charblock = charblock(0x41);
10a6ecd2	186	my $charblock = charblock(1234);
	187	my $charblock = charblock("0x263a");
	188	my $charblock = charblock("U+263a");
	189
	190	my $ranges = charblock('Armenian');
	191
	192	With a B<code point argument> charblock() returns the block the character
	193	belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201	194	positions within all blocks are defined.
10a6ecd2	195
	196	If supplied with an argument that can't be a code point, charblock()
	197	tries to do the opposite and interpret the argument as a character
	198	block. The return value is a I<range>: an anonymous list that
	199	contains anonymous lists, which in turn contain I<start-of-range>,
	200	I<end-of-range> code point pairs. You can test whether a code point
	201	is in a range using the L</charinrange> function. If the argument is
	202	not a known charater block, C<undef> is returned.
561c79ed	203
561c79ed	204	=cut
	205
	206	my @BLOCKS;
10a6ecd2	207	my %BLOCKS;
561c79ed	208
10a6ecd2	209	sub _charblocks {
561c79ed	210	unless (@BLOCKS) {
10a6ecd2	211	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
10a6ecd2	212	while (<$BLOCKSFH>) {
2796c109	213	if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2	214	my ($lo, $hi) = (hex($1), hex($2));
	215	my $subrange = [ $lo, $hi, $3 ];
	216	push @BLOCKS, $subrange;
	217	push @{$BLOCKS{$3}}, $subrange;
561c79ed	218	}
561c79ed	219	}
10a6ecd2	220	close($BLOCKSFH);
561c79ed	221	}
561c79ed	222	}
10a6ecd2	223	}
	224
	225	sub charblock {
	226	my $arg = shift;
	227
	228	_charblocks() unless @BLOCKS;
	229
	230	my $code = _getcode($arg);
561c79ed	231
10a6ecd2	232	if (defined $code) {
	233	_search(\@BLOCKS, 0, $#BLOCKS, $code);
	234	} else {
	235	if (exists $BLOCKS{$arg}) {
	236	return $BLOCKS{$arg};
	237	} else {
	238	return;
	239	}
	240	}
e882dd67	241	}
	242
	243	=head2 charscript
	244
	245	use Unicode::UCD 'charscript';
	246
	247	my $charscript = charscript(0x41);
10a6ecd2	248	my $charscript = charscript(1234);
10a6ecd2	249	my $charscript = charscript("U+263a");
e882dd67	250
10a6ecd2	251	my $ranges = charscript('Thai');
	252
	253	With a B<code point argument> charscript() returns the script the
b08cd201	254	character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2	255
	256	If supplied with an argument that can't be a code point, charscript()
	257	tries to do the opposite and interpret the argument as a character
	258	script. The return value is a I<range>: an anonymous list that
	259	contains anonymous lists, which in turn contain I<start-of-range>,
	260	I<end-of-range> code point pairs. You can test whether a code point
	261	is in a range using the L</charinrange> function. If the argument is
	262	not a known charater script, C<undef> is returned.
e882dd67	263
e882dd67	264	=cut
	265
	266	my @SCRIPTS;
10a6ecd2	267	my %SCRIPTS;
e882dd67	268
10a6ecd2	269	sub _charscripts {
e882dd67	270	unless (@SCRIPTS) {
10a6ecd2	271	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
10a6ecd2	272	while (<$SCRIPTSFH>) {
e882dd67	273	if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2	274	my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
	275	my $script = lc($3);
	276	$script =~ s/\b(\w)/uc($1)/ge;
	277	my $subrange = [ $lo, $hi, $script ];
	278	push @SCRIPTS, $subrange;
	279	push @{$SCRIPTS{$script}}, $subrange;
e882dd67	280	}
e882dd67	281	}
10a6ecd2	282	close($SCRIPTSFH);
e882dd67	283	@SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	284	}
	285	}
10a6ecd2	286	}
	287
	288	sub charscript {
	289	my $arg = shift;
	290
	291	_charscripts() unless @SCRIPTS;
e882dd67	292
10a6ecd2	293	my $code = _getcode($arg);
	294
	295	if (defined $code) {
	296	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
	297	} else {
	298	if (exists $SCRIPTS{$arg}) {
	299	return $SCRIPTS{$arg};
	300	} else {
	301	return;
	302	}
	303	}
	304	}
	305
	306	=head2 charblocks
	307
	308	use Unicode::UCD 'charblocks';
	309
b08cd201	310	my $charblocks = charblocks();
10a6ecd2	311
b08cd201	312	charblocks() returns a reference to a hash with the known block names
b08cd201	313	as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2	314
	315	=cut
	316
	317	sub charblocks {
b08cd201	318	_charblocks() unless %BLOCKS;
b08cd201	319	return \%BLOCKS;
10a6ecd2	320	}
	321
	322	=head2 charscripts
	323
	324	use Unicode::UCD 'charscripts';
	325
	326	my %charscripts = charscripts();
	327
	328	charscripts() returns a hash with the known script names as the keys,
	329	and the code point ranges (see L</charscript>) as the values.
	330
	331	=cut
	332
	333	sub charscripts {
b08cd201	334	_charscripts() unless %SCRIPTS;
b08cd201	335	return \%SCRIPTS;
561c79ed	336	}
561c79ed	337
10a6ecd2	338	=head2 Blocks versus Scripts
ad9cab37	339
10a6ecd2	340	The difference between a block and a script is that scripts are closer
	341	to the linguistic notion of a set of characters required to present
	342	languages, while block is more of an artifact of the Unicode character
	343	numbering and separation into blocks of 256 characters.
3aa957f9	344
	345	For example the Latin B<script> is spread over several B<blocks>, such
	346	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
	347	C<Latin Extended-B>. On the other hand, the Latin script does not
	348	contain all the characters of the C<Basic Latin> block (also known as
	349	the ASCII): it includes only the letters, not for example the digits
	350	or the punctuation.
ad9cab37	351
3aa957f9	352	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37	353
	354	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
	355
3aa957f9	356	=head2 Matching Scripts and Blocks
	357
	358	Both scripts and blocks can be matched using the regular expression
	359	construct C<\p{In...}> and its negation C<\P{In...}>.
	360
	361	The name of the script or the block comes after the C<In>, for example
	362	C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2	363	removed from the names for the C<\p{In...}>, for example
	364	C<LatinExtendedA> instead of C<Latin Extended-A>.
	365
	366	There are a few cases where there exists both a script and a block by
	367	the same name, in these cases the block version has C<Block> appended:
	368	C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
	369
b08cd201	370	=head2 Code Point Arguments
	371
	372	A <code point argument> is either a decimal or a hexadecimal scalar,
	373	or "U+" followed by hexadecimals.
	374
10a6ecd2	375	=head2 charinrange
	376
	377	In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
	378	can also test whether a code point is in the I<range> as returned by
	379	L</charblock> and L</charscript> or as the values of the hash returned
	380	by L</charblocks> and </charscripts> by using charinrange():
	381
	382	use Unicode::UCD qw(charscript charinrange);
	383
	384	$range = charscript('Hiragana');
	385	print "looks like hiragana\n" if charinrange($range, $code);
	386
	387	=cut
	388
b08cd201	389	=head2 compexcl
	390
	391	use Unicode::UCD 'compexcl';
	392
	393	my $compexcl = compexcl("09dc");
	394
	395	The compexcl() returns the composition exclusion (that is, if the
	396	character cannot be decomposed) of the character specified by a B<code
	397	point argument>.
	398
	399	If there is a composition exclusion for the character, true is
	400	returned. Otherwise, false is returned.
	401
	402	=cut
	403
	404	my %COMPEXCL;
	405
	406	sub _compexcl {
	407	unless (%COMPEXCL) {
	408	if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
	409	while (<$COMPEXCLFH>) {
	410	if (/^([0-9A-F]+) \# /) {
	411	my $code = hex($1);
	412	$COMPEXCL{$code} = undef;
	413	}
	414	}
	415	close($COMPEXCLFH);
	416	}
	417	}
	418	}
	419
	420	sub compexcl {
	421	my $arg = shift;
	422	my $code = _getcode($arg);
	423
	424	_compexcl() unless %COMPEXCL;
	425
	426	return exists $COMPEXCL{$code};
	427	}
	428
	429	=head2 casefold
	430
	431	use Unicode::UCD 'casefold';
	432
	433	my %casefold = casefold("09dc");
	434
	435	The casefold() returns the locale-independent case folding of the
	436	character specified by a B<code point argument>.
	437
	438	If there is a case folding for that character, a reference to a hash
	439	with the following fields is returned:
	440
	441	key
	442
	443	code code point with at least four hexdigits
	444	status "C", "F", "S", or "I"
	445	mapping one or more codes separated by spaces
	446
	447	The meaning of the I<status> is as follows:
	448
	449	C common case folding, common mappings shared
	450	by both simple and full mappings
	451	F full case folding, mappings that cause strings
	452	to grow in length. Multiple characters are separated
453	by spaces
454	S simple case folding, mappings to single characters
455	where different from F
456	I special case for dotted uppercase I and
457	dotless lowercase i
458	- If this mapping is included, the result is
459	case-insensitive, but dotless and dotted I's
460	are not distinguished
461	- If this mapping is excluded, the result is not
462	fully case-insensitive, but dotless and dotted
463	I's are distinguished
464
465	If there is no case folding for that character, C<undef> is returned.
466
467	For more information about case mappings see
468	http://www.unicode.org/unicode/reports/tr21/
469
470	=cut
471
472	my %CASEFOLD;
473
474	sub _casefold {
475	unless (%CASEFOLD) {
476	if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
477	while (<$CASEFOLDFH>) {
478	if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
479	my $code = hex($1);
480	$CASEFOLD{$code} = { code => $1,
481	status => $2,
482	mapping => $3 };
483	}
484	}
485	close($CASEFOLDFH);
486	}
487	}
488	}
489
490	sub casefold {
491	my $arg = shift;
492	my $code = _getcode($arg);
493
494	_casefold() unless %CASEFOLD;
495
496	return $CASEFOLD{$code};
497	}
498
499	=head2 casespec
500
501	use Unicode::UCD 'casespec';
502
503	my %casespec = casespec("09dc");
504
505	The casespec() returns the potentially locale-dependent case mapping
506	of the character specified by a B<code point argument>. The mapping
507	may change the length of the string (which the basic Unicode case
508	mappings as returned by charinfo() never do).
509
510	If there is a case folding for that character, a reference to a hash
511	with the following fields is returned:
512
513	key
514
515	code code point with at least four hexdigits
516	lower lowercase
517	title titlecase
518	upper uppercase
519	condition condition list (may be undef)
520
521	The C<condition> is optional. Where present, it consists of one or
522	more I<locales> or I<contexts>, separated by spaces (other than as
523	used to separate elements, spaces are to be ignored). A condition
524	list overrides the normal behavior if all of the listed conditions are
525	true. Case distinctions in the condition list are not significant.
526	Conditions preceded by "NON_" represent the negation of the condition
527
528	A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
529	followed by a "_" and a 2-letter ISO language code (, possibly followed
530	by a "_" and a variant code). You can find the list of those codes
531	in L<Locale::Country> and L<Locale::Language>.
532
533	A I<context> is one of the following choices:
534
535	FINAL The letter is not followed by a letter of
536	general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
537	MODERN The mapping is only used for modern text
538	AFTER_i The last base character was "i" 0069
539
540	For more information about case mappings see
541	http://www.unicode.org/unicode/reports/tr21/
542
543	=cut
544
545	my %CASESPEC;
546
547	sub _casespec {
548	unless (%CASESPEC) {
549	if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
550	while (<$CASESPECFH>) {
551	if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; (\w+(?: \w+))?/) {
552	my $code = hex($1);
553	$CASESPEC{$code} = { code => $1,
554	lower => $2,
555	title => $3,
556	upper => $4,
557	condition => $5 };
558	}
559	}
560	close($CASESPECFH);
561	}
562	}
563	}
564
565	sub casespec {
566	my $arg = shift;
567	my $code = _getcode($arg);
568
569	_casespec() unless %CASESPEC;
570
571	return $CASESPEC{$code};
572	}
573
10a6ecd2	574	=head2 Unicode::UCD::UnicodeVersion
	575
	576	Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
	577	Database, in other words, the version of the Unicode standard the
	578	database implements.
	579
	580	=cut
	581
	582	my $UNICODEVERSION;
	583
	584	sub UnicodeVersion {
	585	unless (defined $UNICODEVERSION) {
	586	openunicode(\$VERSIONFH, "version");
	587	chomp($UNICODEVERSION = <$VERSIONFH>);
	588	close($VERSIONFH);
	589	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	590	unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
	591	}
	592	return $UNICODEVERSION;
	593	}
3aa957f9	594
3aa957f9	595	=head2 Implementation Note
32c16050	596
ad9cab37	597	The first use of charinfo() opens a read-only filehandle to the Unicode
	598	Character Database (the database is included in the Perl distribution).
	599	The filehandle is then kept open for further queries.
32c16050	600
561c79ed	601	=head1 AUTHOR
	602
	603	Jarkko Hietaniemi
	604
	605	=cut
	606
	607	1;