[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '0.1';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo
		    charblock charscript
		    charblocks charscripts
		    charinrange);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 'charinfo';
    my %charinfo   = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock  = charblock($codepoint);

    use Unicode::UCD 'charscript';
    my $charscript = charblock($codepoint);

=head1 DESCRIPTION

The Unicode module offers a simple interface to the Unicode Character
Database.

=cut

my $UNICODEFH;
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;

sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
	for my $d (@INC) {
	    use File::Spec;
	    $f = File::Spec->catfile($d, "unicode", @path);
	    last if open($$rfh, $f);
	    undef $f;
	}
	croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
	    unless defined $f;
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my %charinfo = charinfo(0x41);

charinfo() returns a hash that has the following fields as defined
by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to 

If no match is found, an empty hash is returned.

The C<block> property is the same as as returned by charinfo().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
of TUS3).  Similarly for the C<script> property.

Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
you will need also the I<Composition Exclusions>, I<Case Folding>, and
I<SpecialCasing> tables, available as files F<CompExcl.txt>,
F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.

=cut

sub _getcode {
    my $arg = shift;

    if ($arg =~ /^\d+$/) {
	return $arg;
    } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
	return hex($1);
    }

    return;
}

sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	unless defined $code;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODEFH, "Unicode.txt");
    if (defined $UNICODEFH) {
	use Search::Dict;
	if (look($UNICODEFH, "$hexk;") >= 0) {
	    my $line = <$UNICODEFH>;
	    chomp $line;
	    my %prop;
	    @prop{qw(
		     code name category
		     combining bidi decomposition
		     decimal digit numeric
		     mirrored unicode10 comment
		     upper lower title
		    )} = split(/;/, $line, -1);
	    if ($prop{code} eq $hexk) {
		$prop{block}  = charblock($code);
		$prop{script} = charscript($code);
		return %prop;
	    }
	}
    }
    return;
}

sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
    my ($table, $lo, $hi, $code) = @_;

    return if $lo > $hi;

    my $mid = int(($lo+$hi) / 2);

    if ($table->[$mid]->[0] < $code) {
	if ($table->[$mid]->[1] >= $code) {
	    return $table->[$mid]->[2];
	} else {
	    _search($table, $mid + 1, $hi, $code);
	}
    } elsif ($table->[$mid]->[0] > $code) {
	_search($table, $lo, $mid - 1, $code);
    } else {
	return $table->[$mid]->[2];
    }
}

sub charinrange {
    my ($range, $arg) = @_;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	unless defined $code;
    _search($range, 0, $#$range, $code);
}

=head2 charblock

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);
    my $charblock = charblock(1234);
    my $charblock = charblock("0x263a");
    my $charblock = charblock("U+263a");

    my $ranges    = charblock('Armenian');

With a B<code point argument> charblock() returns the block the character
belongs to, e.g.  C<Basic Latin>.  Note that not all the character
positions within all blocks are defined.  A <code point argument>
is either a decimal or a hexadecimal scalar, or "U+" followed
by hexadecimals.

If supplied with an argument that can't be a code point, charblock()
tries to do the opposite and interpret the argument as a character
block.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known charater block, C<undef> is returned.

=cut

my @BLOCKS;
my %BLOCKS;

sub _charblocks {
    unless (@BLOCKS) {
	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
	    while (<$BLOCKSFH>) {
		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
		    my ($lo, $hi) = (hex($1), hex($2));
		    my $subrange = [ $lo, $hi, $3 ];
		    push @BLOCKS, $subrange;
		    push @{$BLOCKS{$3}}, $subrange;
		}
	    }
	    close($BLOCKSFH);
	}
    }
}

sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@BLOCKS, 0, $#BLOCKS, $code);
    } else {
	if (exists $BLOCKS{$arg}) {
	    return $BLOCKS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charscript

    use Unicode::UCD 'charscript';

    my $charscript = charscript(0x41);
    my $charscript = charscript(1234);
    my $charscript = charscript("U+263a");

    my $ranges     = charscript('Thai');

With a B<code point argument> charscript() returns the script the
character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.  A <code point
argument> is either a decimal or a hexadecimal scalar, or "U+"
followed by hexadecimals.

If supplied with an argument that can't be a code point, charscript()
tries to do the opposite and interpret the argument as a character
script.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known charater script, C<undef> is returned.

=cut

my @SCRIPTS;
my %SCRIPTS;

sub _charscripts {
    unless (@SCRIPTS) {
	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
	    while (<$SCRIPTSFH>) {
		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
		    my $script = lc($3);
		    $script =~ s/\b(\w)/uc($1)/ge;
		    my $subrange = [ $lo, $hi, $script ];
		    push @SCRIPTS, $subrange;
		    push @{$SCRIPTS{$script}}, $subrange;
		}
	    }
	    close($SCRIPTSFH);
	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	}
    }
}

sub charscript {
    my $arg = shift;

    _charscripts() unless @SCRIPTS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
    } else {
	if (exists $SCRIPTS{$arg}) {
	    return $SCRIPTS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charblocks

    use Unicode::UCD 'charblocks';

    my %charblocks = charblocks();

charblocks() returns a hash with the known block names as the keys,
and the code point ranges (see L</charblock>) as the values.

=cut

sub charblocks {
    _charblocks() unless @BLOCKS;
    return %BLOCKS;
}

=head2 charscripts

    use Unicode::UCD 'charscripts';

    my %charscripts = charscripts();

charscripts() returns a hash with the known script names as the keys,
and the code point ranges (see L</charscript>) as the values.

=cut

sub charscripts {
    _charscripts() unless @SCRIPTS;
    return %SCRIPTS;
}

=head2 Blocks versus Scripts

The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
numbering and separation into blocks of 256 characters.

For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
C<Latin Extended-B>.  On the other hand, the Latin script does not
contain all the characters of the C<Basic Latin> block (also known as
the ASCII): it includes only the letters, not for example the digits
or the punctuation.

For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt

For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/

=head2 Matching Scripts and Blocks

Both scripts and blocks can be matched using the regular expression
construct C<\p{In...}> and its negation C<\P{In...}>.

The name of the script or the block comes after the C<In>, for example
C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
removed from the names for the C<\p{In...}>, for example
C<LatinExtendedA> instead of C<Latin Extended-A>.

There are a few cases where there exists both a script and a block by
the same name, in these cases the block version has C<Block> appended:
C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.

=head2 charinrange

In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
can also test whether a code point is in the I<range> as returned by
L</charblock> and L</charscript> or as the values of the hash returned
by L</charblocks> and </charscripts> by using charinrange():

    use Unicode::UCD qw(charscript charinrange);

    $range = charscript('Hiragana');
    print "looks like hiragana\n" if charinrange($range, $code);

=cut

=head2 Unicode::UCD::UnicodeVersion

Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
Database, in other words, the version of the Unicode standard the
database implements.

=cut

my $UNICODEVERSION;

sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
	openunicode(\$VERSIONFH, "version");
	chomp($UNICODEVERSION = <$VERSIONFH>);
	close($VERSIONFH);
	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
    }
    return $UNICODEVERSION;
}

=head2 Implementation Note

The first use of charinfo() opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
561c79ed	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
afd1eb53	6	our $VERSION = '0.1';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
afd1eb53	11	our @EXPORT_OK = qw(charinfo
	12	charblock charscript
	13	charblocks charscripts
	14	charinrange);
561c79ed	15
	16	use Carp;
	17
	18	=head1 NAME
	19
00f2772c	20	Unicode::UCD - Unicode character database
561c79ed	21
	22	=head1 SYNOPSIS
	23
561c79ed	24	use Unicode::UCD 'charinfo';
afd1eb53	25	my %charinfo = charinfo($codepoint);
561c79ed	26
561c79ed	27	use Unicode::UCD 'charblock';
afd1eb53	28	my $charblock = charblock($codepoint);
	29
	30	use Unicode::UCD 'charscript';
	31	my $charscript = charblock($codepoint);
561c79ed	32
	33	=head1 DESCRIPTION
	34
	35	The Unicode module offers a simple interface to the Unicode Character
	36	Database.
	37
	38	=cut
	39
afd1eb53	40	my $UNICODEFH;
	41	my $BLOCKSFH;
	42	my $SCRIPTSFH;
	43	my $VERSIONFH;
561c79ed	44
	45	sub openunicode {
	46	my ($rfh, @path) = @_;
	47	my $f;
	48	unless (defined $$rfh) {
	49	for my $d (@INC) {
	50	use File::Spec;
	51	$f = File::Spec->catfile($d, "unicode", @path);
afd1eb53	52	last if open($$rfh, $f);
afd1eb53	53	undef $f;
561c79ed	54	}
afd1eb53	55	croak __PACKAGE__, ": failed to find ",
	56	File::Spec->catfile(@path), " in @INC"
	57	unless defined $f;
561c79ed	58	}
	59	return $f;
	60	}
	61
	62	=head2 charinfo
	63
	64	use Unicode::UCD 'charinfo';
	65
	66	my %charinfo = charinfo(0x41);
	67
	68	charinfo() returns a hash that has the following fields as defined
	69	by the Unicode standard:
	70
	71	key
	72
	73	code code point with at least four hexdigits
	74	name name of the character IN UPPER CASE
	75	category general category of the character
	76	combining classes used in the Canonical Ordering Algorithm
	77	bidi bidirectional category
	78	decomposition character decomposition mapping
	79	decimal if decimal digit this is the integer numeric value
	80	digit if digit this is the numeric value
	81	numeric if numeric is the integer or rational numeric value
	82	mirrored if mirrored in bidirectional text
	83	unicode10 Unicode 1.0 name if existed and different
	84	comment ISO 10646 comment field
	85	upper uppercase equivalent mapping
	86	lower lowercase equivalent mapping
	87	title titlecase equivalent mapping
afd1eb53	88
561c79ed	89	block block the character belongs to (used in \p{In...})
afd1eb53	90	script script the character belongs to
561c79ed	91
	92	If no match is found, an empty hash is returned.
	93
afd1eb53	94	The C<block> property is the same as as returned by charinfo(). It is
	95	not defined in the Unicode Character Database proper (Chapter 4 of the
	96	Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
	97	of TUS3). Similarly for the C<script> property.
	98
	99	Note that you cannot do (de)composition and casing based solely on the
	100	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
	101	you will need also the I<Composition Exclusions>, I<Case Folding>, and
	102	I<SpecialCasing> tables, available as files F<CompExcl.txt>,
	103	F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed	104
	105	=cut
	106
afd1eb53	107	sub _getcode {
	108	my $arg = shift;
	109
	110	if ($arg =~ /^\d+$/) {
	111	return $arg;
	112	} elsif ($arg =~ /^(?:U\+\|0x)?([[:xdigit:]]+)$/) {
	113	return hex($1);
	114	}
	115
	116	return;
	117	}
	118
561c79ed	119	sub charinfo {
afd1eb53	120	my $arg = shift;
	121	my $code = _getcode($arg);
	122	croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	123	unless defined $code;
561c79ed	124	my $hexk = sprintf("%04X", $code);
561c79ed	125
afd1eb53	126	openunicode(\$UNICODEFH, "Unicode.txt");
afd1eb53	127	if (defined $UNICODEFH) {
561c79ed	128	use Search::Dict;
afd1eb53	129	if (look($UNICODEFH, "$hexk;") >= 0) {
afd1eb53	130	my $line = <$UNICODEFH>;
561c79ed	131	chomp $line;
	132	my %prop;
	133	@prop{qw(
	134	code name category
	135	combining bidi decomposition
	136	decimal digit numeric
	137	mirrored unicode10 comment
	138	upper lower title
	139	)} = split(/;/, $line, -1);
	140	if ($prop{code} eq $hexk) {
afd1eb53	141	$prop{block} = charblock($code);
afd1eb53	142	$prop{script} = charscript($code);
561c79ed	143	return %prop;
	144	}
	145	}
	146	}
	147	return;
	148	}
	149
afd1eb53	150	sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
	151	my ($table, $lo, $hi, $code) = @_;
	152
	153	return if $lo > $hi;
	154
	155	my $mid = int(($lo+$hi) / 2);
	156
	157	if ($table->[$mid]->[0] < $code) {
	158	if ($table->[$mid]->[1] >= $code) {
	159	return $table->[$mid]->[2];
	160	} else {
	161	_search($table, $mid + 1, $hi, $code);
	162	}
	163	} elsif ($table->[$mid]->[0] > $code) {
	164	_search($table, $lo, $mid - 1, $code);
	165	} else {
	166	return $table->[$mid]->[2];
	167	}
	168	}
	169
	170	sub charinrange {
	171	my ($range, $arg) = @_;
	172	my $code = _getcode($arg);
	173	croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	174	unless defined $code;
	175	_search($range, 0, $#$range, $code);
	176	}
	177
354a27bf	178	=head2 charblock
561c79ed	179
	180	use Unicode::UCD 'charblock';
	181
	182	my $charblock = charblock(0x41);
afd1eb53	183	my $charblock = charblock(1234);
	184	my $charblock = charblock("0x263a");
	185	my $charblock = charblock("U+263a");
	186
	187	my $ranges = charblock('Armenian');
	188
	189	With a B<code point argument> charblock() returns the block the character
	190	belongs to, e.g. C<Basic Latin>. Note that not all the character
	191	positions within all blocks are defined. A <code point argument>
	192	is either a decimal or a hexadecimal scalar, or "U+" followed
	193	by hexadecimals.
	194
	195	If supplied with an argument that can't be a code point, charblock()
	196	tries to do the opposite and interpret the argument as a character
	197	block. The return value is a I<range>: an anonymous list that
	198	contains anonymous lists, which in turn contain I<start-of-range>,
	199	I<end-of-range> code point pairs. You can test whether a code point
	200	is in a range using the L</charinrange> function. If the argument is
	201	not a known charater block, C<undef> is returned.
561c79ed	202
	203	=cut
	204
	205	my @BLOCKS;
afd1eb53	206	my %BLOCKS;
	207
	208	sub _charblocks {
	209	unless (@BLOCKS) {
	210	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
	211	while (<$BLOCKSFH>) {
	212	if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
	213	my ($lo, $hi) = (hex($1), hex($2));
	214	my $subrange = [ $lo, $hi, $3 ];
	215	push @BLOCKS, $subrange;
	216	push @{$BLOCKS{$3}}, $subrange;
	217	}
	218	}
	219	close($BLOCKSFH);
	220	}
	221	}
	222	}
561c79ed	223
afd1eb53	224	sub charblock {
afd1eb53	225	my $arg = shift;
561c79ed	226
afd1eb53	227	_charblocks() unless @BLOCKS;
561c79ed	228
afd1eb53	229	my $code = _getcode($arg);
561c79ed	230
afd1eb53	231	if (defined $code) {
	232	_search(\@BLOCKS, 0, $#BLOCKS, $code);
	233	} else {
	234	if (exists $BLOCKS{$arg}) {
	235	return $BLOCKS{$arg};
561c79ed	236	} else {
afd1eb53	237	return;
561c79ed	238	}
561c79ed	239	}
	240	}
	241
afd1eb53	242	=head2 charscript
561c79ed	243
afd1eb53	244	use Unicode::UCD 'charscript';
	245
	246	my $charscript = charscript(0x41);
	247	my $charscript = charscript(1234);
	248	my $charscript = charscript("U+263a");
	249
	250	my $ranges = charscript('Thai');
	251
	252	With a B<code point argument> charscript() returns the script the
	253	character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A <code point
	254	argument> is either a decimal or a hexadecimal scalar, or "U+"
	255	followed by hexadecimals.
	256
	257	If supplied with an argument that can't be a code point, charscript()
	258	tries to do the opposite and interpret the argument as a character
	259	script. The return value is a I<range>: an anonymous list that
	260	contains anonymous lists, which in turn contain I<start-of-range>,
	261	I<end-of-range> code point pairs. You can test whether a code point
	262	is in a range using the L</charinrange> function. If the argument is
	263	not a known charater script, C<undef> is returned.
	264
	265	=cut
	266
	267	my @SCRIPTS;
	268	my %SCRIPTS;
	269
	270	sub _charscripts {
	271	unless (@SCRIPTS) {
	272	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
	273	while (<$SCRIPTSFH>) {
	274	if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
	275	my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
	276	my $script = lc($3);
	277	$script =~ s/\b(\w)/uc($1)/ge;
	278	my $subrange = [ $lo, $hi, $script ];
	279	push @SCRIPTS, $subrange;
	280	push @{$SCRIPTS{$script}}, $subrange;
561c79ed	281	}
561c79ed	282	}
afd1eb53	283	close($SCRIPTSFH);
	284	@SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	285	}
	286	}
	287	}
	288
	289	sub charscript {
	290	my $arg = shift;
	291
	292	_charscripts() unless @SCRIPTS;
	293
	294	my $code = _getcode($arg);
	295
	296	if (defined $code) {
	297	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
	298	} else {
	299	if (exists $SCRIPTS{$arg}) {
	300	return $SCRIPTS{$arg};
	301	} else {
	302	return;
561c79ed	303	}
561c79ed	304	}
afd1eb53	305	}
	306
	307	=head2 charblocks
	308
	309	use Unicode::UCD 'charblocks';
	310
	311	my %charblocks = charblocks();
	312
	313	charblocks() returns a hash with the known block names as the keys,
	314	and the code point ranges (see L</charblock>) as the values.
	315
	316	=cut
	317
	318	sub charblocks {
	319	_charblocks() unless @BLOCKS;
	320	return %BLOCKS;
	321	}
	322
	323	=head2 charscripts
	324
	325	use Unicode::UCD 'charscripts';
	326
	327	my %charscripts = charscripts();
	328
	329	charscripts() returns a hash with the known script names as the keys,
	330	and the code point ranges (see L</charscript>) as the values.
	331
	332	=cut
	333
	334	sub charscripts {
	335	_charscripts() unless @SCRIPTS;
	336	return %SCRIPTS;
	337	}
	338
	339	=head2 Blocks versus Scripts
	340
	341	The difference between a block and a script is that scripts are closer
	342	to the linguistic notion of a set of characters required to present
	343	languages, while block is more of an artifact of the Unicode character
	344	numbering and separation into blocks of 256 characters.
	345
	346	For example the Latin B<script> is spread over several B<blocks>, such
	347	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
	348	C<Latin Extended-B>. On the other hand, the Latin script does not
	349	contain all the characters of the C<Basic Latin> block (also known as
	350	the ASCII): it includes only the letters, not for example the digits
	351	or the punctuation.
	352
	353	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
	354
	355	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
	356
	357	=head2 Matching Scripts and Blocks
	358
	359	Both scripts and blocks can be matched using the regular expression
	360	construct C<\p{In...}> and its negation C<\P{In...}>.
	361
	362	The name of the script or the block comes after the C<In>, for example
	363	C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
	364	removed from the names for the C<\p{In...}>, for example
	365	C<LatinExtendedA> instead of C<Latin Extended-A>.
	366
	367	There are a few cases where there exists both a script and a block by
	368	the same name, in these cases the block version has C<Block> appended:
369	C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
370
371	=head2 charinrange
372
373	In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
374	can also test whether a code point is in the I<range> as returned by
375	L</charblock> and L</charscript> or as the values of the hash returned
376	by L</charblocks> and </charscripts> by using charinrange():
561c79ed	377
afd1eb53	378	use Unicode::UCD qw(charscript charinrange);
	379
	380	$range = charscript('Hiragana');
	381	print "looks like hiragana\n" if charinrange($range, $code);
	382
	383	=cut
	384
	385	=head2 Unicode::UCD::UnicodeVersion
	386
	387	Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
	388	Database, in other words, the version of the Unicode standard the
	389	database implements.
	390
	391	=cut
	392
	393	my $UNICODEVERSION;
	394
	395	sub UnicodeVersion {
	396	unless (defined $UNICODEVERSION) {
	397	openunicode(\$VERSIONFH, "version");
	398	chomp($UNICODEVERSION = <$VERSIONFH>);
	399	close($VERSIONFH);
	400	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	401	unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
	402	}
	403	return $UNICODEVERSION;
561c79ed	404	}
561c79ed	405
afd1eb53	406	=head2 Implementation Note
	407
	408	The first use of charinfo() opens a read-only filehandle to the Unicode
	409	Character Database (the database is included in the Perl distribution).
	410	The filehandle is then kept open for further queries.
	411
561c79ed	412	=head1 AUTHOR
	413
	414	Jarkko Hietaniemi
	415
	416	=cut
	417
	418	1;