[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '0.1';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo
		    charblock charscript
		    charblocks charscripts
		    charinrange);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 'charinfo';
    my %charinfo   = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock  = charblock($codepoint);

    use Unicode::UCD 'charscript';
    my $charscript = charblock($codepoint);

=head1 DESCRIPTION

The Unicode module offers a simple interface to the Unicode Character
Database.

=cut

my $UNICODEFH;
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;

sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
	for my $d (@INC) {
	    use File::Spec;
	    $f = File::Spec->catfile($d, "unicode", @path);
	    last if open($$rfh, $f);
	    undef $f;
	}
	croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
	    unless defined $f;
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my %charinfo = charinfo(0x41);

charinfo() returns a hash that has the following fields as defined
by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to 

If no match is found, an empty hash is returned.

The C<block> property is the same as as returned by charinfo().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
of TUS3).  Similarly for the C<script> property.

Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
you will need also the I<Composition Exclusions>, I<Case Folding>, and
I<SpecialCasing> tables, available as files F<CompExcl.txt>,
F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.

=cut

sub _getcode {
    my $arg = shift;

    if ($arg =~ /^\d+$/) {
	return $arg;
    } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
	return hex($1);
    }

    return;
}

sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	unless defined $code;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODEFH, "Unicode.txt");
    if (defined $UNICODEFH) {
	use Search::Dict;
	if (look($UNICODEFH, "$hexk;") >= 0) {
	    my $line = <$UNICODEFH>;
	    chomp $line;
	    my %prop;
	    @prop{qw(
		     code name category
		     combining bidi decomposition
		     decimal digit numeric
		     mirrored unicode10 comment
		     upper lower title
		    )} = split(/;/, $line, -1);
	    if ($prop{code} eq $hexk) {
		$prop{block}  = charblock($code);
		$prop{script} = charscript($code);
		return %prop;
	    }
	}
    }
    return;
}

sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
    my ($table, $lo, $hi, $code) = @_;

    return if $lo > $hi;

    my $mid = int(($lo+$hi) / 2);

    if ($table->[$mid]->[0] < $code) {
	if ($table->[$mid]->[1] >= $code) {
	    return $table->[$mid]->[2];
	} else {
	    _search($table, $mid + 1, $hi, $code);
	}
    } elsif ($table->[$mid]->[0] > $code) {
	_search($table, $lo, $mid - 1, $code);
    } else {
	return $table->[$mid]->[2];
    }
}

sub charinrange {
    my ($range, $arg) = @_;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	unless defined $code;
    _search($range, 0, $#$range, $code);
}

=head2 charblock

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);
    my $charblock = charblock(1234);
    my $charblock = charblock("0x263a");
    my $charblock = charblock("U+263a");

    my $ranges    = charblock('Armenian');

With a B<code point argument> charblock() returns the block the character
belongs to, e.g.  C<Basic Latin>.  Note that not all the character
positions within all blocks are defined.  A <code point argument>
is either a decimal or a hexadecimal scalar, or "U+" followed
by hexadecimals.

If supplied with an argument that can't be a code point, charblock()
tries to do the opposite and interpret the argument as a character
block.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known charater block, C<undef> is returned.

=cut

my @BLOCKS;
my %BLOCKS;

sub _charblocks {
    unless (@BLOCKS) {
	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
	    while (<$BLOCKSFH>) {
		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
		    my ($lo, $hi) = (hex($1), hex($2));
		    my $subrange = [ $lo, $hi, $3 ];
		    push @BLOCKS, $subrange;
		    push @{$BLOCKS{$3}}, $subrange;
		}
	    }
	    close($BLOCKSFH);
	}
    }
}

sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@BLOCKS, 0, $#BLOCKS, $code);
    } else {
	if (exists $BLOCKS{$arg}) {
	    return $BLOCKS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charscript

    use Unicode::UCD 'charscript';

    my $charscript = charscript(0x41);
    my $charscript = charscript(1234);
    my $charscript = charscript("U+263a");

    my $ranges     = charscript('Thai');

With a B<code point argument> charscript() returns the script the
character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.  A <code point
argument> is either a decimal or a hexadecimal scalar, or "U+"
followed by hexadecimals.

If supplied with an argument that can't be a code point, charscript()
tries to do the opposite and interpret the argument as a character
script.  The return value is a I<range>: an anonymous list that
contains anonymous lists, which in turn contain I<start-of-range>,
I<end-of-range> code point pairs.  You can test whether a code point
is in a range using the L</charinrange> function.  If the argument is
not a known charater script, C<undef> is returned.

=cut

my @SCRIPTS;
my %SCRIPTS;

sub _charscripts {
    unless (@SCRIPTS) {
	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
	    while (<$SCRIPTSFH>) {
		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
		    my $script = lc($3);
		    $script =~ s/\b(\w)/uc($1)/ge;
		    my $subrange = [ $lo, $hi, $script ];
		    push @SCRIPTS, $subrange;
		    push @{$SCRIPTS{$script}}, $subrange;
		}
	    }
	    close($SCRIPTSFH);
	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	}
    }
}

sub charscript {
    my $arg = shift;

    _charscripts() unless @SCRIPTS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
    } else {
	if (exists $SCRIPTS{$arg}) {
	    return $SCRIPTS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charblocks

    use Unicode::UCD 'charblocks';

    my %charblocks = charblocks();

charblocks() returns a hash with the known block names as the keys,
and the code point ranges (see L</charblock>) as the values.

=cut

sub charblocks {
    _charblocks() unless @BLOCKS;
    return %BLOCKS;
}

=head2 charscripts

    use Unicode::UCD 'charscripts';

    my %charscripts = charscripts();

charscripts() returns a hash with the known script names as the keys,
and the code point ranges (see L</charscript>) as the values.

=cut

sub charscripts {
    _charscripts() unless @SCRIPTS;
    return %SCRIPTS;
}

=head2 Blocks versus Scripts

The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
numbering and separation into blocks of 256 characters.

For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
C<Latin Extended-B>.  On the other hand, the Latin script does not
contain all the characters of the C<Basic Latin> block (also known as
the ASCII): it includes only the letters, not for example the digits
or the punctuation.

For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt

For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/

=head2 Matching Scripts and Blocks

Both scripts and blocks can be matched using the regular expression
construct C<\p{In...}> and its negation C<\P{In...}>.

The name of the script or the block comes after the C<In>, for example
C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
removed from the names for the C<\p{In...}>, for example
C<LatinExtendedA> instead of C<Latin Extended-A>.

There are a few cases where there exists both a script and a block by
the same name, in these cases the block version has C<Block> appended:
C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.

=head2 charinrange

In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
can also test whether a code point is in the I<range> as returned by
L</charblock> and L</charscript> or as the values of the hash returned
by L</charblocks> and </charscripts> by using charinrange():

    use Unicode::UCD qw(charscript charinrange);

    $range = charscript('Hiragana');
    print "looks like hiragana\n" if charinrange($range, $code);

=cut

=head2 Unicode::UCD::UnicodeVersion

Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
Database, in other words, the version of the Unicode standard the
database implements.

=cut

my $UNICODEVERSION;

sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
	openunicode(\$VERSIONFH, "version");
	chomp($UNICODEVERSION = <$VERSIONFH>);
	close($VERSIONFH);
	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
    }
    return $UNICODEVERSION;
}

=head2 Implementation Note

The first use of charinfo() opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
561c79ed	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
10a6ecd2	6	our $VERSION = '0.1';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
10a6ecd2	11	our @EXPORT_OK = qw(charinfo
	12	charblock charscript
	13	charblocks charscripts
	14	charinrange);
561c79ed	15
	16	use Carp;
	17
	18	=head1 NAME
	19
00f2772c	20	Unicode::UCD - Unicode character database
561c79ed	21
	22	=head1 SYNOPSIS
	23
561c79ed	24	use Unicode::UCD 'charinfo';
e882dd67	25	my %charinfo = charinfo($codepoint);
561c79ed	26
561c79ed	27	use Unicode::UCD 'charblock';
e882dd67	28	my $charblock = charblock($codepoint);
	29
	30	use Unicode::UCD 'charscript';
	31	my $charscript = charblock($codepoint);
561c79ed	32
	33	=head1 DESCRIPTION
	34
	35	The Unicode module offers a simple interface to the Unicode Character
	36	Database.
	37
	38	=cut
	39
10a6ecd2	40	my $UNICODEFH;
	41	my $BLOCKSFH;
	42	my $SCRIPTSFH;
	43	my $VERSIONFH;
561c79ed	44
	45	sub openunicode {
	46	my ($rfh, @path) = @_;
	47	my $f;
	48	unless (defined $$rfh) {
	49	for my $d (@INC) {
	50	use File::Spec;
	51	$f = File::Spec->catfile($d, "unicode", @path);
32c16050	52	last if open($$rfh, $f);
e882dd67	53	undef $f;
561c79ed	54	}
e882dd67	55	croak __PACKAGE__, ": failed to find ",
	56	File::Spec->catfile(@path), " in @INC"
	57	unless defined $f;
561c79ed	58	}
	59	return $f;
	60	}
	61
	62	=head2 charinfo
	63
	64	use Unicode::UCD 'charinfo';
	65
	66	my %charinfo = charinfo(0x41);
	67
	68	charinfo() returns a hash that has the following fields as defined
	69	by the Unicode standard:
	70
	71	key
	72
	73	code code point with at least four hexdigits
	74	name name of the character IN UPPER CASE
	75	category general category of the character
	76	combining classes used in the Canonical Ordering Algorithm
	77	bidi bidirectional category
	78	decomposition character decomposition mapping
	79	decimal if decimal digit this is the integer numeric value
	80	digit if digit this is the numeric value
	81	numeric if numeric is the integer or rational numeric value
	82	mirrored if mirrored in bidirectional text
	83	unicode10 Unicode 1.0 name if existed and different
	84	comment ISO 10646 comment field
	85	upper uppercase equivalent mapping
	86	lower lowercase equivalent mapping
	87	title titlecase equivalent mapping
e882dd67	88
561c79ed	89	block block the character belongs to (used in \p{In...})
e882dd67	90	script script the character belongs to
561c79ed	91
	92	If no match is found, an empty hash is returned.
	93
32c16050	94	The C<block> property is the same as as returned by charinfo(). It is
	95	not defined in the Unicode Character Database proper (Chapter 4 of the
	96	Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67	97	of TUS3). Similarly for the C<script> property.
32c16050	98
	99	Note that you cannot do (de)composition and casing based solely on the
	100	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
e882dd67	101	you will need also the I<Composition Exclusions>, I<Case Folding>, and
	102	I<SpecialCasing> tables, available as files F<CompExcl.txt>,
	103	F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed	104
	105	=cut
	106
10a6ecd2	107	sub _getcode {
	108	my $arg = shift;
	109
	110	if ($arg =~ /^\d+$/) {
	111	return $arg;
	112	} elsif ($arg =~ /^(?:U\+\|0x)?([[:xdigit:]]+)$/) {
	113	return hex($1);
	114	}
	115
	116	return;
	117	}
	118
561c79ed	119	sub charinfo {
10a6ecd2	120	my $arg = shift;
	121	my $code = _getcode($arg);
	122	croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	123	unless defined $code;
561c79ed	124	my $hexk = sprintf("%04X", $code);
561c79ed	125
10a6ecd2	126	openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2	127	if (defined $UNICODEFH) {
561c79ed	128	use Search::Dict;
10a6ecd2	129	if (look($UNICODEFH, "$hexk;") >= 0) {
10a6ecd2	130	my $line = <$UNICODEFH>;
561c79ed	131	chomp $line;
	132	my %prop;
	133	@prop{qw(
	134	code name category
	135	combining bidi decomposition
	136	decimal digit numeric
	137	mirrored unicode10 comment
	138	upper lower title
	139	)} = split(/;/, $line, -1);
	140	if ($prop{code} eq $hexk) {
a196fbfd	141	$prop{block} = charblock($code);
a196fbfd	142	$prop{script} = charscript($code);
561c79ed	143	return %prop;
	144	}
	145	}
	146	}
	147	return;
	148	}
	149
e882dd67	150	sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
	151	my ($table, $lo, $hi, $code) = @_;
	152
	153	return if $lo > $hi;
	154
	155	my $mid = int(($lo+$hi) / 2);
	156
	157	if ($table->[$mid]->[0] < $code) {
10a6ecd2	158	if ($table->[$mid]->[1] >= $code) {
e882dd67	159	return $table->[$mid]->[2];
	160	} else {
	161	_search($table, $mid + 1, $hi, $code);
	162	}
	163	} elsif ($table->[$mid]->[0] > $code) {
	164	_search($table, $lo, $mid - 1, $code);
	165	} else {
	166	return $table->[$mid]->[2];
	167	}
	168	}
	169
10a6ecd2	170	sub charinrange {
	171	my ($range, $arg) = @_;
	172	my $code = _getcode($arg);
	173	croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	174	unless defined $code;
	175	_search($range, 0, $#$range, $code);
	176	}
	177
354a27bf	178	=head2 charblock
561c79ed	179
	180	use Unicode::UCD 'charblock';
	181
	182	my $charblock = charblock(0x41);
10a6ecd2	183	my $charblock = charblock(1234);
	184	my $charblock = charblock("0x263a");
	185	my $charblock = charblock("U+263a");
	186
	187	my $ranges = charblock('Armenian');
	188
	189	With a B<code point argument> charblock() returns the block the character
	190	belongs to, e.g. C<Basic Latin>. Note that not all the character
	191	positions within all blocks are defined. A <code point argument>
	192	is either a decimal or a hexadecimal scalar, or "U+" followed
	193	by hexadecimals.
	194
	195	If supplied with an argument that can't be a code point, charblock()
	196	tries to do the opposite and interpret the argument as a character
	197	block. The return value is a I<range>: an anonymous list that
	198	contains anonymous lists, which in turn contain I<start-of-range>,
	199	I<end-of-range> code point pairs. You can test whether a code point
	200	is in a range using the L</charinrange> function. If the argument is
	201	not a known charater block, C<undef> is returned.
561c79ed	202
561c79ed	203	=cut
	204
	205	my @BLOCKS;
10a6ecd2	206	my %BLOCKS;
561c79ed	207
10a6ecd2	208	sub _charblocks {
561c79ed	209	unless (@BLOCKS) {
10a6ecd2	210	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
10a6ecd2	211	while (<$BLOCKSFH>) {
2796c109	212	if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2	213	my ($lo, $hi) = (hex($1), hex($2));
	214	my $subrange = [ $lo, $hi, $3 ];
	215	push @BLOCKS, $subrange;
	216	push @{$BLOCKS{$3}}, $subrange;
561c79ed	217	}
561c79ed	218	}
10a6ecd2	219	close($BLOCKSFH);
561c79ed	220	}
561c79ed	221	}
10a6ecd2	222	}
	223
	224	sub charblock {
	225	my $arg = shift;
	226
	227	_charblocks() unless @BLOCKS;
	228
	229	my $code = _getcode($arg);
561c79ed	230
10a6ecd2	231	if (defined $code) {
	232	_search(\@BLOCKS, 0, $#BLOCKS, $code);
	233	} else {
	234	if (exists $BLOCKS{$arg}) {
	235	return $BLOCKS{$arg};
	236	} else {
	237	return;
	238	}
	239	}
e882dd67	240	}
	241
	242	=head2 charscript
	243
	244	use Unicode::UCD 'charscript';
	245
	246	my $charscript = charscript(0x41);
10a6ecd2	247	my $charscript = charscript(1234);
10a6ecd2	248	my $charscript = charscript("U+263a");
e882dd67	249
10a6ecd2	250	my $ranges = charscript('Thai');
	251
	252	With a B<code point argument> charscript() returns the script the
	253	character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A <code point
	254	argument> is either a decimal or a hexadecimal scalar, or "U+"
	255	followed by hexadecimals.
	256
	257	If supplied with an argument that can't be a code point, charscript()
	258	tries to do the opposite and interpret the argument as a character
	259	script. The return value is a I<range>: an anonymous list that
	260	contains anonymous lists, which in turn contain I<start-of-range>,
	261	I<end-of-range> code point pairs. You can test whether a code point
	262	is in a range using the L</charinrange> function. If the argument is
	263	not a known charater script, C<undef> is returned.
e882dd67	264
e882dd67	265	=cut
	266
	267	my @SCRIPTS;
10a6ecd2	268	my %SCRIPTS;
e882dd67	269
10a6ecd2	270	sub _charscripts {
e882dd67	271	unless (@SCRIPTS) {
10a6ecd2	272	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
10a6ecd2	273	while (<$SCRIPTSFH>) {
e882dd67	274	if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2	275	my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
	276	my $script = lc($3);
	277	$script =~ s/\b(\w)/uc($1)/ge;
	278	my $subrange = [ $lo, $hi, $script ];
	279	push @SCRIPTS, $subrange;
	280	push @{$SCRIPTS{$script}}, $subrange;
e882dd67	281	}
e882dd67	282	}
10a6ecd2	283	close($SCRIPTSFH);
e882dd67	284	@SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	285	}
	286	}
10a6ecd2	287	}
	288
	289	sub charscript {
	290	my $arg = shift;
	291
	292	_charscripts() unless @SCRIPTS;
e882dd67	293
10a6ecd2	294	my $code = _getcode($arg);
	295
	296	if (defined $code) {
	297	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
	298	} else {
	299	if (exists $SCRIPTS{$arg}) {
	300	return $SCRIPTS{$arg};
	301	} else {
	302	return;
	303	}
	304	}
	305	}
	306
	307	=head2 charblocks
	308
	309	use Unicode::UCD 'charblocks';
	310
	311	my %charblocks = charblocks();
	312
	313	charblocks() returns a hash with the known block names as the keys,
	314	and the code point ranges (see L</charblock>) as the values.
	315
	316	=cut
	317
	318	sub charblocks {
	319	_charblocks() unless @BLOCKS;
	320	return %BLOCKS;
	321	}
	322
	323	=head2 charscripts
	324
	325	use Unicode::UCD 'charscripts';
	326
	327	my %charscripts = charscripts();
	328
	329	charscripts() returns a hash with the known script names as the keys,
	330	and the code point ranges (see L</charscript>) as the values.
	331
	332	=cut
	333
	334	sub charscripts {
	335	_charscripts() unless @SCRIPTS;
	336	return %SCRIPTS;
561c79ed	337	}
561c79ed	338
10a6ecd2	339	=head2 Blocks versus Scripts
ad9cab37	340
10a6ecd2	341	The difference between a block and a script is that scripts are closer
	342	to the linguistic notion of a set of characters required to present
	343	languages, while block is more of an artifact of the Unicode character
	344	numbering and separation into blocks of 256 characters.
3aa957f9	345
	346	For example the Latin B<script> is spread over several B<blocks>, such
	347	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
	348	C<Latin Extended-B>. On the other hand, the Latin script does not
	349	contain all the characters of the C<Basic Latin> block (also known as
	350	the ASCII): it includes only the letters, not for example the digits
	351	or the punctuation.
ad9cab37	352
3aa957f9	353	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37	354
	355	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
	356
3aa957f9	357	=head2 Matching Scripts and Blocks
	358
	359	Both scripts and blocks can be matched using the regular expression
	360	construct C<\p{In...}> and its negation C<\P{In...}>.
	361
	362	The name of the script or the block comes after the C<In>, for example
	363	C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2	364	removed from the names for the C<\p{In...}>, for example
	365	C<LatinExtendedA> instead of C<Latin Extended-A>.
	366
	367	There are a few cases where there exists both a script and a block by
	368	the same name, in these cases the block version has C<Block> appended:
	369	C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
	370
	371	=head2 charinrange
	372
	373	In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
	374	can also test whether a code point is in the I<range> as returned by
	375	L</charblock> and L</charscript> or as the values of the hash returned
	376	by L</charblocks> and </charscripts> by using charinrange():
	377
	378	use Unicode::UCD qw(charscript charinrange);
	379
	380	$range = charscript('Hiragana');
	381	print "looks like hiragana\n" if charinrange($range, $code);
	382
	383	=cut
	384
	385	=head2 Unicode::UCD::UnicodeVersion
	386
	387	Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
	388	Database, in other words, the version of the Unicode standard the
	389	database implements.
	390
	391	=cut
	392
	393	my $UNICODEVERSION;
	394
	395	sub UnicodeVersion {
	396	unless (defined $UNICODEVERSION) {
	397	openunicode(\$VERSIONFH, "version");
	398	chomp($UNICODEVERSION = <$VERSIONFH>);
	399	close($VERSIONFH);
	400	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	401	unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
	402	}
	403	return $UNICODEVERSION;
	404	}
3aa957f9	405
3aa957f9	406	=head2 Implementation Note
32c16050	407
ad9cab37	408	The first use of charinfo() opens a read-only filehandle to the Unicode
	409	Character Database (the database is included in the Perl distribution).
	410	The filehandle is then kept open for further queries.
32c16050	411
561c79ed	412	=head1 AUTHOR
	413
	414	Jarkko Hietaniemi
	415
	416	=cut
	417
	418	1;