[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '3.1.0';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo charblock charscript);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 3.1.0;
    # requires that level of the Unicode character database

    use Unicode::UCD 'charinfo';
    my %charinfo   = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock  = charblock($codepoint);

    use Unicode::UCD 'charscript';
    my $charscript = charscript($codepoint);

=head1 DESCRIPTION

The Unicode::UCD module offers a simple interface to the Unicode Character
Database.

=cut

# Lazily opened handles on the database files.  Each one stays undef
# until openunicode() is first asked to fill it in.
my $UNICODE;    # Unicode.txt, used by charinfo()
my $BLOCKS;     # Blocks.txt,  used by charblock()
my $SCRIPTS;    # Scripts.txt, used by charscript()

# openunicode(\$fh, @path)
#
# Makes sure $$rfh holds a read handle on the database file
# "unicode/@path", looked up under each directory of @INC in order.
#
# Returns the path of the file that was opened.  When $$rfh is already
# defined nothing is opened and undef is returned.
#
# Croaks when the file cannot be opened under any @INC directory; in
# that case $$rfh is left undefined so that a later call searches again.
sub openunicode {
    my ($rfh, @path) = @_;
    use File::Spec;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            $f = File::Spec->catfile($d, "unicode", @path);
            # Three-argument open: the path is never parsed for a mode
            # or pipe character, whatever @INC happens to contain.
            last if open($$rfh, '<', $f);
            undef $f;
        }
        unless (defined $f) {
            # A failed open() still autovivifies the handle; clear it so
            # the slot does not look like an already opened file.
            undef $$rfh;
            croak __PACKAGE__, ": failed to find ",
                  File::Spec->catfile(@path), " in @INC";
        }
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my %charinfo = charinfo(0x41);

charinfo() returns a hash that has the following fields as defined
by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric this is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to 

If no match is found, an empty hash is returned.

The C<block> property is the same as returned by charblock().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
of TUS3).  Similarly for the C<script> property.

Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
you will need also the I<Composition Exclusions>, I<Case Folding>, and
I<SpecialCasing> tables, available as files F<CompExcl.txt>,
F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.

=cut

# charinfo($code)
#
# Looks up the code point $code (a number) in Unicode.txt and returns its
# record as a key/value list: the fifteen semicolon-separated fields of
# the matching line plus "block" and "script" as reported by charblock()
# and charscript().  Returns nothing when no line for $code is found.
sub charinfo {
    my $code = shift;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODE, "Unicode.txt");
    return unless defined $UNICODE;

    use Search::Dict;

    # Both look() and the readline below split the file on $/, so do not
    # inherit whatever record separator the caller has set.
    local $/ = "\n";

    # NOTE(review): look() compares strings; if Unicode.txt is ordered
    # numerically, keys with more than four hex digits may not be found
    # by this search -- confirm against the shipped file.
    return unless look($UNICODE, "$hexk;") >= 0;

    my $line = <$UNICODE>;
    # Nothing left to read at the position look() chose: there is no
    # record, and chomp/split on undef would only produce warnings.
    return unless defined $line;
    chomp $line;

    my %prop;
    @prop{qw(
             code name category
             combining bidi decomposition
             decimal digit numeric
             mirrored unicode10 comment
             upper lower title
            )} = split(/;/, $line, -1);

    # look() stops at the first line that sorts at or after the key, so
    # the line read may belong to a different code point.
    return unless $prop{code} eq $hexk;

    $prop{block}  = charblock($code);
    $prop{script} = charscript($code);
    return %prop;
}

# _search($table, $lo, $hi, $code)
#
# Binary search over rows $lo..$hi of $table, a reference to an array of
# [first, last, property] rows ordered by "first".  "last" may be undef,
# in which case the row only matches its "first" value.
#
# Returns the property of the row that contains $code, or nothing when
# no row in the given window contains it.
sub _search {
    my ($table, $lo, $hi, $code) = @_;

    while ($lo <= $hi) {
        my $mid = int(($lo + $hi) / 2);
        my $row = $table->[$mid];

        if ($row->[0] < $code) {
            # $code lies past the start of this row: it is a hit when the
            # row has an end that reaches it, otherwise look further right.
            return $row->[2]
                if defined $row->[1] && $row->[1] >= $code;
            $lo = $mid + 1;
        } elsif ($row->[0] > $code) {
            $hi = $mid - 1;
        } else {
            # Exact match on the first code point of the row.
            return $row->[2];
        }
    }

    return;
}

=head2 charblock

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);

charblock() returns the block the character belongs to, e.g.
C<Basic Latin>.  Note that not all the character positions within all
blocks are defined.

The name is the same name that is used in the C<\p{In...}> construct,
for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
instead of C<Latin Extended-A>).

=cut

# Parsed contents of Blocks.txt as [first, last, name] rows; filled in by
# the first call to charblock().
my @BLOCKS;

# charblock($code)
#
# Returns the name of the block whose range contains the code point
# $code (a number), or nothing when no range in Blocks.txt contains it.
sub charblock {
    my $code = shift;

    unless (@BLOCKS) {
        if (openunicode(\$BLOCKS, "Blocks.txt")) {
            # Read line by line regardless of the caller's $/.
            local $/ = "\n";
            # Read into a lexical: "while (<$BLOCKS>)" would assign to the
            # global $_ and so overwrite (or die on) whatever the caller
            # has aliased to it, e.g. in "map { charblock($_) } @codes".
            while (my $line = <$BLOCKS>) {
                if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
                    push @BLOCKS, [ hex($1), hex($2), $3 ];
                }
            }
            close($BLOCKS);
        }
    }

    # The rows are searched in file order; _search() expects them to be
    # ascending by first code point.
    return _search(\@BLOCKS, 0, $#BLOCKS, $code);
}

=head2 charscript

    use Unicode::UCD 'charscript';

    my $charscript = charscript(0x41);

charscript() returns the script the character belongs to, e.g.
C<Latin>, C<Greek>, C<Han>.

Unfortunately, currently (Perl 5.8.0) there is no regular expression
notation for matching scripts as there is for blocks (C<\p{In...}>).

=cut

# Parsed contents of Scripts.txt as [first, last, name] rows sorted by
# first code point; "last" is undef for single code point entries.
# Filled in by the first call to charscript().
my @SCRIPTS;

# charscript($code)
#
# Returns the name of the script that the code point $code (a number) is
# assigned to in Scripts.txt, or nothing when no entry contains it.
sub charscript {
    my $code = shift;

    unless (@SCRIPTS) {
        if (openunicode(\$SCRIPTS, "Scripts.txt")) {
            # Read line by line regardless of the caller's $/.
            local $/ = "\n";
            # Read into a lexical: "while (<$SCRIPTS>)" would assign to
            # the global $_ and so overwrite (or die on) whatever the
            # caller has aliased to it, e.g. inside map or for.
            while (my $line = <$SCRIPTS>) {
                if ($line =~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
                    # The range end is optional; test for its presence,
                    # not for its truth.
                    push @SCRIPTS,
                        [ hex($1), defined $2 ? hex($2) : undef, $3 ];
                }
            }
            close($SCRIPTS);
            # _search() needs the rows in ascending order of first code
            # point, so sort them after reading.
            @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
        }
    }

    return _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
}

=head2 charblock versus charscript

The difference between a character block and a script is that scripts
are closer to the linguistic notion of a set of characters required to
present languages, while block is more of an artifact of the Unicode
character numbering.  For example the Latin B<script> is spread over
several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>,
C<Latin Extended-A>, and C<Latin Extended-B>.  On the other hand, the
Latin script does not contain all the characters of the C<Basic Latin>
block (also known as ASCII): it includes only the letters, not for
example the digits or the punctuation.

For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt

For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/

Note also that the script names are all in uppercase, e.g. C<HEBREW>,
while the block names are Capitalized and with intermixed spaces,
e.g. C<Yi Syllables>.

Greek
Cyrillic
Armenian
Hebrew
Arabic
Syriac
Thaana
Devanagari
Bengali
Gurmukhi
Gujarati
Oriya
Tamil
Telugu
Kannada
Malayalam
Sinhala
Thai
Lao
Tibetan
Myanmar
Georgian
Ethiopic
Cherokee
Ogham
Runic
Khmer
Hiragana
Katakana
Bopomofo
OldItalic
Gothic
Deseret

=head1 IMPLEMENTATION NOTE

The first use of charinfo() opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
561c79ed	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
6f50a187	6	our $VERSION = '3.1.0';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
e882dd67	11	our @EXPORT_OK = qw(charinfo charblock charscript);
561c79ed	12
	13	use Carp;
	14
	15	=head1 NAME
	16
00f2772c	17	Unicode::UCD - Unicode character database
561c79ed	18
	19	=head1 SYNOPSIS
	20
	21	use Unicode::UCD 3.1.0;
	22	# requires that level of the Unicode character database
	23
	24	use Unicode::UCD 'charinfo';
e882dd67	25	my %charinfo = charinfo($codepoint);
561c79ed	26
561c79ed	27	use Unicode::UCD 'charblock';
e882dd67	28	my $charblock = charblock($codepoint);
	29
	30	use Unicode::UCD 'charscript';
	31	my $charscript = charblock($codepoint);
561c79ed	32
	33	=head1 DESCRIPTION
	34
	35	The Unicode module offers a simple interface to the Unicode Character
	36	Database.
	37
	38	=cut
	39
	40	my $UNICODE;
	41	my $BLOCKS;
e882dd67	42	my $SCRIPTS;
561c79ed	43
	44	sub openunicode {
	45	my ($rfh, @path) = @_;
	46	my $f;
	47	unless (defined $$rfh) {
	48	for my $d (@INC) {
	49	use File::Spec;
	50	$f = File::Spec->catfile($d, "unicode", @path);
32c16050	51	last if open($$rfh, $f);
e882dd67	52	undef $f;
561c79ed	53	}
e882dd67	54	croak __PACKAGE__, ": failed to find ",
	55	File::Spec->catfile(@path), " in @INC"
	56	unless defined $f;
561c79ed	57	}
	58	return $f;
	59	}
	60
	61	=head2 charinfo
	62
	63	use Unicode::UCD 'charinfo';
	64
	65	my %charinfo = charinfo(0x41);
	66
	67	charinfo() returns a hash that has the following fields as defined
	68	by the Unicode standard:
	69
	70	key
	71
	72	code code point with at least four hexdigits
	73	name name of the character IN UPPER CASE
	74	category general category of the character
	75	combining classes used in the Canonical Ordering Algorithm
	76	bidi bidirectional category
	77	decomposition character decomposition mapping
	78	decimal if decimal digit this is the integer numeric value
	79	digit if digit this is the numeric value
	80	numeric if numeric is the integer or rational numeric value
	81	mirrored if mirrored in bidirectional text
	82	unicode10 Unicode 1.0 name if existed and different
	83	comment ISO 10646 comment field
	84	upper uppercase equivalent mapping
	85	lower lowercase equivalent mapping
	86	title titlecase equivalent mapping
e882dd67	87
561c79ed	88	block block the character belongs to (used in \p{In...})
e882dd67	89	script script the character belongs to
561c79ed	90
	91	If no match is found, an empty hash is returned.
	92
32c16050	93	The C<block> property is the same as as returned by charinfo(). It is
	94	not defined in the Unicode Character Database proper (Chapter 4 of the
	95	Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67	96	of TUS3). Similarly for the C<script> property.
32c16050	97
	98	Note that you cannot do (de)composition and casing based solely on the
	99	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
e882dd67	100	you will need also the I<Composition Exclusions>, I<Case Folding>, and
	101	I<SpecialCasing> tables, available as files F<CompExcl.txt>,
	102	F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed	103
	104	=cut
	105
	106	sub charinfo {
	107	my $code = shift;
	108	my $hexk = sprintf("%04X", $code);
	109
	110	openunicode(\$UNICODE, "Unicode.txt");
	111	if (defined $UNICODE) {
	112	use Search::Dict;
	113	if (look($UNICODE, "$hexk;") >= 0) {
	114	my $line = <$UNICODE>;
	115	chomp $line;
	116	my %prop;
	117	@prop{qw(
	118	code name category
	119	combining bidi decomposition
	120	decimal digit numeric
	121	mirrored unicode10 comment
	122	upper lower title
	123	)} = split(/;/, $line, -1);
	124	if ($prop{code} eq $hexk) {
a196fbfd	125	$prop{block} = charblock($code);
a196fbfd	126	$prop{script} = charscript($code);
561c79ed	127	return %prop;
	128	}
	129	}
	130	}
	131	return;
	132	}
	133
e882dd67	134	sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
	135	my ($table, $lo, $hi, $code) = @_;
	136
	137	return if $lo > $hi;
	138
	139	my $mid = int(($lo+$hi) / 2);
	140
	141	if ($table->[$mid]->[0] < $code) {
a196fbfd	142	if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
e882dd67	143	return $table->[$mid]->[2];
	144	} else {
	145	_search($table, $mid + 1, $hi, $code);
	146	}
	147	} elsif ($table->[$mid]->[0] > $code) {
	148	_search($table, $lo, $mid - 1, $code);
	149	} else {
	150	return $table->[$mid]->[2];
	151	}
	152	}
	153
354a27bf	154	=head2 charblock
561c79ed	155
	156	use Unicode::UCD 'charblock';
	157
	158	my $charblock = charblock(0x41);
	159
	160	charblock() returns the block the character belongs to, e.g.
	161	C<Basic Latin>. Note that not all the character positions within all
e882dd67	162	blocks are defined.
561c79ed	163
	164	The name is the same name that is used in the C<\p{In...}> construct,
	165	for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
e882dd67	166	away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
e882dd67	167	instead of C<Latin Extended-A>.
561c79ed	168
	169	=cut
	170
	171	my @BLOCKS;
	172
561c79ed	173	sub charblock {
	174	my $code = shift;
	175
	176	unless (@BLOCKS) {
2796c109	177	if (openunicode(\$BLOCKS, "Blocks.txt")) {
561c79ed	178	while (<$BLOCKS>) {
2796c109	179	if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
561c79ed	180	push @BLOCKS, [ hex($1), hex($2), $3 ];
	181	}
	182	}
	183	close($BLOCKS);
	184	}
	185	}
	186
e882dd67	187	_search(\@BLOCKS, 0, $#BLOCKS, $code);
	188	}
	189
	190	=head2 charscript
	191
	192	use Unicode::UCD 'charscript';
	193
	194	my $charscript = charscript(0x41);
	195
	196	charscript() returns the script the character belongs to, e.g.
ad9cab37	197	C<Latin>, C<Greek>, C<Han>.
e882dd67	198
	199	Unfortunately, currently (Perl 5.8.0) there is no regular expression
	200	notation for matching scripts as there is for blocks (C<\p{In...}>.
	201
	202	=cut
	203
	204	my @SCRIPTS;
	205
	206	sub charscript {
	207	my $code = shift;
	208
	209	unless (@SCRIPTS) {
	210	if (openunicode(\$SCRIPTS, "Scripts.txt")) {
	211	while (<$SCRIPTS>) {
	212	if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
	213	push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
	214	}
	215	}
	216	close($SCRIPTS);
	217	@SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	218	}
	219	}
	220
	221	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
561c79ed	222	}
561c79ed	223
ad9cab37	224	=head2 charblock versus charscript
	225
	226	The difference between a character block and a script is that scripts
	227	are closer to the linguistic notion of a set of characters required to
	228	present languages, while block is more of an artifact of the Unicode
	229	character numbering. For example the Latin B<script> is spread over
	230	several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>,
	231	C<Latin Extended-A>, and C<Latin Extended-B>. On the other hand, the
	232	Latin script does not contain all the characters of the C<Basic Latin>
	233	block (also known as the ASCII): it includes only the letters, not for
	234	example the digits or the punctuation.
	235
	236	For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt
	237
	238	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
	239
	240	Note also that the script names are all in uppercase, e.g. C<HEBREW>,
	241	while the block names are Capitalized and with intermixed spaces,
	242	e.g. C<Yi Syllables>.
	243
2796c109	244	Greek
	245	Cyrillic
	246	Armenian
	247	Hebrew
	248	Arabic
	249	Syriac
	250	Thaana
	251	Devanagari
	252	Bengali
	253	Gurmukhi
	254	Gujarati
	255	Oriya
	256	Tamil
	257	Telugu
	258	Kannada
	259	Malayalam
	260	Sinhala
	261	Thai
	262	Lao
	263	Tibetan
	264	Myanmar
	265	Georgian
	266	Ethiopic
	267	Cherokee
	268	Ogham
	269	Runic
	270	Khmer
	271	Hiragana
	272	Katakana
	273	Bopomofo
	274	OldItalic
	275	Gothic
	276	Deseret
	277
e882dd67	278	=head1 IMPLEMENTATION NOTE
32c16050	279
ad9cab37	280	The first use of charinfo() opens a read-only filehandle to the Unicode
	281	Character Database (the database is included in the Perl distribution).
	282	The filehandle is then kept open for further queries.
32c16050	283
561c79ed	284	=head1 AUTHOR
	285
	286	Jarkko Hietaniemi
	287
	288	=cut
	289
	290	1;