[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '3.1.0';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo charblock);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 3.1.0;
    # requires that level of the Unicode character database

    use Unicode::UCD 'charinfo';
    my %charinfo  = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock = charblock($codepoint);

=head1 DESCRIPTION

The Unicode module offers a simple interface to the Unicode Character
Database.

=cut

my $UNICODE;
my $BLOCKS;

sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
	for my $d (@INC) {
	    use File::Spec;
	    $f = File::Spec->catfile($d, "unicode", @path);
           next unless -f $f;
	    if (open($$rfh, $f)) {
		last;
	    } else {
		croak __PACKAGE__, ": open '$f' failed: $!\n";
	    }
	}
	croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
	    unless defined $rfh;
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my %charinfo = charinfo(0x41);

charinfo() returns a hash that has the following fields as defined
by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping
    block            block the character belongs to (used in \p{In...})

If no match is found, an empty hash is returned.

The C<block> property is the same as as returned by charinfo().
(It is not defined in the Unicode Character Database proper but
instead in an auxiliary database.)

=cut

sub charinfo {
    my $code = shift;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODE, "Unicode.txt");
    if (defined $UNICODE) {
	use Search::Dict;
	if (look($UNICODE, "$hexk;") >= 0) {
	    my $line = <$UNICODE>;
	    chomp $line;
	    my %prop;
	    @prop{qw(
		     code name category
		     combining bidi decomposition
		     decimal digit numeric
		     mirrored unicode10 comment
		     upper lower title
		    )} = split(/;/, $line, -1);
	    if ($prop{code} eq $hexk) {
		$prop{block} = charblock($code);
		return %prop;
	    }
	}
    }
    return;
}

=head2 charblock

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);

charblock() returns the block the character belongs to, e.g.
C<Basic Latin>.  Note that not all the character positions within all
block are defined.

The name is the same name that is used in the C<\p{In...}> construct,
for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
away from the names for the C<\p{In...}>.

=cut

my @BLOCKS;

sub _charblock {
    my ($code, $lo, $hi) = @_;

    return if $lo > $hi;

    my $mid = int(($lo+$hi) / 2);

    if ($BLOCKS[$mid]->[0] < $code) {
	if ($BLOCKS[$mid]->[1] >= $code) {
	    return $BLOCKS[$mid]->[2];
	} else {
	    _charblock($code, $mid + 1, $hi);
	}
    } elsif ($BLOCKS[$mid]->[0] > $code) {
	_charblock($code, $lo, $mid - 1);
    } else {
	return $BLOCKS[$mid]->[2];
    }
}

sub charblock {
    my $code = shift;

    unless (@BLOCKS) {
	if (openunicode(\$BLOCKS, "Blocks.pl")) {
	    while (<$BLOCKS>) {
		if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
		    push @BLOCKS, [ hex($1), hex($2), $3 ];
		}
	    }
	    close($BLOCKS);
	}
    }

    _charblock($code, 0, $#BLOCKS);
}

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
561c79ed	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
6f50a187	6	our $VERSION = '3.1.0';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
	11	our @EXPORT_OK = qw(charinfo charblock);
	12
	13	use Carp;
	14
	15	=head1 NAME
	16
00f2772c	17	Unicode::UCD - Unicode character database
561c79ed	18
	19	=head1 SYNOPSIS
	20
	21	use Unicode::UCD 3.1.0;
	22	# requires that level of the Unicode character database
	23
	24	use Unicode::UCD 'charinfo';
	25	my %charinfo = charinfo($codepoint);
	26
	27	use Unicode::UCD 'charblock';
	28	my $charblock = charblock($codepoint);
	29
	30	=head1 DESCRIPTION
	31
	32	The Unicode module offers a simple interface to the Unicode Character
	33	Database.
	34
	35	=cut
	36
	37	my $UNICODE;
	38	my $BLOCKS;
	39
	40	sub openunicode {
	41	my ($rfh, @path) = @_;
	42	my $f;
	43	unless (defined $$rfh) {
	44	for my $d (@INC) {
	45	use File::Spec;
	46	$f = File::Spec->catfile($d, "unicode", @path);
14f14a10	47	next unless -f $f;
561c79ed	48	if (open($$rfh, $f)) {
	49	last;
	50	} else {
	51	croak __PACKAGE__, ": open '$f' failed: $!\n";
	52	}
	53	}
	54	croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
	55	unless defined $rfh;
	56	}
	57	return $f;
	58	}
	59
	60	=head2 charinfo
	61
	62	use Unicode::UCD 'charinfo';
	63
	64	my %charinfo = charinfo(0x41);
	65
	66	charinfo() returns a hash that has the following fields as defined
	67	by the Unicode standard:
	68
	69	key
	70
	71	code code point with at least four hexdigits
	72	name name of the character IN UPPER CASE
	73	category general category of the character
	74	combining classes used in the Canonical Ordering Algorithm
	75	bidi bidirectional category
	76	decomposition character decomposition mapping
	77	decimal if decimal digit this is the integer numeric value
	78	digit if digit this is the numeric value
	79	numeric if numeric is the integer or rational numeric value
	80	mirrored if mirrored in bidirectional text
	81	unicode10 Unicode 1.0 name if existed and different
	82	comment ISO 10646 comment field
	83	upper uppercase equivalent mapping
	84	lower lowercase equivalent mapping
	85	title titlecase equivalent mapping
	86	block block the character belongs to (used in \p{In...})
	87
	88	If no match is found, an empty hash is returned.
	89
	90	The C<block> property is the same as as returned by charinfo().
	91	(It is not defined in the Unicode Character Database proper but
	92	instead in an auxiliary database.)
	93
	94	=cut
	95
	96	sub charinfo {
	97	my $code = shift;
	98	my $hexk = sprintf("%04X", $code);
	99
	100	openunicode(\$UNICODE, "Unicode.txt");
	101	if (defined $UNICODE) {
	102	use Search::Dict;
	103	if (look($UNICODE, "$hexk;") >= 0) {
	104	my $line = <$UNICODE>;
	105	chomp $line;
	106	my %prop;
	107	@prop{qw(
	108	code name category
	109	combining bidi decomposition
	110	decimal digit numeric
	111	mirrored unicode10 comment
112	upper lower title
113	)} = split(/;/, $line, -1);
114	if ($prop{code} eq $hexk) {
115	$prop{block} = charblock($code);
116	return %prop;
117	}
118	}
119	}
120	return;
121	}
122
354a27bf	123	=head2 charblock
561c79ed	124
	125	use Unicode::UCD 'charblock';
	126
	127	my $charblock = charblock(0x41);
	128
	129	charblock() returns the block the character belongs to, e.g.
	130	C<Basic Latin>. Note that not all the character positions within all
	131	block are defined.
	132
	133	The name is the same name that is used in the C<\p{In...}> construct,
	134	for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
	135	away from the names for the C<\p{In...}>.
	136
	137	=cut
	138
	139	my @BLOCKS;
	140
	141	sub _charblock {
	142	my ($code, $lo, $hi) = @_;
	143
	144	return if $lo > $hi;
	145
	146	my $mid = int(($lo+$hi) / 2);
	147
	148	if ($BLOCKS[$mid]->[0] < $code) {
	149	if ($BLOCKS[$mid]->[1] >= $code) {
	150	return $BLOCKS[$mid]->[2];
	151	} else {
	152	_charblock($code, $mid + 1, $hi);
	153	}
	154	} elsif ($BLOCKS[$mid]->[0] > $code) {
	155	_charblock($code, $lo, $mid - 1);
	156	} else {
	157	return $BLOCKS[$mid]->[2];
	158	}
	159	}
	160
	161	sub charblock {
	162	my $code = shift;
	163
	164	unless (@BLOCKS) {
	165	if (openunicode(\$BLOCKS, "Blocks.pl")) {
	166	while (<$BLOCKS>) {
	167	if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
	168	push @BLOCKS, [ hex($1), hex($2), $3 ];
	169	}
	170	}
	171	close($BLOCKS);
	172	}
	173	}
	174
	175	_charblock($code, 0, $#BLOCKS);
	176	}
	177
	178	=head1 AUTHOR
	179
	180	Jarkko Hietaniemi
	181
	182	=cut
	183
	184	1;