[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

# Per the SYNOPSIS, the module version names the level of the Unicode
# character database that the module requires.
our $VERSION = '3.1.0';

require Exporter;

# No function is exported by default; charinfo() and charblock() can be
# imported on request.
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo charblock);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 3.1.0;
    # requires that level of the Unicode character database

    use Unicode::UCD 'charinfo';
    my %charinfo  = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock = charblock($codepoint);

=head1 DESCRIPTION

The Unicode::UCD module offers a simple interface to the Unicode
Character Database.

=cut

# File-scoped filehandle slots, filled in by openunicode():
#   $UNICODE - handle on Unicode.txt; opened on the first charinfo() call
#              and left open for later lookups.
#   $BLOCKS  - handle on Blocks.pl; opened and closed again by charblock()
#              while it loads the block table.
my $UNICODE;
my $BLOCKS;

# Locate a Unicode data file beneath @INC and open it for reading.
#
# Arguments:
#   $rfh  - reference to the scalar that holds (or is to receive) the
#           filehandle
#   @path - path components of the file, relative to the "unicode"
#           directory expected directly beneath an @INC entry
#
# Returns the path of the file that was opened.  If $$rfh is already
# defined nothing is opened and undef is returned.
#
# Croaks when the file cannot be opened beneath any @INC entry.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
        use File::Spec;
        for my $d (@INC) {
            my $candidate = File::Spec->catfile($d, "unicode", @path);

            # A directory without the file is not an error: keep looking.
            # Opening into a fresh lexical means the caller's variable is
            # only ever set to a handle that is actually open.
            if (open(my $fh, '<', $candidate)) {
                $$rfh = $fh;
                $f    = $candidate;
                last;
            }
        }
        croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
            unless defined $f;
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my %charinfo = charinfo(0x41);

charinfo() returns a hash that has the following fields as defined
by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric this is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping
    block            block the character belongs to (used in \p{In...})

If no match is found, an empty hash is returned.

The C<block> property is the same as returned by charblock().
(It is not defined in the Unicode Character Database proper but
instead in an auxiliary database.)

=cut

# Look up the Unicode.txt record for a single code point.
#
# Arguments:
#   $code - the code point as a number (e.g. 0x41)
#
# Returns a key/value list with the fields code, name, category, combining,
# bidi, decomposition, decimal, digit, numeric, mirrored, unicode10,
# comment, upper, lower and title as split from the matching line, plus
# "block" as computed by charblock().  Returns nothing when no line for
# exactly this code point is found.
#
# Croaks (via openunicode()) when Unicode.txt cannot be located.
sub charinfo {
    my $code = shift;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODE, "Unicode.txt");
    return unless defined $UNICODE;

    use Search::Dict 1.02;

    # look() does a binary search using string comparison, but the code
    # field is four to six hex digits and the file is ordered numerically
    # ("10000" sorts before "4E00" as a string).  Padding the key and each
    # line look() inspects to six digits makes string order agree with the
    # file order.
    my $pad = sub {
        my ($record) = @_;
        return $record unless defined $record;    # look() may read past EOF
        return $record =~ /^([0-9A-Fa-f]+);(.*)/s
            ? sprintf("%06X;%s", hex($1), $2)
            : $record;
    };
    my $key = sprintf("%06X;", $code);

    return unless look($UNICODE, $key, { xfrm => $pad }) >= 0;

    # look() leaves the handle at the first line that is >= the key; there
    # is no such line when the key sorts after the last record.
    my $line = <$UNICODE>;
    return unless defined $line;
    chomp $line;

    my %prop;
    @prop{qw(
             code name category
             combining bidi decomposition
             decimal digit numeric
             mirrored unicode10 comment
             upper lower title
            )} = split(/;/, $line, -1);

    # The line found may belong to a later code point (or be empty); only
    # an exact match on the code field counts.
    return unless defined $prop{code} && $prop{code} eq $hexk;

    $prop{block} = charblock($code);
    return %prop;
}

=head2 charblock

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);

charblock() returns the block the character belongs to, e.g.
C<Basic Latin>.  Note that not all the character positions within all
blocks are defined.

The name is the same name that is used in the C<\p{In...}> construct,
for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
away from the names for the C<\p{In...}> construct).

=cut

# Block table, filled by charblock() on first use.  Each element is an
# array ref [ first code point, last code point, block name ], with both
# code points already converted from hex to numbers.
# NOTE(review): _charblock() binary-searches this table, so it presumably
# relies on Blocks.pl listing the ranges in ascending order -- confirm.
my @BLOCKS;

# Binary search of @BLOCKS for the range that contains a code point.
#
# Arguments:
#   $code - code point to look for
#   $lo   - lowest @BLOCKS index still under consideration
#   $hi   - highest @BLOCKS index still under consideration
#
# Returns the block name (third element of the matching entry), or nothing
# when no entry between $lo and $hi covers $code.
sub _charblock {
    my ($code, $lo, $hi) = @_;

    while ($lo <= $hi) {
        my $mid = int(($lo + $hi) / 2);

        # Entry starts after the code point: continue in the lower half.
        if ($code < $BLOCKS[$mid]->[0]) {
            $hi = $mid - 1;
            next;
        }

        # Entry starts at or before the code point; it is a hit when the
        # code point is the start itself or does not exceed the end.
        return $BLOCKS[$mid]->[2]
            if $code == $BLOCKS[$mid]->[0] || $code <= $BLOCKS[$mid]->[1];

        # Entry ends before the code point: continue in the upper half.
        $lo = $mid + 1;
    }

    return;
}

# Return the name of the block a code point belongs to.
#
# Arguments:
#   $code - the code point as a number (e.g. 0x41)
#
# On the first call the block table is loaded from Blocks.pl into @BLOCKS;
# lines that do not look like "HEX HEX name" are ignored.  Later calls
# reuse the table.
#
# Returns the block name, or nothing when no loaded range covers $code.
# Croaks (via openunicode()) when Blocks.pl cannot be located.
sub charblock {
    my $code = shift;

    unless (@BLOCKS) {
        if (openunicode(\$BLOCKS, "Blocks.pl")) {
            # Read into a lexical: assigning to the global $_ here would
            # clobber the caller's $_, which inside for/map is an alias to
            # the caller's own data.
            while (defined(my $line = <$BLOCKS>)) {
                if ($line =~ /^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
                    push @BLOCKS, [ hex($1), hex($2), $3 ];
                }
            }
            close($BLOCKS);
        }
    }

    return _charblock($code, 0, $#BLOCKS);
}

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
561c79ed	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
6f50a187	6	our $VERSION = '3.1.0';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
	11	our @EXPORT_OK = qw(charinfo charblock);
	12
	13	use Carp;
	14
	15	=head1 NAME
	16
00f2772c	17	Unicode::UCD - Unicode character database
561c79ed	18
	19	=head1 SYNOPSIS
	20
	21	use Unicode::UCD 3.1.0;
	22	# requires that level of the Unicode character database
	23
	24	use Unicode::UCD 'charinfo';
	25	my %charinfo = charinfo($codepoint);
	26
	27	use Unicode::UCD 'charblock';
	28	my $charblock = charblock($codepoint);
	29
	30	=head1 DESCRIPTION
	31
	32	The Unicode module offers a simple interface to the Unicode Character
	33	Database.
	34
	35	=cut
	36
	37	my $UNICODE;
	38	my $BLOCKS;
	39
	40	sub openunicode {
	41	my ($rfh, @path) = @_;
	42	my $f;
	43	unless (defined $$rfh) {
	44	for my $d (@INC) {
	45	use File::Spec;
	46	$f = File::Spec->catfile($d, "unicode", @path);
	47	if (open($$rfh, $f)) {
	48	last;
	49	} else {
	50	croak __PACKAGE__, ": open '$f' failed: $!\n";
	51	}
	52	}
	53	croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
	54	unless defined $rfh;
	55	}
	56	return $f;
	57	}
	58
	59	=head2 charinfo
	60
	61	use Unicode::UCD 'charinfo';
	62
	63	my %charinfo = charinfo(0x41);
	64
	65	charinfo() returns a hash that has the following fields as defined
	66	by the Unicode standard:
	67
	68	key
	69
	70	code code point with at least four hexdigits
	71	name name of the character IN UPPER CASE
	72	category general category of the character
	73	combining classes used in the Canonical Ordering Algorithm
	74	bidi bidirectional category
	75	decomposition character decomposition mapping
	76	decimal if decimal digit this is the integer numeric value
	77	digit if digit this is the numeric value
	78	numeric if numeric is the integer or rational numeric value
	79	mirrored if mirrored in bidirectional text
	80	unicode10 Unicode 1.0 name if existed and different
	81	comment ISO 10646 comment field
82	upper uppercase equivalent mapping
83	lower lowercase equivalent mapping
84	title titlecase equivalent mapping
85	block block the character belongs to (used in \p{In...})
86
87	If no match is found, an empty hash is returned.
88
89	The C<block> property is the same as as returned by charinfo().
90	(It is not defined in the Unicode Character Database proper but
91	instead in an auxiliary database.)
92
93	=cut
94
95	sub charinfo {
96	my $code = shift;
97	my $hexk = sprintf("%04X", $code);
98
99	openunicode(\$UNICODE, "Unicode.txt");
100	if (defined $UNICODE) {
101	use Search::Dict;
102	if (look($UNICODE, "$hexk;") >= 0) {
103	my $line = <$UNICODE>;
104	chomp $line;
105	my %prop;
106	@prop{qw(
107	code name category
108	combining bidi decomposition
109	decimal digit numeric
110	mirrored unicode10 comment
111	upper lower title
112	)} = split(/;/, $line, -1);
113	if ($prop{code} eq $hexk) {
114	$prop{block} = charblock($code);
115	return %prop;
116	}
117	}
118	}
119	return;
120	}
121
354a27bf	122	=head2 charblock
561c79ed	123
	124	use Unicode::UCD 'charblock';
	125
	126	my $charblock = charblock(0x41);
	127
	128	charblock() returns the block the character belongs to, e.g.
	129	C<Basic Latin>. Note that not all the character positions within all
	130	block are defined.
	131
	132	The name is the same name that is used in the C<\p{In...}> construct,
	133	for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
	134	away from the names for the C<\p{In...}>.
	135
	136	=cut
	137
	138	my @BLOCKS;
	139
	140	sub _charblock {
	141	my ($code, $lo, $hi) = @_;
	142
	143	return if $lo > $hi;
	144
	145	my $mid = int(($lo+$hi) / 2);
	146
	147	if ($BLOCKS[$mid]->[0] < $code) {
	148	if ($BLOCKS[$mid]->[1] >= $code) {
	149	return $BLOCKS[$mid]->[2];
	150	} else {
	151	_charblock($code, $mid + 1, $hi);
	152	}
	153	} elsif ($BLOCKS[$mid]->[0] > $code) {
	154	_charblock($code, $lo, $mid - 1);
	155	} else {
	156	return $BLOCKS[$mid]->[2];
	157	}
	158	}
	159
	160	sub charblock {
	161	my $code = shift;
	162
	163	unless (@BLOCKS) {
	164	if (openunicode(\$BLOCKS, "Blocks.pl")) {
	165	while (<$BLOCKS>) {
	166	if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
	167	push @BLOCKS, [ hex($1), hex($2), $3 ];
	168	}
	169	}
	170	close($BLOCKS);
	171	}
	172	}
	173
	174	_charblock($code, 0, $#BLOCKS);
	175	}
	176
	177	=head1 AUTHOR
	178
	179	Jarkko Hietaniemi
	180
	181	=cut
	182
	183	1;