[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '3.1.0';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo charblock);

use Carp;

=head1 NAME

Unicode - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 3.1.0;
    # requires that level of the Unicode character database

    use Unicode::UCD 'charinfo';
    my %charinfo  = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock = charblock($codepoint);

=head1 DESCRIPTION

The Unicode module offers a simple interface to the Unicode Character
Database.

=cut

my $UNICODE;
my $BLOCKS;

sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
	for my $d (@INC) {
	    use File::Spec;
	    $f = File::Spec->catfile($d, "unicode", @path);
	    if (open($$rfh, $f)) {
		last;
	    } else {
		croak __PACKAGE__, ": open '$f' failed: $!\n";
	    }
	}
	croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
	    unless defined $rfh;
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my %charinfo = charinfo(0x41);

charinfo() returns a hash that has the following fields as defined
by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping
    block            block the character belongs to (used in \p{In...})

If no match is found, an empty hash is returned.

The C<block> property is the same as as returned by charinfo().
(It is not defined in the Unicode Character Database proper but
instead in an auxiliary database.)

=cut

sub charinfo {
    my $code = shift;
    my $hexk = sprintf("%04X", $code);

    openunicode(\$UNICODE, "Unicode.txt");
    if (defined $UNICODE) {
	use Search::Dict;
	if (look($UNICODE, "$hexk;") >= 0) {
	    my $line = <$UNICODE>;
	    chomp $line;
	    my %prop;
	    @prop{qw(
		     code name category
		     combining bidi decomposition
		     decimal digit numeric
		     mirrored unicode10 comment
		     upper lower title
		    )} = split(/;/, $line, -1);
	    if ($prop{code} eq $hexk) {
		$prop{block} = charblock($code);
		return %prop;
	    }
	}
    }
    return;
}

=head2 charbloc

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);

charblock() returns the block the character belongs to, e.g.
C<Basic Latin>.  Note that not all the character positions within all
block are defined.

The name is the same name that is used in the C<\p{In...}> construct,
for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
away from the names for the C<\p{In...}>.

=cut

my @BLOCKS;

sub _charblock {
    my ($code, $lo, $hi) = @_;

    return if $lo > $hi;

    my $mid = int(($lo+$hi) / 2);

    if ($BLOCKS[$mid]->[0] < $code) {
	if ($BLOCKS[$mid]->[1] >= $code) {
	    return $BLOCKS[$mid]->[2];
	} else {
	    _charblock($code, $mid + 1, $hi);
	}
    } elsif ($BLOCKS[$mid]->[0] > $code) {
	_charblock($code, $lo, $mid - 1);
    } else {
	return $BLOCKS[$mid]->[2];
    }
}

sub charblock {
    my $code = shift;

    unless (@BLOCKS) {
	if (openunicode(\$BLOCKS, "Blocks.pl")) {
	    while (<$BLOCKS>) {
		if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
		    push @BLOCKS, [ hex($1), hex($2), $3 ];
		}
	    }
	    close($BLOCKS);
	}
    }

    _charblock($code, 0, $#BLOCKS);
}

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
561c79ed	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
6f50a187	6	our $VERSION = '3.1.0';
561c79ed	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
	11	our @EXPORT_OK = qw(charinfo charblock);
	12
	13	use Carp;
	14
	15	=head1 NAME
	16
	17	Unicode - Unicode character database
	18
	19	=head1 SYNOPSIS
	20
	21	use Unicode::UCD 3.1.0;
	22	# requires that level of the Unicode character database
	23
	24	use Unicode::UCD 'charinfo';
	25	my %charinfo = charinfo($codepoint);
	26
	27	use Unicode::UCD 'charblock';
	28	my $charblock = charblock($codepoint);
	29
	30	=head1 DESCRIPTION
	31
	32	The Unicode module offers a simple interface to the Unicode Character
	33	Database.
	34
	35	=cut
	36
	37	my $UNICODE;
	38	my $BLOCKS;
	39
	40	sub openunicode {
	41	my ($rfh, @path) = @_;
	42	my $f;
	43	unless (defined $$rfh) {
	44	for my $d (@INC) {
	45	use File::Spec;
	46	$f = File::Spec->catfile($d, "unicode", @path);
	47	if (open($$rfh, $f)) {
	48	last;
	49	} else {
	50	croak __PACKAGE__, ": open '$f' failed: $!\n";
	51	}
	52	}
	53	croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
	54	unless defined $rfh;
	55	}
	56	return $f;
	57	}
	58
	59	=head2 charinfo
	60
	61	use Unicode::UCD 'charinfo';
	62
	63	my %charinfo = charinfo(0x41);
	64
	65	charinfo() returns a hash that has the following fields as defined
	66	by the Unicode standard:
	67
	68	key
	69
	70	code code point with at least four hexdigits
71	name name of the character IN UPPER CASE
72	category general category of the character
73	combining classes used in the Canonical Ordering Algorithm
74	bidi bidirectional category
75	decomposition character decomposition mapping
76	decimal if decimal digit this is the integer numeric value
77	digit if digit this is the numeric value
78	numeric if numeric is the integer or rational numeric value
79	mirrored if mirrored in bidirectional text
80	unicode10 Unicode 1.0 name if existed and different
81	comment ISO 10646 comment field
82	upper uppercase equivalent mapping
83	lower lowercase equivalent mapping
84	title titlecase equivalent mapping
85	block block the character belongs to (used in \p{In...})
86
87	If no match is found, an empty hash is returned.
88
89	The C<block> property is the same as as returned by charinfo().
90	(It is not defined in the Unicode Character Database proper but
91	instead in an auxiliary database.)
92
93	=cut
94
95	sub charinfo {
96	my $code = shift;
97	my $hexk = sprintf("%04X", $code);
98
99	openunicode(\$UNICODE, "Unicode.txt");
100	if (defined $UNICODE) {
101	use Search::Dict;
102	if (look($UNICODE, "$hexk;") >= 0) {
103	my $line = <$UNICODE>;
104	chomp $line;
105	my %prop;
106	@prop{qw(
107	code name category
108	combining bidi decomposition
109	decimal digit numeric
110	mirrored unicode10 comment
111	upper lower title
112	)} = split(/;/, $line, -1);
113	if ($prop{code} eq $hexk) {
114	$prop{block} = charblock($code);
115	return %prop;
116	}
117	}
118	}
119	return;
120	}
121
122	=head2 charbloc
123
124	use Unicode::UCD 'charblock';
125
126	my $charblock = charblock(0x41);
127
128	charblock() returns the block the character belongs to, e.g.
129	C<Basic Latin>. Note that not all the character positions within all
130	block are defined.
131
132	The name is the same name that is used in the C<\p{In...}> construct,
133	for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
134	away from the names for the C<\p{In...}>.
135
136	=cut
137
138	my @BLOCKS;
139
140	sub _charblock {
141	my ($code, $lo, $hi) = @_;
142
143	return if $lo > $hi;
144
145	my $mid = int(($lo+$hi) / 2);
146
147	if ($BLOCKS[$mid]->[0] < $code) {
148	if ($BLOCKS[$mid]->[1] >= $code) {
149	return $BLOCKS[$mid]->[2];
150	} else {
151	_charblock($code, $mid + 1, $hi);
152	}
153	} elsif ($BLOCKS[$mid]->[0] > $code) {
154	_charblock($code, $lo, $mid - 1);
155	} else {
156	return $BLOCKS[$mid]->[2];
157	}
158	}
159
160	sub charblock {
161	my $code = shift;
162
163	unless (@BLOCKS) {
164	if (openunicode(\$BLOCKS, "Blocks.pl")) {
165	while (<$BLOCKS>) {
166	if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
167	push @BLOCKS, [ hex($1), hex($2), $3 ];
168	}
169	}
170	close($BLOCKS);
171	}
172	}
173
174	_charblock($code, 0, $#BLOCKS);
175	}
176
177	=head1 AUTHOR
178
179	Jarkko Hietaniemi
180
181	=cut
182
183	1;