[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm

package Unicode::Normalize;

use 5.006;
use strict;
use warnings;
use Carp;

# Module version string; also passed to bootstrap below.
our $VERSION = '0.12';
# Package name, kept in a variable so normalize() can build its
# croak message from it.
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;
# NOTE(review): AutoLoader is loaded here but nothing in the visible code
# refers to it -- confirm whether it is still needed.
require AutoLoader;

# Inherit from Exporter and DynaLoader (see the export lists and the
# bootstrap call below).
our @ISA = qw(Exporter DynaLoader);
# Exported by default: the four normalization-form functions.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Exported on request.  Apart from normalize(), none of these are defined
# in the Perl code of this file; presumably they come from the compiled
# part loaded by bootstrap -- confirm against the XS source.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    getCanon getCompat getComposite getCombinClass isExclusion
);
# ":all" tag = everything in @EXPORT plus everything in @EXPORT_OK.
our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );

# Load the compiled portion of the module, passing $VERSION along.
bootstrap Unicode::Normalize $VERSION;

# Flags passed as the second argument of decompose():
# CANON is used by NFD/NFC, COMPAT by NFKD/NFKC.
use constant CANON  => 0;
use constant COMPAT => 1;

# NFD($string)
#
# Returns Normalization Form D of the argument (per this module's POD:
# formed by canonical decomposition).  The argument is passed to
# decompose() with the CANON flag and the result is fed to reorder().
sub NFD ($) {
    return reorder(
        decompose($_[0], CANON)
    );
}
# NFKD($string)
#
# Returns Normalization Form KD of the argument (per this module's POD:
# formed by compatibility decomposition).  The argument is passed to
# decompose() with the COMPAT flag and the result is fed to reorder().
sub NFKD ($) {
    return reorder(
        decompose($_[0], COMPAT)
    );
}

# NFC($string)
#
# Returns Normalization Form C of the argument (per this module's POD:
# canonical decomposition followed by canonical composition).  Runs the
# same decompose(CANON) + reorder() pipeline as NFD and then hands the
# result to compose().
sub NFC ($) {
    return compose(
        reorder(
            decompose($_[0], CANON)
        )
    );
}
# NFKC($string)
#
# Returns Normalization Form KC of the argument (per this module's POD:
# compatibility decomposition followed by canonical composition).  Runs
# the same decompose(COMPAT) + reorder() pipeline as NFKD and then hands
# the result to compose().
sub NFKC ($) {
    return compose(
        reorder(
            decompose($_[0], COMPAT)
        )
    );
}

# normalize($form_name, $raw_string)
#
# Returns $raw_string converted to the normalization form selected by
# $form_name.  Accepted names are 'C', 'D', 'KC' and 'KD', each optionally
# prefixed with 'NF' ('NFC', 'NFD', 'NFKC', 'NFKD'); the work is delegated
# to NFC(), NFD(), NFKC() or NFKD() respectively.
#
# Croaks with "<package>::normalize: invalid form name: ..." for any other
# name.  The message quotes the name exactly as the caller passed it (or
# "undef"), not the internal copy with the 'NF' prefix stripped.
sub normalize($$)
{
  my $form = shift;

  # An undefined name can never match a form; fail right away instead of
  # emitting "uninitialized value" warnings from the code below.
  croak $PACKAGE."::normalize: invalid form name: undef"
    unless defined $form;

  # Strip the optional prefix from a copy so that $form keeps the caller's
  # spelling for the diagnostic at the end of the chain.
  (my $key = $form) =~ s/^NF//;

  # After the shift above, $_[0] is the string to normalize.
  return
    $key eq 'D'  ? NFD ($_[0]) :
    $key eq 'C'  ? NFC ($_[0]) :
    $key eq 'KD' ? NFKD($_[0]) :
    $key eq 'KC' ? NFKC($_[0]) :
      croak $PACKAGE."::normalize: invalid form name: $form";
}

1;
__END__

=head1 NAME

Unicode::Normalize - normalized forms of Unicode text

=head1 SYNOPSIS

  use Unicode::Normalize;

  $string_NFD  = NFD($raw_string);  # Normalization Form D
  $string_NFC  = NFC($raw_string);  # Normalization Form C
  $string_NFKD = NFKD($raw_string); # Normalization Form KD
  $string_NFKC = NFKC($raw_string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $string_NFD  = normalize('D',  $raw_string);  # Normalization Form D
  $string_NFC  = normalize('C',  $raw_string);  # Normalization Form C
  $string_NFKD = normalize('KD', $raw_string);  # Normalization Form KD
  $string_NFKC = normalize('KC', $raw_string);  # Normalization Form KC

=head1 DESCRIPTION

=head2 Normalization

=over 4

=item C<$string_NFD = NFD($raw_string)>

returns the Normalization Form D (formed by canonical decomposition).


=item C<$string_NFC = NFC($raw_string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$string_NFKD = NFKD($raw_string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$string_NFKC = NFKC($raw_string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$normalized_string = normalize($form_name, $raw_string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

=back

=head2 Character Data

These functions provide an interface to the character data used internally.
If you only want to obtain Unicode normalization forms,
you do not need to call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is canonically or 
compatibility decomposable (including Hangul Syllables),
returns the B<completely decomposed> string equivalent to it.

If it is not decomposable, returns undef.

=item C<$uv_composite = getComposite($uv_here, $uv_next)>

If the pair of characters here and next (given as codepoints) is composable
(including Hangul Jamo/Syllables and Exclusions),
returns the codepoint of the composite.

If they are not composable, returns undef.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint is a composition exclusion.

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This program is free software; you can redistribute it and/or 
  modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/unicode/reports/tr15/

Unicode Normalization Forms - UAX #15

=back

=cut
Commit	Line	Data
ac5ea531	1	package Unicode::Normalize;
	2
	3	use 5.006;
	4	use strict;
	5	use warnings;
	6	use Carp;
	7
2a204b45	8	our $VERSION = '0.12';
ac5ea531	9	our $PACKAGE = __PACKAGE__;
	10
	11	require Exporter;
	12	require DynaLoader;
	13	require AutoLoader;
	14
	15	our @ISA = qw(Exporter DynaLoader);
	16	our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45	17	our @EXPORT_OK = qw(
	18	normalize decompose reorder compose
	19	getCanon getCompat getComposite getCombinClass isExclusion
	20	);
ac5ea531	21	our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
	22
	23	bootstrap Unicode::Normalize $VERSION;
	24
	25	use constant CANON => 0;
	26	use constant COMPAT => 1;
	27
2a204b45	28	sub NFD ($) { reorder(decompose($_[0], CANON )) }
ac5ea531	29	sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
ac5ea531	30
2a204b45	31	sub NFC ($) { compose(reorder(decompose($_[0], CANON ))) }
ac5ea531	32	sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
	33
	34	sub normalize($$)
	35	{
	36	my $form = shift;
93deb893	37	$form =~ s/^NF//;
2a204b45	38	$form eq 'D' ? NFD ($_[0]) :
	39	$form eq 'C' ? NFC ($_[0]) :
	40	$form eq 'KD' ? NFKD($_[0]) :
	41	$form eq 'KC' ? NFKC($_[0]) :
ac5ea531	42	croak $PACKAGE."::normalize: invalid form name: $form";
	43	}
	44
	45	1;
	46	__END__
2a204b45	47
	48	=head1 NAME
	49
	50	Unicode::Normalize - normalized forms of Unicode text
	51
	52	=head1 SYNOPSIS
	53
	54	use Unicode::Normalize;
	55
	56	$string_NFD = NFD($raw_string); # Normalization Form D
	57	$string_NFC = NFC($raw_string); # Normalization Form C
	58	$string_NFKD = NFKD($raw_string); # Normalization Form KD
	59	$string_NFKC = NFKC($raw_string); # Normalization Form KC
	60
	61	or
	62
	63	use Unicode::Normalize 'normalize';
	64
	65	$string_NFD = normalize('D', $raw_string); # Normalization Form D
	66	$string_NFC = normalize('C', $raw_string); # Normalization Form C
	67	$string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
	68	$string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
	69
	70	=head1 DESCRIPTION
	71
	72	=head2 Normalization
	73
	74	=over 4
	75
	76	=item C<$string_NFD = NFD($raw_string)>
	77
	78	returns the Normalization Form D (formed by canonical decomposition).
	79
	80
	81	=item C<$string_NFC = NFC($raw_string)>
	82
	83	returns the Normalization Form C (formed by canonical decomposition
	84	followed by canonical composition).
	85
	86	=item C<$string_NFKD = NFKD($raw_string)>
	87
	88	returns the Normalization Form KD (formed by compatibility decomposition).
	89
	90	=item C<$string_NFKC = NFKC($raw_string)>
	91
	92	returns the Normalization Form KC (formed by compatibility decomposition
	93	followed by B<canonical> composition).
	94
	95	=item C<$normalized_string = normalize($form_name, $raw_string)>
	96
	97	As C<$form_name>, one of the following names must be given.
	98
	99	'C' or 'NFC' for Normalization Form C
	100	'D' or 'NFD' for Normalization Form D
	101	'KC' or 'NFKC' for Normalization Form KC
	102	'KD' or 'NFKD' for Normalization Form KD
	103
	104	=back
	105
	106	=head2 Character Data
	107
	108	These functions are interface of character data used internally.
	109	If you want only to get unicode normalization forms,
	110	you need not to call them by yourself.
111
112	=over 4
113
114	=item C<$canonical_decomposed = getCanon($codepoint)>
115
116	=item C<$compatibility_decomposed = getCompat($codepoint)>
117
118	If the character of the specified codepoint is canonically or
119	compatibility decomposable (including Hangul Syllables),
120	returns the B<completely decomposed> string equivalent to it.
121
122	If it is not decomposable, returns undef.
123
124	=item C<$uv_composite = getComposite($uv_here, $uv_next)>
125
126	If the couple of two characters here and next (as codepoints) is composable
127	(including Hangul Jamo/Syllables and Exclusions),
128	returns the codepoint of the composite.
129
130	If they are not composable, returns undef.
131
132	=item C<$combining_class = getCombinClass($codepoint)>
133
134	Returns the combining class as integer of the character.
135
136	=item C<$is_exclusion = isExclusion($codepoint)>
137
138	Returns a boolean whether the character of the specified codepoint is
139	a composition exclusion.
140
141	=back
142
143	=head2 EXPORT
144
145	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
146
147	C<normalize> and other some functions: on request.
148
149	=head1 AUTHOR
150
151	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
152
153	http://homepage1.nifty.com/nomenclator/perl/
154
155	Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
156
157	This program is free software; you can redistribute it and/or
158	modify it under the same terms as Perl itself.
159
160	=head1 SEE ALSO
161
162	=over 4
163
164	=item http://www.unicode.org/unicode/reports/tr15/
165
166	Unicode Normalization Forms - UAX #15
167
168	=back
169
170	=cut
171