[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm

package Unicode::Normalize;

# Compile-time platform guard: abort loading when "A" has ordinal 193,
# i.e. on an EBCDIC platform, to which this module has not been ported.
BEGIN {
    die "Unicode::Normalize not ported to EBCDIC\n"
        if ord("A") == 193;
}

use 5.006;
use strict;
use warnings;
use Carp;

# Module version; also handed to bootstrap below.
our $VERSION = '0.13';
# Package name, used as the prefix of the croak message in normalize().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;
# NOTE(review): AutoLoader is loaded here but is not referenced anywhere else
# in the visible part of this file -- confirm whether it is still needed.
require AutoLoader;

# Inherit from Exporter (symbol export) and DynaLoader (bootstrap).
our @ISA = qw(Exporter DynaLoader);
# Names exported by default.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Names exported only on request.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    getCanon getCompat getComposite getCombinClass isExclusion
);
# The 'all' tag covers every name in @EXPORT and @EXPORT_OK.
our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );

# Load the compiled part of the module. decompose, reorder, compose and the
# get*/isExclusion functions are not defined in this file; presumably they are
# provided by the code loaded here -- TODO confirm.
bootstrap Unicode::Normalize $VERSION;

# Flag passed as the second argument of decompose() by NFKD and NFKC.
use constant COMPAT => 1;

# Normalization Form D: decompose the argument, then reorder the result.
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}
# Normalization Form KD: decompose the argument with the COMPAT flag set,
# then reorder the result.
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}

# Normalization Form C: decompose the argument, reorder the result, then
# compose it.
sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
# Normalization Form KC: decompose the argument with the COMPAT flag set,
# reorder the result, then compose it.
sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}

# normalize($form_name, $raw_string)
#
# Returns $raw_string converted to the normalization form selected by
# $form_name: 'C', 'D', 'KC' or 'KD', each optionally prefixed with 'NF'
# (so 'NFC', 'NFD', 'NFKC' and 'NFKD' are accepted as well).
#
# Croaks if $form_name is not one of those names. The message quotes the
# name exactly as the caller passed it.
sub normalize($$)
{
    my $form = shift;

    # Keep the caller's spelling for the error message and strip the
    # optional 'NF' prefix from a copy only; otherwise an invalid 'NFX'
    # would be reported as 'X' and a bare 'NF' as an empty name.
    # An undefined name is treated as empty so that the croak below is
    # reached without "uninitialized value" warnings.
    my $name = defined $form ? $form : '';
    (my $key = $name) =~ s/^NF//;

    return NFD($_[0])  if $key eq 'D';
    return NFC($_[0])  if $key eq 'C';
    return NFKD($_[0]) if $key eq 'KD';
    return NFKC($_[0]) if $key eq 'KC';

    croak $PACKAGE."::normalize: invalid form name: $name";
}

1;
__END__

=head1 NAME

Unicode::Normalize - normalized forms of Unicode text

=head1 SYNOPSIS

  use Unicode::Normalize;

  $string_NFD  = NFD($raw_string);  # Normalization Form D
  $string_NFC  = NFC($raw_string);  # Normalization Form C
  $string_NFKD = NFKD($raw_string); # Normalization Form KD
  $string_NFKC = NFKC($raw_string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $string_NFD  = normalize('D',  $raw_string);  # Normalization Form D
  $string_NFC  = normalize('C',  $raw_string);  # Normalization Form C
  $string_NFKD = normalize('KD', $raw_string);  # Normalization Form KD
  $string_NFKC = normalize('KC', $raw_string);  # Normalization Form KC

=head1 DESCRIPTION

=head2 Normalization Forms

=over 4

=item C<$string_NFD = NFD($raw_string)>

returns the Normalization Form D (formed by canonical decomposition).


=item C<$string_NFC = NFC($raw_string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$string_NFKD = NFKD($raw_string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$string_NFKC = NFKC($raw_string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$normalized_string = normalize($form_name, $raw_string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

=back

=head2 Character Data

These functions provide an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is canonically or 
compatibility decomposable (including Hangul Syllables),
returns the B<completely decomposed> string equivalent to it.

If it is not decomposable, returns undef.

=item C<$uv_composite = getComposite($uv_here, $uv_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Exclusions),
returns the codepoint of the composite.

If they are not composable, returns undef.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint is a composition exclusion.

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head2 TODO

Unicode::Normalize has not been ported to EBCDIC.  The code would
mostly work just fine, but a decision needs to be made: how should the
module work in EBCDIC?  Should the low 256 characters be understood as
Unicode or as EBCDIC code points?  Should one be chosen or should
there be a way to do either?  Or should such translation be left
outside the module for the user to do, for example by using
Encode::from_to()?

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This program is free software; you can redistribute it and/or 
  modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/unicode/reports/tr15/

Unicode Normalization Forms - UAX #15

=back

=cut
Commit	Line	Data
ac5ea531	1	package Unicode::Normalize;
ac5ea531	2
4a2e806c	3	BEGIN {
	4	if (ord("A") == 193) {
	5	die "Unicode::Normalize not ported to EBCDIC\n";
	6	}
	7	}
	8
ac5ea531	9	use 5.006;
	10	use strict;
	11	use warnings;
	12	use Carp;
	13
d85850a7	14	our $VERSION = '0.13';
ac5ea531	15	our $PACKAGE = __PACKAGE__;
	16
	17	require Exporter;
	18	require DynaLoader;
	19	require AutoLoader;
	20
	21	our @ISA = qw(Exporter DynaLoader);
	22	our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45	23	our @EXPORT_OK = qw(
	24	normalize decompose reorder compose
	25	getCanon getCompat getComposite getCombinClass isExclusion
	26	);
ac5ea531	27	our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
	28
	29	bootstrap Unicode::Normalize $VERSION;
	30
ac5ea531	31	use constant COMPAT => 1;
ac5ea531	32
d85850a7	33	sub NFD ($) { reorder(decompose($_[0])) }
ac5ea531	34	sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
ac5ea531	35
d85850a7	36	sub NFC ($) { compose(reorder(decompose($_[0]))) }
ac5ea531	37	sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
	38
	39	sub normalize($$)
	40	{
d85850a7	41	my $form = shift;
	42	$form =~ s/^NF//;
	43	return
	44	$form eq 'D' ? NFD ($_[0]) :
	45	$form eq 'C' ? NFC ($_[0]) :
	46	$form eq 'KD' ? NFKD($_[0]) :
	47	$form eq 'KC' ? NFKC($_[0]) :
	48	croak $PACKAGE."::normalize: invalid form name: $form";
ac5ea531	49	}
	50
	51	1;
	52	__END__
2a204b45	53
	54	=head1 NAME
	55
	56	Unicode::Normalize - normalized forms of Unicode text
	57
	58	=head1 SYNOPSIS
	59
	60	use Unicode::Normalize;
	61
	62	$string_NFD = NFD($raw_string); # Normalization Form D
	63	$string_NFC = NFC($raw_string); # Normalization Form C
	64	$string_NFKD = NFKD($raw_string); # Normalization Form KD
	65	$string_NFKC = NFKC($raw_string); # Normalization Form KC
	66
	67	or
	68
	69	use Unicode::Normalize 'normalize';
	70
	71	$string_NFD = normalize('D', $raw_string); # Normalization Form D
	72	$string_NFC = normalize('C', $raw_string); # Normalization Form C
	73	$string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
	74	$string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
	75
	76	=head1 DESCRIPTION
	77
d85850a7	78	=head2 Normalization Forms
2a204b45	79
	80	=over 4
	81
	82	=item C<$string_NFD = NFD($raw_string)>
	83
	84	returns the Normalization Form D (formed by canonical decomposition).
	85
	86
	87	=item C<$string_NFC = NFC($raw_string)>
	88
	89	returns the Normalization Form C (formed by canonical decomposition
	90	followed by canonical composition).
	91
	92	=item C<$string_NFKD = NFKD($raw_string)>
	93
	94	returns the Normalization Form KD (formed by compatibility decomposition).
	95
	96	=item C<$string_NFKC = NFKC($raw_string)>
	97
	98	returns the Normalization Form KC (formed by compatibility decomposition
	99	followed by B<canonical> composition).
	100
	101	=item C<$normalized_string = normalize($form_name, $raw_string)>
	102
	103	As C<$form_name>, one of the following names must be given.
	104
	105	'C' or 'NFC' for Normalization Form C
	106	'D' or 'NFD' for Normalization Form D
	107	'KC' or 'NFKC' for Normalization Form KC
	108	'KD' or 'NFKD' for Normalization Form KD
	109
	110	=back
	111
	112	=head2 Character Data
	113
	114	These functions are interface of character data used internally.
d0ed0342	115	If you want only to get Unicode normalization forms, you don't need
d0ed0342	116	call them yourself.
2a204b45	117
	118	=over 4
	119
	120	=item C<$canonical_decomposed = getCanon($codepoint)>
	121
	122	=item C<$compatibility_decomposed = getCompat($codepoint)>
	123
	124	If the character of the specified codepoint is canonically or
	125	compatibility decomposable (including Hangul Syllables),
	126	returns the B<completely decomposed> string equivalent to it.
	127
	128	If it is not decomposable, returns undef.
	129
	130	=item C<$uv_composite = getComposite($uv_here, $uv_next)>
	131
d85850a7	132	If two characters here and next (as codepoints) are composable
2a204b45	133	(including Hangul Jamo/Syllables and Exclusions),
	134	returns the codepoint of the composite.
	135
	136	If they are not composable, returns undef.
	137
	138	=item C<$combining_class = getCombinClass($codepoint)>
	139
	140	Returns the combining class as integer of the character.
	141
	142	=item C<$is_exclusion = isExclusion($codepoint)>
	143
	144	Returns a boolean whether the character of the specified codepoint is
	145	a composition exclusion.
	146
	147	=back
	148
	149	=head2 EXPORT
	150
	151	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
	152
	153	C<normalize> and other some functions: on request.
	154
4a2e806c	155	=head2 TODO
	156
	157	Unicode::Normalize has not been ported to EBCDIC. The code mostly
	158	would work just fine but a decision needs to be made: how the module
	159	should work in EBCDIC? Should the low 256 characters be understood as
	160	Unicode or as EBCDIC code points? Should one be chosen or should
	161	there be a way to do either? Or should such translation be left
	162	outside the module for the user to do, for example by using
	163	Encode::from_to()?
	164
2a204b45	165	=head1 AUTHOR
	166
	167	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
	168
	169	http://homepage1.nifty.com/nomenclator/perl/
	170
	171	Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
	172
	173	This program is free software; you can redistribute it and/or
	174	modify it under the same terms as Perl itself.
	175
	176	=head1 SEE ALSO
	177
	178	=over 4
	179
	180	=item http://www.unicode.org/unicode/reports/tr15/
	181
	182	Unicode Normalization Forms - UAX #15
	183
	184	=back
	185
	186	=cut
	187