[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm

package Unicode::Normalize;

BEGIN {
    # Compile-time sanity check: packing code point 0x41 with the 'U'
    # template must give the one-character string "A".  If it does not,
    # refuse to load the module at all.
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
	if "A" ne pack('U', 0x41);
}

use 5.006;
use strict;
use warnings;
use Carp;

# Module version; it is also handed to bootstrap() further down.
our $VERSION = '0.25';
# Package name, used as the prefix of the croak messages raised by
# normalize() and check().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

# Exporter supplies import(); DynaLoader supplies bootstrap().
our @ISA = qw(Exporter DynaLoader);
# Exported by default: the four normalization-form functions.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Exported on request only.
# NOTE(review): most of these names are not defined in this file and
# presumably come from the XS code loaded by bootstrap -- confirm there.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous
    splitOnLastStarter
);
# Import tags: ':all' is everything above; ':normalize', ':check' and
# ':fast' select the named subsets.
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

######

# Load the compiled part of the module through DynaLoader, passing $VERSION
# along.  NOTE(review): decompose(), reorder(), compose(), the check*()
# functions and the other exported names that are not defined in this file
# presumably become available here -- confirm against the XS source.
bootstrap Unicode::Normalize $VERSION;

######

# Builds a string from a list of code points by packing every argument
# with the 'U*' template.  With no arguments the result is an empty string.
sub pack_U {
    my @codepoints = @_;
    return pack 'U*', @codepoints;
}

# Returns the code points of the first argument, unpacked with 'U*'.
# The argument is first appended to an empty pack('U*') string;
# NOTE(review): presumably this forces character (UTF-8) semantics on the
# string before it is unpacked -- confirm against perlfunc pack/unpack.
sub unpack_U {
    my $upgraded = pack('U*') . shift;
    return unpack 'U*', $upgraded;
}


##
## normalization forms
##

# True flag passed as the second argument of decompose() by NFKD() and
# NFKC() to select the compatibility (rather than canonical) mapping.
use constant COMPAT => 1;

# Normalization Form D: decompose the string, then reorder the result.
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}
# Normalization Form KD: decompose the string with the COMPAT flag set,
# then reorder the result.
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}
# Normalization Form C: decompose the string, reorder the result,
# then compose it.
sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
# Normalization Form KC: decompose the string with the COMPAT flag set,
# reorder the result, then compose it.
sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}

# Returns the argument untouched when checkFCD() accepts it;
# otherwise returns its NFD form.
sub FCD ($) {
    my ($input) = @_;
    return $input if checkFCD($input);
    return NFD($input);
}
# FCC form: decompose the string, reorder the result, then apply
# composeContiguous() to it.
sub FCC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return composeContiguous($reordered);
}

# Dispatch table used by normalize(): form name => normalizing function.
# The NF* forms are reachable under their full name and a short alias.
our %formNorm = (
    C    => \&NFC,
    NFC  => \&NFC,
    D    => \&NFD,
    NFD  => \&NFD,
    KC   => \&NFKC,
    NFKC => \&NFKC,
    KD   => \&NFKD,
    NFKD => \&NFKD,
    FCC  => \&FCC,
    FCD  => \&FCD,
);

# normalize($form, $str)
# Looks $form up in %formNorm and returns the result of applying the
# matching function to $str.  Croaks when $form is not a key of the table.
sub normalize ($$) {
    my ($form, $str) = @_;

    croak($PACKAGE."::normalize: invalid form name: $form")
	unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}


##
## quick check
##

# Dispatch table used by check(): form name => quick-check function.
# The NF* forms are reachable under their full name and a short alias.
our %formCheck = (
    C    => \&checkNFC,
    NFC  => \&checkNFC,
    D    => \&checkNFD,
    NFD  => \&checkNFD,
    KC   => \&checkNFKC,
    NFKC => \&checkNFKC,
    KD   => \&checkNFKD,
    NFKD => \&checkNFKD,
    FCC  => \&checkFCC,
    FCD  => \&checkFCD,
);

# check($form, $str)
# Looks $form up in %formCheck and returns the result of applying the
# matching quick-check function to $str.  Croaks when $form is not a key
# of the table.
sub check ($$) {
    my ($form, $str) = @_;

    croak($PACKAGE."::check: invalid form name: $form")
	unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}

# A module file has to end with a true value for require/use to succeed.
1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics
(see F<perlunicode>).

C<$codepoint> should be an unsigned integer
representing a Unicode code point.

Note: The XS edition and the pure Perl edition are incompatible in how
they interpret C<$codepoint> as a decimal number.
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating-point number or a negative sign in C<$codepoint>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$FCD_string = FCD($string)>

If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
returns it without modification; otherwise returns an FCD string.

Note: FCD is not always unique, so several distinct forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.

=item C<$FCC_string = FCC($string)>

returns the FCC form ("Fast C Contiguous"; cf. UTN #5).

Note: FCC is unique, as are the four normalization forms (NF*).

=item C<$normalized_string = normalize($form_name, $string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string)>

=item C<$decomposed_string = decompose($string, $useCompatMapping)>

Decomposes the specified string and returns the result.

If the second parameter (a boolean) is omitted or false, decomposes it
using the Canonical Decomposition Mapping.
If true, decomposes it using the Compatibility Decomposition Mapping.

The string returned is not always in NFD/NFKD.
Reordering may be required.

    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()

=item C<$reordered_string  = reorder($string)>

Reorders the combining characters and the like in the canonical ordering
and returns the result.

E.g., when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them, saying

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string   = compose($string)>

Returns the string where composable pairs are composed.

E.g., when you have a NFD/NFKD string,
you can get its NFC/NFKC string, saying

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = checkNFKD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFKC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = checkFCD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkFCC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

If a string is not in C<FCD>, it cannot be in C<FCC>.
So C<checkFCC($not_FCD_string)> should return C<NO>.

=item C<$result = check($form_name, $string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

C<$form_name> takes the same values as for C<normalize()>.

=back

B<Note>

In the cases of NFD, NFKD, and FCD, the answer must be
either C<YES> or C<NO>. The answer C<MAYBE> may be returned
in the cases of NFC, NFKC, and FCC.

A C<MAYBE> string should contain at least one combining character
or the like. For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.

Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC/FCC;
i.e.,

    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)
    $string eq FCC($string)    # more thorough than checkFCC($string)

=head2 Character Data

These functions provide an interface to the character data used internally.
If you only want to get Unicode normalization forms, you do not need to
call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

If the character of the specified codepoint is canonically
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is compatibility
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.

If they are not composable, returns C<undef>.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean whether the character of the specified codepoint
is a composition exclusion.

=item C<$is_singleton = isSingleton($codepoint)>

Returns a boolean whether the character of the specified codepoint is
a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>

Returns a boolean whether the canonical decomposition
of the character of the specified codepoint
is a Non-Starter Decomposition.

=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>

Returns a boolean whether the character of the specified codepoint
may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 AUTHOR

SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This module is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/unicode/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=item http://www.unicode.org/notes/tn5/

Canonical Equivalence in Applications - UTN #5

=back

=cut
Commit	Line	Data
ac5ea531	1	package Unicode::Normalize;
ac5ea531	2
4a2e806c	3	BEGIN {
1efaba7f	4	unless ("A" eq pack('U', 0x41)) {
9f1f04a1	5	die "Unicode::Normalize cannot stringify a Unicode code point\n";
4a2e806c	6	}
	7	}
	8
ac5ea531	9	use 5.006;
	10	use strict;
	11	use warnings;
	12	use Carp;
	13
82e740b6	14	our $VERSION = '0.25';
ac5ea531	15	our $PACKAGE = __PACKAGE__;
	16
	17	require Exporter;
	18	require DynaLoader;
ac5ea531	19
	20	our @ISA = qw(Exporter DynaLoader);
	21	our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45	22	our @EXPORT_OK = qw(
2a204b45	23	normalize decompose reorder compose
8f118dcd	24	checkNFD checkNFKD checkNFC checkNFKC check
	25	getCanon getCompat getComposite getCombinClass
	26	isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
	27	isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
82e740b6	28	FCD checkFCD FCC checkFCC composeContiguous
82e740b6	29	splitOnLastStarter
8f118dcd	30	);
	31	our %EXPORT_TAGS = (
	32	all => [ @EXPORT, @EXPORT_OK ],
	33	normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
	34	check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
82e740b6	35	fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
2a204b45	36	);
ac5ea531	37
82e740b6	38	######
82e740b6	39
ac5ea531	40	bootstrap Unicode::Normalize $VERSION;
ac5ea531	41
82e740b6	42	######
82e740b6	43
9f1f04a1	44	sub pack_U {
b8d10bc1	45	return pack('U*', @_);
9f1f04a1	46	}
	47
	48	sub unpack_U {
b8d10bc1	49	return unpack('U', pack('U').shift);
9f1f04a1	50	}
9f1f04a1	51
82e740b6	52
	53	##
	54	## normalization forms
	55	##
	56
ac5ea531	57	use constant COMPAT => 1;
ac5ea531	58
d85850a7	59	sub NFD ($) { reorder(decompose($_[0])) }
ac5ea531	60	sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
d85850a7	61	sub NFC ($) { compose(reorder(decompose($_[0]))) }
ac5ea531	62	sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
ac5ea531	63
82e740b6	64	sub FCD ($) {
	65	my $str = shift;
	66	return checkFCD($str) ? $str : NFD($str);
	67	}
	68	sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) }
	69
	70	our %formNorm = (
	71	NFC => \&NFC, C => \&NFC,
	72	NFD => \&NFD, D => \&NFD,
	73	NFKC => \&NFKC, KC => \&NFKC,
	74	NFKD => \&NFKD, KD => \&NFKD,
	75	FCD => \&FCD, FCC => \&FCC,
	76	);
	77
ac5ea531	78	sub normalize($$)
ac5ea531	79	{
d85850a7	80	my $form = shift;
f027f502	81	my $str = shift;
82e740b6	82	return exists $formNorm{$form}
	83	? $formNorm{$form}->($str)
	84	: croak $PACKAGE."::normalize: invalid form name: $form";
ac5ea531	85	}
ac5ea531	86
82e740b6	87
	88	##
	89	## quick check
	90	##
	91
	92	our %formCheck = (
	93	NFC => \&checkNFC, C => \&checkNFC,
	94	NFD => \&checkNFD, D => \&checkNFD,
	95	NFKC => \&checkNFKC, KC => \&checkNFKC,
	96	NFKD => \&checkNFKD, KD => \&checkNFKD,
	97	FCD => \&checkFCD, FCC => \&checkFCC,
	98	);
	99
8f118dcd	100	sub check($$)
	101	{
	102	my $form = shift;
f027f502	103	my $str = shift;
82e740b6	104	return exists $formCheck{$form}
	105	? $formCheck{$form}->($str)
	106	: croak $PACKAGE."::check: invalid form name: $form";
8f118dcd	107	}
8f118dcd	108
ac5ea531	109	1;
ac5ea531	110	__END__
2a204b45	111
	112	=head1 NAME
	113
f027f502	114	Unicode::Normalize - Unicode Normalization Forms
2a204b45	115
	116	=head1 SYNOPSIS
	117
	118	use Unicode::Normalize;
	119
8f118dcd	120	$NFD_string = NFD($string); # Normalization Form D
	121	$NFC_string = NFC($string); # Normalization Form C
	122	$NFKD_string = NFKD($string); # Normalization Form KD
	123	$NFKC_string = NFKC($string); # Normalization Form KC
2a204b45	124
	125	or
	126
	127	use Unicode::Normalize 'normalize';
	128
8f118dcd	129	$NFD_string = normalize('D', $string); # Normalization Form D
	130	$NFC_string = normalize('C', $string); # Normalization Form C
	131	$NFKD_string = normalize('KD', $string); # Normalization Form KD
	132	$NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45	133
	134	=head1 DESCRIPTION
	135
00f2676f	136	Parameters:
	137
	138	C<$string> is used as a string under character semantics
	139	(see F<perlunicode>).
	140
	141	C<$codepoint> should be an unsigned integer
	142	representing a Unicode code point.
	143
	144	Note: Between XS edition and pure Perl edition,
	145	interpretation of C<$codepoint> as a decimal number has incompatibility.
	146	XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
	147	Do not use a floating point nor a negative sign in C<$codepoint>.
	148
d85850a7	149	=head2 Normalization Forms
2a204b45	150
	151	=over 4
	152
8f118dcd	153	=item C<$NFD_string = NFD($string)>
2a204b45	154
	155	returns the Normalization Form D (formed by canonical decomposition).
	156
8f118dcd	157	=item C<$NFC_string = NFC($string)>
2a204b45	158
	159	returns the Normalization Form C (formed by canonical decomposition
	160	followed by canonical composition).
	161
8f118dcd	162	=item C<$NFKD_string = NFKD($string)>
2a204b45	163
	164	returns the Normalization Form KD (formed by compatibility decomposition).
	165
8f118dcd	166	=item C<$NFKC_string = NFKC($string)>
2a204b45	167
	168	returns the Normalization Form KC (formed by compatibility decomposition
	169	followed by B<canonical> composition).
	170
82e740b6	171	=item C<$FCD_string = FCD($string)>
	172
	173	If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
	174	returns it without modification; otherwise returns an FCD string.
	175
	176	Note: FCD is not always unique, then plural forms may be equivalent
	177	each other. C<FCD()> will return one of these equivalent forms.
	178
	179	=item C<$FCC_string = FCC($string)>
	180
	181	returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
	182
	183	Note: FCD is unique, as well as four normalization forms (NF*).
	184
8f118dcd	185	=item C<$normalized_string = normalize($form_name, $string)>
2a204b45	186
	187	As C<$form_name>, one of the following names must be given.
	188
82e740b6	189	'C' or 'NFC' for Normalization Form C (UAX #15)
	190	'D' or 'NFD' for Normalization Form D (UAX #15)
	191	'KC' or 'NFKC' for Normalization Form KC (UAX #15)
	192	'KD' or 'NFKD' for Normalization Form KD (UAX #15)
	193
	194	'FCD' for "Fast C or D" Form (UTN #5)
	195	'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45	196
	197	=back
	198
8f118dcd	199	=head2 Decomposition and Composition
	200
	201	=over 4
	202
	203	=item C<$decomposed_string = decompose($string)>
	204
	205	=item C<$decomposed_string = decompose($string, $useCompatMapping)>
	206
9f1f04a1	207	Decomposes the specified string and returns the result.
8f118dcd	208
	209	If the second parameter (a boolean) is omitted or false, decomposes it
	210	using the Canonical Decomposition Mapping.
	211	If true, decomposes it using the Compatibility Decomposition Mapping.
	212
	213	The string returned is not always in NFD/NFKD.
	214	Reordering may be required.
	215
	216	$NFD_string = reorder(decompose($string)); # eq. to NFD()
	217	$NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
	218
	219	=item C<$reordered_string = reorder($string)>
	220
9f1f04a1	221	Reorders the combining characters and the like in the canonical ordering
8f118dcd	222	and returns the result.
	223
	224	E.g., when you have a list of NFD/NFKD strings,
	225	you can get the concatenated NFD/NFKD string from them, saying
	226
	227	$concat_NFD = reorder(join '', @NFD_strings);
	228	$concat_NFKD = reorder(join '', @NFKD_strings);
	229
	230	=item C<$composed_string = compose($string)>
	231
	232	Returns the string where composable pairs are composed.
	233
	234	E.g., when you have a NFD/NFKD string,
	235	you can get its NFC/NFKC string, saying
	236
	237	$NFC_string = compose($NFD_string);
	238	$NFKC_string = compose($NFKD_string);
	239
	240	=back
	241
	242	=head2 Quick Check
	243
82e740b6	244	(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd	245
	246	The following functions check whether the string is in that normalization form.
	247
	248	The result returned will be:
	249
	250	YES The string is in that normalization form.
	251	NO The string is not in that normalization form.
	252	MAYBE Dubious. Maybe yes, maybe no.
	253
	254	=over 4
	255
	256	=item C<$result = checkNFD($string)>
	257
f027f502	258	returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd	259
	260	=item C<$result = checkNFC($string)>
	261
f027f502	262	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	263
	264	=item C<$result = checkNFKD($string)>
	265
f027f502	266	returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd	267
	268	=item C<$result = checkNFKC($string)>
	269
f027f502	270	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	271
82e740b6	272	=item C<$result = checkFCD($string)>
	273
	274	returns C<YES> (C<1>) or C<NO> (C<empty string>).
	275
	276	=item C<$result = checkFCC($string)>
	277
	278	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
	279
	280	If a string is not in C<FCD>, it must not be in <FCC>.
	281	So C<checkFCC($not_FCD_string)> should return C<NO>.
	282
8f118dcd	283	=item C<$result = check($form_name, $string)>
8f118dcd	284
f027f502	285	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	286
	287	C<$form_name> is alike to that for C<normalize()>.
	288
	289	=back
	290
	291	B<Note>
	292
82e740b6	293	In the cases of NFD, NFKD, and FCD, the answer must be
	294	either C<YES> or C<NO>. The answer C<MAYBE> may be returned
	295	in the cases of NFC, NFKC, and FCC.
8f118dcd	296
82e740b6	297	A C<MAYBE> string should contain at least one combining character
82e740b6	298	or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd	299	the MAYBE_NFC/MAYBE_NFKC property.
82e740b6	300
8f118dcd	301	Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
8f118dcd	302	and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502	303	C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd	304	(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
	305	while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
	306
82e740b6	307	If you want to check exactly, compare the string with its NFC/NFKC/FCC;
82e740b6	308	i.e.,
8f118dcd	309
82e740b6	310	$string eq NFC($string) # thorough than checkNFC($string)
	311	$string eq NFKC($string) # thorough than checkNFKC($string)
	312	$string eq FCC($string) # thorough than checkFCC($string)
8f118dcd	313
2a204b45	314	=head2 Character Data
	315
	316	These functions are interface of character data used internally.
d0ed0342	317	If you want only to get Unicode normalization forms, you don't need
d0ed0342	318	call them yourself.
2a204b45	319
	320	=over 4
	321
	322	=item C<$canonical_decomposed = getCanon($codepoint)>
	323
8f118dcd	324	If the character of the specified codepoint is canonically
	325	decomposable (including Hangul Syllables),
	326	returns the B<completely decomposed> string canonically equivalent to it.
	327
f027f502	328	If it is not decomposable, returns C<undef>.
8f118dcd	329
2a204b45	330	=item C<$compatibility_decomposed = getCompat($codepoint)>
2a204b45	331
8f118dcd	332	If the character of the specified codepoint is compatibility
	333	decomposable (including Hangul Syllables),
	334	returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45	335
f027f502	336	If it is not decomposable, returns C<undef>.
2a204b45	337
8f118dcd	338	=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45	339
d85850a7	340	If two characters here and next (as codepoints) are composable
8f118dcd	341	(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45	342	returns the codepoint of the composite.
2a204b45	343
f027f502	344	If they are not composable, returns C<undef>.
2a204b45	345
	346	=item C<$combining_class = getCombinClass($codepoint)>
	347
8f118dcd	348	Returns the combining class of the character as an integer.
2a204b45	349
	350	=item C<$is_exclusion = isExclusion($codepoint)>
	351
8f118dcd	352	Returns a boolean whether the character of the specified codepoint
	353	is a composition exclusion.
	354
	355	=item C<$is_singleton = isSingleton($codepoint)>
	356
2a204b45	357	Returns a boolean whether the character of the specified codepoint is
8f118dcd	358	a singleton.
8f118dcd	359
6c941e0c	360	=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd	361
	362	Returns a boolean whether the canonical decomposition
	363	of the character of the specified codepoint
	364	is a Non-Starter Decomposition.
	365
	366	=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
	367
	368	Returns a boolean whether the character of the specified codepoint
	369	may be composed with the previous one in a certain composition
	370	(including Hangul Compositions, but excluding
	371	Composition Exclusions and Non-Starter Decompositions).
2a204b45	372
	373	=back
	374
	375	=head2 EXPORT
	376
	377	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
	378
	379	C<normalize> and other some functions: on request.
	380
	381	=head1 AUTHOR
	382
82e740b6	383	SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
2a204b45	384
	385	http://homepage1.nifty.com/nomenclator/perl/
	386
6c941e0c	387	Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45	388
6c941e0c	389	This module is free software; you can redistribute it
6c941e0c	390	and/or modify it under the same terms as Perl itself.
2a204b45	391
	392	=head1 SEE ALSO
	393
	394	=over 4
	395
	396	=item http://www.unicode.org/unicode/reports/tr15/
	397
	398	Unicode Normalization Forms - UAX #15
	399
14e6b36c	400	=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd	401
	402	Derived Normalization Properties
	403
82e740b6	404	=item http://www.unicode.org/notes/tn5/
	405
	406	Canonical Equivalence in Applications - UTN #5
	407
2a204b45	408	=back
	409
	410	=cut
	411