[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm

package Unicode::Normalize;

# Compile-time sanity check: pack('U', 0x41) must produce the string "A".
# If it does not, loading the module is aborted with the message below.
# NOTE(review): presumably guards against perls/platforms where the 'U'
# template does not map code point 0x41 to "A" -- confirm.
BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;

# Module version; also handed to bootstrap at the end of this setup section.
our $VERSION = '0.23';
# Package name, used as the prefix of the croak messages raised by
# normalize() and check().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;
# NOTE(review): AutoLoader is loaded but nothing in the visible source
# uses it -- confirm before removing.
require AutoLoader;

# Inherit from Exporter (symbol export) and DynaLoader (bootstrap).
our @ISA = qw(Exporter DynaLoader);
# Exported by default: the four normalization-form functions.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Exported on request only.  Of these, only normalize and check are defined
# in the visible Perl source; the others are presumably provided by the
# code loaded via bootstrap -- TODO confirm.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
);
# Export tags:
#   :all        every default and on-request symbol
#   :normalize  the default exports plus normalize/decompose/reorder/compose
#   :check      the quick-check functions
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
);

# Load the compiled part of the module through DynaLoader, passing the
# module version along.
bootstrap Unicode::Normalize $VERSION;

# pack_U(@codepoints)
#
# Builds a string from a list of Unicode code points using the 'U*'
# pack template and returns it.  An empty list yields an empty string.
sub pack_U {
    my $string = pack('U*', @_);
    return $string;
}

# unpack_U($string)
#
# Returns the code points of $string as unpacked by the 'U*' template
# (in scalar context, the first one only, as unpack itself does).
#
# The argument is first appended to the result of an argument-less
# pack('U*'), and the combined string is what gets unpacked.
sub unpack_U {
    my $prefixed = pack('U*') . shift;
    return unpack('U*', $prefixed);
}

# True flag passed as the second argument of decompose() by NFKD() and
# NFKC() to request the compatibility decomposition mapping.
use constant COMPAT => 1;

# NFD($string): Normalization Form D.
# Decomposes the argument (no compatibility flag) and then reorders
# the result.
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}
# NFKD($string): Normalization Form KD.
# Decomposes the argument with the COMPAT flag set and then reorders
# the result.
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}
# NFC($string): Normalization Form C.
# Decomposes the argument (no compatibility flag), reorders the result,
# and finally composes it.
sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
# NFKC($string): Normalization Form KC.
# Decomposes the argument with the COMPAT flag set, reorders the result,
# and finally composes it.
sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}

# normalize($form_name, $string)
#
# Dispatches to NFD/NFC/NFKD/NFKC according to $form_name, which may be
# given with or without a leading "NF" ('D' or 'NFD', 'KC' or 'NFKC', ...).
# Croaks if the name, after the leading "NF" has been stripped, is not
# one of D, C, KD or KC.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Accept both the short and the "NF"-prefixed spelling.
    $form =~ s/^NF//;

    return NFD ($str) if $form eq 'D';
    return NFC ($str) if $form eq 'C';
    return NFKD($str) if $form eq 'KD';
    return NFKC($str) if $form eq 'KC';

    croak $PACKAGE . "::normalize: invalid form name: $form";
}

# check($form_name, $string)
#
# Dispatches to checkNFD/checkNFC/checkNFKD/checkNFKC according to
# $form_name, which may be given with or without a leading "NF".
# Returns whatever the selected check function returns.
# Croaks if the name, after the leading "NF" has been stripped, is not
# one of D, C, KD or KC.
sub check($$)
{
    my ($form, $str) = @_;

    # Accept both the short and the "NF"-prefixed spelling.
    $form =~ s/^NF//;

    return checkNFD ($str) if $form eq 'D';
    return checkNFC ($str) if $form eq 'C';
    return checkNFKD($str) if $form eq 'KD';
    return checkNFKC($str) if $form eq 'KC';

    croak $PACKAGE . "::check: invalid form name: $form";
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics
(see F<perlunicode>).

C<$codepoint> should be an unsigned integer
representing a Unicode code point.

Note: The XS edition and the pure Perl edition interpret a C<$codepoint>
given as a decimal number differently:
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating-point value or a negative sign in C<$codepoint>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$normalized_string = normalize($form_name, $string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string)>

=item C<$decomposed_string = decompose($string, $useCompatMapping)>

Decomposes the specified string and returns the result.

If the second parameter (a boolean) is omitted or false, decomposes it
using the Canonical Decomposition Mapping.
If true, decomposes it using the Compatibility Decomposition Mapping.

The string returned is not always in NFD/NFKD.
Reordering may be required.

    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()

=item C<$reordered_string  = reorder($string)>

Reorders the combining characters and the like in the canonical ordering
and returns the result.

E.g., when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them, saying

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string   = compose($string)>

Returns the string where composable pairs are composed.

E.g., when you have a NFD/NFKD string,
you can get its NFC/NFKC string, saying

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = checkNFKD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFKC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = check($form_name, $string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

C<$form_name> takes the same values as for C<normalize()>.

=back

B<Note>

In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
The answer C<MAYBE> may be returned in the cases of NFC and NFKC.

A MAYBE-NFC/NFKC string should contain at least
one combining character or the like.
For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.
Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC; i.e.,

    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)

=head2 Character Data

These functions provide an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

If the character of the specified codepoint is canonically
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is compatibility
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.

If they are not composable, returns C<undef>.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean whether the character of the specified codepoint
is a composition exclusion.

=item C<$is_singleton = isSingleton($codepoint)>

Returns a boolean whether the character of the specified codepoint is
a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>

Returns a boolean whether the canonical decomposition
of the character of the specified codepoint
is a Non-Starter Decomposition.

=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>

Returns a boolean whether the character of the specified codepoint
may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This module is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/unicode/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=back

=cut
Commit	Line	Data
ac5ea531	1	package Unicode::Normalize;
ac5ea531	2
4a2e806c	3	BEGIN {
1efaba7f	4	unless ("A" eq pack('U', 0x41)) {
9f1f04a1	5	die "Unicode::Normalize cannot stringify a Unicode code point\n";
4a2e806c	6	}
	7	}
	8
ac5ea531	9	use 5.006;
	10	use strict;
	11	use warnings;
	12	use Carp;
	13
00f2676f	14	our $VERSION = '0.23';
ac5ea531	15	our $PACKAGE = __PACKAGE__;
	16
	17	require Exporter;
	18	require DynaLoader;
	19	require AutoLoader;
	20
	21	our @ISA = qw(Exporter DynaLoader);
	22	our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45	23	our @EXPORT_OK = qw(
2a204b45	24	normalize decompose reorder compose
8f118dcd	25	checkNFD checkNFKD checkNFC checkNFKC check
	26	getCanon getCompat getComposite getCombinClass
	27	isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
	28	isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
	29	);
	30	our %EXPORT_TAGS = (
	31	all => [ @EXPORT, @EXPORT_OK ],
	32	normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
	33	check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
2a204b45	34	);
ac5ea531	35
	36	bootstrap Unicode::Normalize $VERSION;
	37
9f1f04a1	38	sub pack_U {
b8d10bc1	39	return pack('U*', @_);
9f1f04a1	40	}
	41
	42	sub unpack_U {
b8d10bc1	43	return unpack('U*', pack('U*').shift);
9f1f04a1	44	}
9f1f04a1	45
ac5ea531	46	use constant COMPAT => 1;
ac5ea531	47
d85850a7	48	sub NFD ($) { reorder(decompose($_[0])) }
ac5ea531	49	sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
d85850a7	50	sub NFC ($) { compose(reorder(decompose($_[0]))) }
ac5ea531	51	sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
	52
	53	sub normalize($$)
	54	{
d85850a7	55	my $form = shift;
f027f502	56	my $str = shift;
d85850a7	57	$form =~ s/^NF//;
d85850a7	58	return
f027f502	59	$form eq 'D' ? NFD ($str) :
	60	$form eq 'C' ? NFC ($str) :
	61	$form eq 'KD' ? NFKD($str) :
	62	$form eq 'KC' ? NFKC($str) :
d85850a7	63	croak $PACKAGE."::normalize: invalid form name: $form";
ac5ea531	64	}
ac5ea531	65
8f118dcd	66	sub check($$)
	67	{
	68	my $form = shift;
f027f502	69	my $str = shift;
8f118dcd	70	$form =~ s/^NF//;
8f118dcd	71	return
f027f502	72	$form eq 'D' ? checkNFD ($str) :
	73	$form eq 'C' ? checkNFC ($str) :
	74	$form eq 'KD' ? checkNFKD($str) :
	75	$form eq 'KC' ? checkNFKC($str) :
8f118dcd	76	croak $PACKAGE."::check: invalid form name: $form";
	77	}
	78
ac5ea531	79	1;
ac5ea531	80	__END__
2a204b45	81
	82	=head1 NAME
	83
f027f502	84	Unicode::Normalize - Unicode Normalization Forms
2a204b45	85
	86	=head1 SYNOPSIS
	87
	88	use Unicode::Normalize;
	89
8f118dcd	90	$NFD_string = NFD($string); # Normalization Form D
	91	$NFC_string = NFC($string); # Normalization Form C
	92	$NFKD_string = NFKD($string); # Normalization Form KD
	93	$NFKC_string = NFKC($string); # Normalization Form KC
2a204b45	94
	95	or
	96
	97	use Unicode::Normalize 'normalize';
	98
8f118dcd	99	$NFD_string = normalize('D', $string); # Normalization Form D
	100	$NFC_string = normalize('C', $string); # Normalization Form C
	101	$NFKD_string = normalize('KD', $string); # Normalization Form KD
	102	$NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45	103
	104	=head1 DESCRIPTION
	105
00f2676f	106	Parameters:
	107
	108	C<$string> is used as a string under character semantics
	109	(see F<perlunicode>).
	110
	111	C<$codepoint> should be an unsigned integer
	112	representing a Unicode code point.
	113
	114	Note: Between XS edition and pure Perl edition,
	115	interpretation of C<$codepoint> as a decimal number has incompatibility.
	116	XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
	117	Do not use a floating point nor a negative sign in C<$codepoint>.
	118
d85850a7	119	=head2 Normalization Forms
2a204b45	120
	121	=over 4
	122
8f118dcd	123	=item C<$NFD_string = NFD($string)>
2a204b45	124
	125	returns the Normalization Form D (formed by canonical decomposition).
	126
8f118dcd	127	=item C<$NFC_string = NFC($string)>
2a204b45	128
	129	returns the Normalization Form C (formed by canonical decomposition
	130	followed by canonical composition).
	131
8f118dcd	132	=item C<$NFKD_string = NFKD($string)>
2a204b45	133
	134	returns the Normalization Form KD (formed by compatibility decomposition).
	135
8f118dcd	136	=item C<$NFKC_string = NFKC($string)>
2a204b45	137
	138	returns the Normalization Form KC (formed by compatibility decomposition
	139	followed by B<canonical> composition).
	140
8f118dcd	141	=item C<$normalized_string = normalize($form_name, $string)>
2a204b45	142
	143	As C<$form_name>, one of the following names must be given.
	144
	145	'C' or 'NFC' for Normalization Form C
	146	'D' or 'NFD' for Normalization Form D
	147	'KC' or 'NFKC' for Normalization Form KC
	148	'KD' or 'NFKD' for Normalization Form KD
	149
	150	=back
	151
8f118dcd	152	=head2 Decomposition and Composition
	153
	154	=over 4
	155
	156	=item C<$decomposed_string = decompose($string)>
	157
	158	=item C<$decomposed_string = decompose($string, $useCompatMapping)>
	159
9f1f04a1	160	Decomposes the specified string and returns the result.
8f118dcd	161
	162	If the second parameter (a boolean) is omitted or false, decomposes it
	163	using the Canonical Decomposition Mapping.
	164	If true, decomposes it using the Compatibility Decomposition Mapping.
	165
	166	The string returned is not always in NFD/NFKD.
	167	Reordering may be required.
	168
	169	$NFD_string = reorder(decompose($string)); # eq. to NFD()
	170	$NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
	171
	172	=item C<$reordered_string = reorder($string)>
	173
9f1f04a1	174	Reorders the combining characters and the like in the canonical ordering
8f118dcd	175	and returns the result.
	176
	177	E.g., when you have a list of NFD/NFKD strings,
	178	you can get the concatenated NFD/NFKD string from them, saying
	179
	180	$concat_NFD = reorder(join '', @NFD_strings);
	181	$concat_NFKD = reorder(join '', @NFKD_strings);
	182
	183	=item C<$composed_string = compose($string)>
	184
	185	Returns the string where composable pairs are composed.
	186
	187	E.g., when you have a NFD/NFKD string,
	188	you can get its NFC/NFKC string, saying
	189
	190	$NFC_string = compose($NFD_string);
	191	$NFKC_string = compose($NFKD_string);
	192
	193	=back
	194
	195	=head2 Quick Check
	196
6c941e0c	197	(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
8f118dcd	198
	199	The following functions check whether the string is in that normalization form.
	200
	201	The result returned will be:
	202
	203	YES The string is in that normalization form.
	204	NO The string is not in that normalization form.
	205	MAYBE Dubious. Maybe yes, maybe no.
	206
	207	=over 4
	208
	209	=item C<$result = checkNFD($string)>
	210
f027f502	211	returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd	212
	213	=item C<$result = checkNFC($string)>
	214
f027f502	215	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	216
	217	=item C<$result = checkNFKD($string)>
	218
f027f502	219	returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd	220
	221	=item C<$result = checkNFKC($string)>
	222
f027f502	223	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	224
	225	=item C<$result = check($form_name, $string)>
	226
f027f502	227	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	228
	229	C<$form_name> is alike to that for C<normalize()>.
	230
	231	=back
	232
	233	B<Note>
	234
	235	In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
	236	The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
	237
	238	A MAYBE-NFC/NFKC string should contain at least
	239	one combining character or the like.
	240	For example, C<COMBINING ACUTE ACCENT> has
	241	the MAYBE_NFC/MAYBE_NFKC property.
	242	Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
	243	and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502	244	C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd	245	(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
	246	while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
	247
	248	If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
	249
	250	$string eq NFC($string) # more thorough than checkNFC($string)
	251	$string eq NFKC($string) # more thorough than checkNFKC($string)
	252
2a204b45	253	=head2 Character Data
	254
	255	These functions are interface of character data used internally.
d0ed0342	256	If you want only to get Unicode normalization forms, you don't need
d0ed0342	257	call them yourself.
2a204b45	258
	259	=over 4
	260
	261	=item C<$canonical_decomposed = getCanon($codepoint)>
	262
8f118dcd	263	If the character of the specified codepoint is canonically
	264	decomposable (including Hangul Syllables),
	265	returns the B<completely decomposed> string canonically equivalent to it.
	266
f027f502	267	If it is not decomposable, returns C<undef>.
8f118dcd	268
2a204b45	269	=item C<$compatibility_decomposed = getCompat($codepoint)>
2a204b45	270
8f118dcd	271	If the character of the specified codepoint is compatibility
	272	decomposable (including Hangul Syllables),
	273	returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45	274
f027f502	275	If it is not decomposable, returns C<undef>.
2a204b45	276
8f118dcd	277	=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45	278
d85850a7	279	If two characters here and next (as codepoints) are composable
8f118dcd	280	(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45	281	returns the codepoint of the composite.
2a204b45	282
f027f502	283	If they are not composable, returns C<undef>.
2a204b45	284
	285	=item C<$combining_class = getCombinClass($codepoint)>
	286
8f118dcd	287	Returns the combining class of the character as an integer.
2a204b45	288
	289	=item C<$is_exclusion = isExclusion($codepoint)>
	290
8f118dcd	291	Returns a boolean whether the character of the specified codepoint
	292	is a composition exclusion.
	293
	294	=item C<$is_singleton = isSingleton($codepoint)>
	295
2a204b45	296	Returns a boolean whether the character of the specified codepoint is
8f118dcd	297	a singleton.
8f118dcd	298
6c941e0c	299	=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd	300
	301	Returns a boolean whether the canonical decomposition
	302	of the character of the specified codepoint
	303	is a Non-Starter Decomposition.
	304
	305	=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
	306
	307	Returns a boolean whether the character of the specified codepoint
	308	may be composed with the previous one in a certain composition
	309	(including Hangul Compositions, but excluding
	310	Composition Exclusions and Non-Starter Decompositions).
2a204b45	311
	312	=back
	313
	314	=head2 EXPORT
	315
	316	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
	317
	318	C<normalize> and other some functions: on request.
	319
	320	=head1 AUTHOR
	321
	322	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
	323
	324	http://homepage1.nifty.com/nomenclator/perl/
	325
6c941e0c	326	Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45	327
6c941e0c	328	This module is free software; you can redistribute it
6c941e0c	329	and/or modify it under the same terms as Perl itself.
2a204b45	330
	331	=head1 SEE ALSO
	332
	333	=over 4
	334
	335	=item http://www.unicode.org/unicode/reports/tr15/
	336
	337	Unicode Normalization Forms - UAX #15
	338
14e6b36c	339	=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd	340
	341	Derived Normalization Properties
	342
2a204b45	343	=back
	344
	345	=cut
	346