[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm

package Unicode::Normalize;

BEGIN {
    # Compile-time sanity check: packing the code point of "A" with the 'U'
    # template must give back the one-character string "A".  Both the
    # literal value 0x41 and the platform's own ord("A") are tried; loading
    # is refused only when neither attempt round-trips.
    if ("A" ne pack('U', 0x41) && "A" ne pack('U', ord("A"))) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;

# Module version; also handed to bootstrap below.
our $VERSION = '0.21';
# Package name, used as the prefix of the messages croaked by normalize()
# and check().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;
# NOTE(review): AutoLoader is loaded here but not otherwise referenced in the
# code visible in this file -- confirm whether it is still needed.
require AutoLoader;

our @ISA = qw(Exporter DynaLoader);
# Exported by default: the four normalization-form functions.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Exported on request.  Several of these names (decompose, reorder, compose,
# the checkNF* functions and the get*/is* accessors) are not defined in this
# file; presumably they are supplied by the compiled extension loaded by the
# bootstrap call below.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
);
# Export tags:
#   :all        everything in @EXPORT and @EXPORT_OK
#   :normalize  the default exports plus normalize/decompose/reorder/compose
#   :check      the quick-check functions
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
);

# Load the compiled part of the module through DynaLoader (inherited via @ISA).
bootstrap Unicode::Normalize $VERSION;

# pack_U(@codepoints)
# Returns the string whose characters are the given code points, in order.
sub pack_U {
    my @codepoints = @_;
    return pack 'U*', @codepoints;
}

# unpack_U($string)
# Returns the code points of the characters of $string (the first one only
# when called in scalar context).  The argument is appended to an empty
# string produced by pack('U*') before being unpacked, exactly as the
# one-line form of this routine did.
sub unpack_U {
    my $text     = shift;
    my $prefixed = pack('U*') . $text;
    return unpack 'U*', $prefixed;
}

# True value passed as the second argument of decompose() by NFKD() and
# NFKC() to request the compatibility (rather than canonical) mapping.
use constant COMPAT => 1;

# NFD($string)
# Normalization Form D: decompose the argument with the canonical mapping,
# then put the result into canonical order.
sub NFD ($) {
    return reorder( decompose( $_[0] ) );
}
# NFKD($string)
# Normalization Form KD: decompose the argument with the compatibility
# mapping (second argument COMPAT), then put the result into canonical order.
sub NFKD ($) {
    return reorder( decompose( $_[0], COMPAT ) );
}
# NFC($string)
# Normalization Form C: decompose the argument with the canonical mapping,
# reorder the result, then compose it.
sub NFC ($) {
    return compose( reorder( decompose( $_[0] ) ) );
}
# NFKC($string)
# Normalization Form KC: decompose the argument with the compatibility
# mapping (second argument COMPAT), reorder the result, then compose it.
sub NFKC ($) {
    return compose( reorder( decompose( $_[0], COMPAT ) ) );
}

# normalize($form_name, $string)
#
# Returns $string converted to the normalization form named by $form_name.
# Accepted names are 'D', 'C', 'KD' and 'KC', each optionally prefixed with
# 'NF' ('NFD', 'NFC', 'NFKD', 'NFKC'); the call is forwarded to NFD(), NFC(),
# NFKD() or NFKC() respectively.
#
# Croaks with "<package>::normalize: invalid form name: <name>" for any other
# name.  The name is quoted exactly as the caller passed it (an undefined
# name is shown as "(undef)").
sub normalize($$)
{
    my $form = shift;
    my $str = shift;

    # Strip the optional 'NF' prefix from a copy, so that the diagnostic
    # below still shows what the caller actually wrote, and so that an
    # undefined name does not trigger uninitialized-value warnings first.
    my $key = defined $form ? $form : '';
    $key =~ s/^NF//;

    return NFD ($str) if $key eq 'D';
    return NFC ($str) if $key eq 'C';
    return NFKD($str) if $key eq 'KD';
    return NFKC($str) if $key eq 'KC';

    croak($PACKAGE."::normalize: invalid form name: "
	. (defined $form ? $form : '(undef)'));
}

# check($form_name, $string)
#
# Quick check of whether $string is in the normalization form named by
# $form_name.  Accepted names are 'D', 'C', 'KD' and 'KC', each optionally
# prefixed with 'NF'; the call is forwarded to checkNFD(), checkNFC(),
# checkNFKD() or checkNFKC() respectively and that function's result is
# returned unchanged.
#
# Croaks with "<package>::check: invalid form name: <name>" for any other
# name.  The name is quoted exactly as the caller passed it (an undefined
# name is shown as "(undef)").
sub check($$)
{
    my $form = shift;
    my $str = shift;

    # Strip the optional 'NF' prefix from a copy, so that the diagnostic
    # below still shows what the caller actually wrote, and so that an
    # undefined name does not trigger uninitialized-value warnings first.
    my $key = defined $form ? $form : '';
    $key =~ s/^NF//;

    return checkNFD ($str) if $key eq 'D';
    return checkNFC ($str) if $key eq 'C';
    return checkNFKD($str) if $key eq 'KD';
    return checkNFKC($str) if $key eq 'KC';

    croak($PACKAGE."::check: invalid form name: "
	. (defined $form ? $form : '(undef)'));
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$normalized_string = normalize($form_name, $string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string)>

=item C<$decomposed_string = decompose($string, $useCompatMapping)>

Decomposes the specified string and returns the result.

If the second parameter (a boolean) is omitted or false, decomposes it
using the Canonical Decomposition Mapping.
If true, decomposes it using the Compatibility Decomposition Mapping.

The string returned is not always in NFD/NFKD.
Reordering may be required.

    $NFD_string  = reorder(decompose($string));    # equivalent to NFD()
    $NFKD_string = reorder(decompose($string, 1)); # equivalent to NFKD()

=item C<$reordered_string  = reorder($string)>

Reorders the combining characters and the like in the canonical ordering
and returns the result.

E.g., when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them, saying

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string   = compose($string)>

Returns the string where composable pairs are composed.

E.g., when you have a NFD/NFKD string,
you can get its NFC/NFKC string, saying

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = checkNFKD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFKC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = check($form_name, $string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

C<$form_name> takes the same values as the C<$form_name> of C<normalize()>.

=back

B<Note>

In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
The answer C<MAYBE> may be returned in the cases of NFC and NFKC.

A MAYBE-NFC/NFKC string should contain at least
one combining character or the like.
For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.
Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC; i.e.,

    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)

=head2 Character Data

These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

If the character of the specified codepoint is canonically
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is compatibility
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.

If they are not composable, returns C<undef>.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean whether the character of the specified codepoint
is a composition exclusion.

=item C<$is_singleton = isSingleton($codepoint)>

Returns a boolean whether the character of the specified codepoint is
a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>

Returns a boolean whether the canonical decomposition
of the character of the specified codepoint
is a Non-Starter Decomposition.

=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>

Returns a boolean whether the character of the specified codepoint
may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This module is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/unicode/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=back

=cut
Commit	Line	Data
ac5ea531	1	package Unicode::Normalize;
ac5ea531	2
4a2e806c	3	BEGIN {
9f1f04a1	4	unless ("A" eq pack('U', 0x41) \|\| "A" eq pack('U', ord("A"))) {
9f1f04a1	5	die "Unicode::Normalize cannot stringify a Unicode code point\n";
4a2e806c	6	}
	7	}
	8
ac5ea531	9	use 5.006;
	10	use strict;
	11	use warnings;
	12	use Carp;
	13
9f1f04a1	14	our $VERSION = '0.21';
ac5ea531	15	our $PACKAGE = __PACKAGE__;
	16
	17	require Exporter;
	18	require DynaLoader;
	19	require AutoLoader;
	20
	21	our @ISA = qw(Exporter DynaLoader);
	22	our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45	23	our @EXPORT_OK = qw(
2a204b45	24	normalize decompose reorder compose
8f118dcd	25	checkNFD checkNFKD checkNFC checkNFKC check
	26	getCanon getCompat getComposite getCombinClass
	27	isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
	28	isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
	29	);
	30	our %EXPORT_TAGS = (
	31	all => [ @EXPORT, @EXPORT_OK ],
	32	normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
	33	check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
2a204b45	34	);
ac5ea531	35
	36	bootstrap Unicode::Normalize $VERSION;
	37
9f1f04a1	38	sub pack_U {
b8d10bc1	39	return pack('U*', @_);
9f1f04a1	40	}
	41
	42	sub unpack_U {
b8d10bc1	43	return unpack('U', pack('U').shift);
9f1f04a1	44	}
9f1f04a1	45
ac5ea531	46	use constant COMPAT => 1;
ac5ea531	47
d85850a7	48	sub NFD ($) { reorder(decompose($_[0])) }
ac5ea531	49	sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
d85850a7	50	sub NFC ($) { compose(reorder(decompose($_[0]))) }
ac5ea531	51	sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
	52
	53	sub normalize($$)
	54	{
d85850a7	55	my $form = shift;
f027f502	56	my $str = shift;
d85850a7	57	$form =~ s/^NF//;
d85850a7	58	return
f027f502	59	$form eq 'D' ? NFD ($str) :
	60	$form eq 'C' ? NFC ($str) :
	61	$form eq 'KD' ? NFKD($str) :
	62	$form eq 'KC' ? NFKC($str) :
d85850a7	63	croak $PACKAGE."::normalize: invalid form name: $form";
ac5ea531	64	}
ac5ea531	65
8f118dcd	66	sub check($$)
	67	{
	68	my $form = shift;
f027f502	69	my $str = shift;
8f118dcd	70	$form =~ s/^NF//;
8f118dcd	71	return
f027f502	72	$form eq 'D' ? checkNFD ($str) :
	73	$form eq 'C' ? checkNFC ($str) :
	74	$form eq 'KD' ? checkNFKD($str) :
	75	$form eq 'KC' ? checkNFKC($str) :
8f118dcd	76	croak $PACKAGE."::check: invalid form name: $form";
	77	}
	78
ac5ea531	79	1;
ac5ea531	80	__END__
2a204b45	81
	82	=head1 NAME
	83
f027f502	84	Unicode::Normalize - Unicode Normalization Forms
2a204b45	85
	86	=head1 SYNOPSIS
	87
	88	use Unicode::Normalize;
	89
8f118dcd	90	$NFD_string = NFD($string); # Normalization Form D
	91	$NFC_string = NFC($string); # Normalization Form C
	92	$NFKD_string = NFKD($string); # Normalization Form KD
	93	$NFKC_string = NFKC($string); # Normalization Form KC
2a204b45	94
	95	or
	96
	97	use Unicode::Normalize 'normalize';
	98
8f118dcd	99	$NFD_string = normalize('D', $string); # Normalization Form D
	100	$NFC_string = normalize('C', $string); # Normalization Form C
	101	$NFKD_string = normalize('KD', $string); # Normalization Form KD
	102	$NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45	103
	104	=head1 DESCRIPTION
	105
d85850a7	106	=head2 Normalization Forms
2a204b45	107
	108	=over 4
	109
8f118dcd	110	=item C<$NFD_string = NFD($string)>
2a204b45	111
	112	returns the Normalization Form D (formed by canonical decomposition).
	113
8f118dcd	114	=item C<$NFC_string = NFC($string)>
2a204b45	115
	116	returns the Normalization Form C (formed by canonical decomposition
	117	followed by canonical composition).
	118
8f118dcd	119	=item C<$NFKD_string = NFKD($string)>
2a204b45	120
	121	returns the Normalization Form KD (formed by compatibility decomposition).
	122
8f118dcd	123	=item C<$NFKC_string = NFKC($string)>
2a204b45	124
	125	returns the Normalization Form KC (formed by compatibility decomposition
	126	followed by B<canonical> composition).
	127
8f118dcd	128	=item C<$normalized_string = normalize($form_name, $string)>
2a204b45	129
	130	As C<$form_name>, one of the following names must be given.
	131
	132	'C' or 'NFC' for Normalization Form C
	133	'D' or 'NFD' for Normalization Form D
	134	'KC' or 'NFKC' for Normalization Form KC
	135	'KD' or 'NFKD' for Normalization Form KD
	136
	137	=back
	138
8f118dcd	139	=head2 Decomposition and Composition
	140
	141	=over 4
	142
	143	=item C<$decomposed_string = decompose($string)>
	144
	145	=item C<$decomposed_string = decompose($string, $useCompatMapping)>
	146
9f1f04a1	147	Decomposes the specified string and returns the result.
8f118dcd	148
	149	If the second parameter (a boolean) is omitted or false, decomposes it
	150	using the Canonical Decomposition Mapping.
	151	If true, decomposes it using the Compatibility Decomposition Mapping.
	152
	153	The string returned is not always in NFD/NFKD.
	154	Reordering may be required.
	155
	156	$NFD_string = reorder(decompose($string)); # eq. to NFD()
	157	$NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
	158
	159	=item C<$reordered_string = reorder($string)>
	160
9f1f04a1	161	Reorders the combining characters and the like in the canonical ordering
8f118dcd	162	and returns the result.
	163
	164	E.g., when you have a list of NFD/NFKD strings,
	165	you can get the concatenated NFD/NFKD string from them, saying
	166
	167	$concat_NFD = reorder(join '', @NFD_strings);
	168	$concat_NFKD = reorder(join '', @NFKD_strings);
	169
	170	=item C<$composed_string = compose($string)>
	171
	172	Returns the string where composable pairs are composed.
	173
	174	E.g., when you have a NFD/NFKD string,
	175	you can get its NFC/NFKC string, saying
	176
	177	$NFC_string = compose($NFD_string);
	178	$NFKC_string = compose($NFKD_string);
	179
	180	=back
	181
	182	=head2 Quick Check
	183
6c941e0c	184	(see Annex 8, UAX #15, and F<DerivedNormalizationProps.txt>)
8f118dcd	185
	186	The following functions check whether the string is in that normalization form.
	187
	188	The result returned will be:
	189
	190	YES The string is in that normalization form.
	191	NO The string is not in that normalization form.
	192	MAYBE Dubious. Maybe yes, maybe no.
	193
	194	=over 4
	195
	196	=item C<$result = checkNFD($string)>
	197
f027f502	198	returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd	199
	200	=item C<$result = checkNFC($string)>
	201
f027f502	202	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	203
	204	=item C<$result = checkNFKD($string)>
	205
f027f502	206	returns C<YES> (C<1>) or C<NO> (C<empty string>).
8f118dcd	207
	208	=item C<$result = checkNFKC($string)>
	209
f027f502	210	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	211
	212	=item C<$result = check($form_name, $string)>
	213
f027f502	214	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
8f118dcd	215
	216	C<$form_name> is alike to that for C<normalize()>.
	217
	218	=back
	219
	220	B<Note>
	221
	222	In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
	223	The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
	224
	225	A MAYBE-NFC/NFKC string should contain at least
	226	one combining character or the like.
	227	For example, C<COMBINING ACUTE ACCENT> has
	228	the MAYBE_NFC/MAYBE_NFKC property.
	229	Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
	230	and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502	231	C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd	232	(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
	233	while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
	234
	235	If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
	236
	237	$string eq NFC($string) # more thorough than checkNFC($string)
	238	$string eq NFKC($string) # more thorough than checkNFKC($string)
	239
2a204b45	240	=head2 Character Data
	241
	242	These functions are interface of character data used internally.
d0ed0342	243	If you want only to get Unicode normalization forms, you don't need
d0ed0342	244	call them yourself.
2a204b45	245
	246	=over 4
	247
	248	=item C<$canonical_decomposed = getCanon($codepoint)>
	249
8f118dcd	250	If the character of the specified codepoint is canonically
	251	decomposable (including Hangul Syllables),
	252	returns the B<completely decomposed> string canonically equivalent to it.
	253
f027f502	254	If it is not decomposable, returns C<undef>.
8f118dcd	255
2a204b45	256	=item C<$compatibility_decomposed = getCompat($codepoint)>
2a204b45	257
8f118dcd	258	If the character of the specified codepoint is compatibility
	259	decomposable (including Hangul Syllables),
	260	returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45	261
f027f502	262	If it is not decomposable, returns C<undef>.
2a204b45	263
8f118dcd	264	=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45	265
d85850a7	266	If two characters here and next (as codepoints) are composable
8f118dcd	267	(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45	268	returns the codepoint of the composite.
2a204b45	269
f027f502	270	If they are not composable, returns C<undef>.
2a204b45	271
	272	=item C<$combining_class = getCombinClass($codepoint)>
	273
8f118dcd	274	Returns the combining class of the character as an integer.
2a204b45	275
	276	=item C<$is_exclusion = isExclusion($codepoint)>
	277
8f118dcd	278	Returns a boolean whether the character of the specified codepoint
	279	is a composition exclusion.
	280
	281	=item C<$is_singleton = isSingleton($codepoint)>
	282
2a204b45	283	Returns a boolean whether the character of the specified codepoint is
8f118dcd	284	a singleton.
8f118dcd	285
6c941e0c	286	=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
8f118dcd	287
	288	Returns a boolean whether the canonical decomposition
	289	of the character of the specified codepoint
	290	is a Non-Starter Decomposition.
	291
	292	=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
	293
	294	Returns a boolean whether the character of the specified codepoint
	295	may be composed with the previous one in a certain composition
	296	(including Hangul Compositions, but excluding
	297	Composition Exclusions and Non-Starter Decompositions).
2a204b45	298
	299	=back
	300
	301	=head2 EXPORT
	302
	303	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
	304
	305	C<normalize> and other some functions: on request.
	306
	307	=head1 AUTHOR
	308
	309	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
	310
	311	http://homepage1.nifty.com/nomenclator/perl/
	312
6c941e0c	313	Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45	314
6c941e0c	315	This module is free software; you can redistribute it
6c941e0c	316	and/or modify it under the same terms as Perl itself.
2a204b45	317
	318	=head1 SEE ALSO
	319
	320	=over 4
	321
	322	=item http://www.unicode.org/unicode/reports/tr15/
	323
	324	Unicode Normalization Forms - UAX #15
	325
14e6b36c	326	=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd	327
	328	Derived Normalization Properties
	329
2a204b45	330	=back
	331
	332	=cut
	333