Integrate mainline (for ndbm fixes etc.)
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7
d85850a7 8our $VERSION = '0.13';
ac5ea531 9our $PACKAGE = __PACKAGE__;
10
11require Exporter;
12require DynaLoader;
13require AutoLoader;
14
# Inherit from Exporter and DynaLoader.
our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization-form functions.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Exported on request only: normalize() and the lower-level functions.
our @EXPORT_OK = (
    'normalize', 'decompose', 'reorder', 'compose',
    'getCanon', 'getCompat', 'getComposite', 'getCombinClass', 'isExclusion',
);

# ':all' pulls in every exportable name.
our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]);

# Load the compiled part of the module; bootstrap() is reached through @ISA.
Unicode::Normalize->bootstrap($VERSION);

# Passed as the second argument of decompose() by NFKD() and NFKC().
use constant COMPAT => 1;
26
# NFD($string): returns Normalization Form D of $string, obtained by
# running decompose() on it and passing the result through reorder().
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}
# NFKD($string): returns Normalization Form KD of $string, obtained by
# running decompose() with the COMPAT flag and passing the result
# through reorder().
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}
29
# NFC($string): returns Normalization Form C of $string, obtained by
# chaining decompose(), reorder() and compose() in that order.
sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
# NFKC($string): returns Normalization Form KC of $string, obtained by
# chaining decompose() with the COMPAT flag, reorder() and compose()
# in that order.
sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
32
# normalize($form_name, $raw_string)
#
# Dispatches to NFD(), NFC(), NFKD() or NFKC() according to $form_name and
# returns that function's result for $raw_string.
#
# $form_name is one of 'D', 'C', 'KD', 'KC', optionally prefixed with 'NF'
# (so 'NFKC' and 'KC' are equivalent).  Any other value, including undef,
# makes the function croak; the message quotes the name exactly as the
# caller supplied it.
sub normalize($$)
{
    my $form = shift;

    # Strip the optional 'NF' prefix on a copy, so the diagnostic below can
    # still show what was actually passed in.  An undefined name is mapped
    # to '' here to avoid uninitialized-value warnings; '' matches no form.
    my $key = defined $form ? $form : '';
    $key =~ s/^NF//;

    return
        $key eq 'D'  ? NFD ($_[0]) :
        $key eq 'C'  ? NFC ($_[0]) :
        $key eq 'KD' ? NFKD($_[0]) :
        $key eq 'KC' ? NFKC($_[0]) :
        croak($PACKAGE . "::normalize: invalid form name: "
            . (defined $form ? $form : '(undef)'));
}
44
451;
46__END__
2a204b45 47
48=head1 NAME
49
50Unicode::Normalize - normalized forms of Unicode text
51
52=head1 SYNOPSIS
53
54 use Unicode::Normalize;
55
56 $string_NFD = NFD($raw_string); # Normalization Form D
57 $string_NFC = NFC($raw_string); # Normalization Form C
58 $string_NFKD = NFKD($raw_string); # Normalization Form KD
59 $string_NFKC = NFKC($raw_string); # Normalization Form KC
60
61 or
62
63 use Unicode::Normalize 'normalize';
64
65 $string_NFD = normalize('D', $raw_string); # Normalization Form D
66 $string_NFC = normalize('C', $raw_string); # Normalization Form C
67 $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
68 $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
69
70=head1 DESCRIPTION
71
d85850a7 72=head2 Normalization Forms
2a204b45 73
74=over 4
75
76=item C<$string_NFD = NFD($raw_string)>
77
78returns the Normalization Form D (formed by canonical decomposition).
79
80
81=item C<$string_NFC = NFC($raw_string)>
82
83returns the Normalization Form C (formed by canonical decomposition
84followed by canonical composition).
85
86=item C<$string_NFKD = NFKD($raw_string)>
87
88returns the Normalization Form KD (formed by compatibility decomposition).
89
90=item C<$string_NFKC = NFKC($raw_string)>
91
92returns the Normalization Form KC (formed by compatibility decomposition
93followed by B<canonical> composition).
94
95=item C<$normalized_string = normalize($form_name, $raw_string)>
96
returns the normalization form of C<$raw_string> selected by
C<$form_name>, which must be one of the following names:
98
99 'C' or 'NFC' for Normalization Form C
100 'D' or 'NFD' for Normalization Form D
101 'KC' or 'NFKC' for Normalization Form KC
102 'KD' or 'NFKD' for Normalization Form KD
103
104=back
105
106=head2 Character Data
107
These functions are the interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.
2a204b45 111
112=over 4
113
114=item C<$canonical_decomposed = getCanon($codepoint)>
115
116=item C<$compatibility_decomposed = getCompat($codepoint)>
117
118If the character of the specified codepoint is canonically or
119compatibility decomposable (including Hangul Syllables),
120returns the B<completely decomposed> string equivalent to it.
121
122If it is not decomposable, returns undef.
123
124=item C<$uv_composite = getComposite($uv_here, $uv_next)>
125
d85850a7 126If two characters here and next (as codepoints) are composable
2a204b45 127(including Hangul Jamo/Syllables and Exclusions),
128returns the codepoint of the composite.
129
130If they are not composable, returns undef.
131
132=item C<$combining_class = getCombinClass($codepoint)>
133
Returns the combining class of the character as an integer.
135
136=item C<$is_exclusion = isExclusion($codepoint)>
137
Returns a boolean indicating whether the character of the specified
codepoint is a composition exclusion.
140
141=back
142
143=head2 EXPORT
144
145C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
146
C<normalize> and some other functions: on request.
148
149=head1 AUTHOR
150
151SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
152
153 http://homepage1.nifty.com/nomenclator/perl/
154
155 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
156
157 This program is free software; you can redistribute it and/or
158 modify it under the same terms as Perl itself.
159
160=head1 SEE ALSO
161
162=over 4
163
164=item http://www.unicode.org/unicode/reports/tr15/
165
166Unicode Normalization Forms - UAX #15
167
168=back
169
170=cut
171