Subject: Missing diagnostics
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531 1package Unicode::Normalize;
2
# Compile-time platform guard: refuse to load where ord("A") is 193, which
# this module treats as the sign of an EBCDIC platform (it has not been
# ported there; see the TODO section of the POD).
# The trailing "\n" keeps Perl from appending file/line to the message.
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Normalize not ported to EBCDIC\n";
6 }
7}
8
ac5ea531 9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
# Module version; also handed to bootstrap below.
d85850a7 14our $VERSION = '0.13';
# Package name kept in a variable so error messages can interpolate it.
ac5ea531 15our $PACKAGE = __PACKAGE__;
16
17require Exporter;
18require DynaLoader;
# NOTE(review): AutoLoader is loaded but is not in @ISA and nothing in the
# visible code uses it -- confirm whether it is still needed.
19require AutoLoader;
20
21our @ISA = qw(Exporter DynaLoader);
# The four normalization-form functions are exported by default.
22our @EXPORT = qw( NFC NFD NFKC NFKD );
# Exported on request only. Apart from normalize, these names are not defined
# in the visible Perl code; presumably the bootstrap call below provides them
# -- verify against the XS source.
2a204b45 23our @EXPORT_OK = qw(
24 normalize decompose reorder compose
25 getCanon getCompat getComposite getCombinClass isExclusion
26);
# ':all' tag = default exports plus the on-request ones.
ac5ea531 27our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
28
# Load the compiled part of the module through DynaLoader.
29bootstrap Unicode::Normalize $VERSION;
30
# Passed as the second argument to decompose() by NFKD and NFKC; presumably
# selects compatibility (rather than canonical) decomposition -- confirm
# against the decompose() implementation.
ac5ea531 31use constant COMPAT => 1;
32
# NFD($string): Normalization Form D. Passes the argument through decompose()
# and then reorder(). The ($) prototype makes the sub take one scalar argument.
d85850a7 33sub NFD ($) { reorder(decompose($_[0])) }
# NFKD($string): Normalization Form KD. Same pipeline as NFD, but decompose()
# is called with COMPAT as its second argument.
ac5ea531 34sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
35
# NFC($string): Normalization Form C. Runs decompose(), then reorder(), then
# compose() on the result.
d85850a7 36sub NFC ($) { compose(reorder(decompose($_[0]))) }
# NFKC($string): Normalization Form KC. Same pipeline as NFC, but decompose()
# is called with COMPAT as its second argument.
ac5ea531 37sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
38
# normalize($form_name, $string): dispatch to NFD/NFC/NFKD/NFKC by name.
#
# $form_name may be given with or without a leading "NF" ('D' or 'NFD',
# 'C' or 'NFC', 'KD' or 'NFKD', 'KC' or 'NFKC'). Returns whatever the
# selected function returns. Croaks (error reported from the caller's
# perspective) when the name matches none of the four forms.
39sub normalize($$)
40{
# shift removes the form name, so $_[0] below is the string to normalize.
d85850a7 41 my $form = shift;
# Strip an optional leading "NF" so both spellings compare equal below.
42 $form =~ s/^NF//;
# NOTE(review): because $form has already been stripped, the croak message
# shows the shortened name (e.g. 'NFX' is reported as "X") -- confirm whether
# the original spelling should be reported instead.
43 return
44 $form eq 'D' ? NFD ($_[0]) :
45 $form eq 'C' ? NFC ($_[0]) :
46 $form eq 'KD' ? NFKD($_[0]) :
47 $form eq 'KC' ? NFKC($_[0]) :
48 croak $PACKAGE."::normalize: invalid form name: $form";
ac5ea531 49}
50
511;
52__END__
2a204b45 53
54=head1 NAME
55
56Unicode::Normalize - normalized forms of Unicode text
57
58=head1 SYNOPSIS
59
60 use Unicode::Normalize;
61
62 $string_NFD = NFD($raw_string); # Normalization Form D
63 $string_NFC = NFC($raw_string); # Normalization Form C
64 $string_NFKD = NFKD($raw_string); # Normalization Form KD
65 $string_NFKC = NFKC($raw_string); # Normalization Form KC
66
67 or
68
69 use Unicode::Normalize 'normalize';
70
71 $string_NFD = normalize('D', $raw_string); # Normalization Form D
72 $string_NFC = normalize('C', $raw_string); # Normalization Form C
73 $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
74 $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
75
76=head1 DESCRIPTION
77
d85850a7 78=head2 Normalization Forms
2a204b45 79
80=over 4
81
82=item C<$string_NFD = NFD($raw_string)>
83
84returns the Normalization Form D (formed by canonical decomposition).
85
86
87=item C<$string_NFC = NFC($raw_string)>
88
89returns the Normalization Form C (formed by canonical decomposition
90followed by canonical composition).
91
92=item C<$string_NFKD = NFKD($raw_string)>
93
94returns the Normalization Form KD (formed by compatibility decomposition).
95
96=item C<$string_NFKC = NFKC($raw_string)>
97
98returns the Normalization Form KC (formed by compatibility decomposition
99followed by B<canonical> composition).
100
101=item C<$normalized_string = normalize($form_name, $raw_string)>
102
103As C<$form_name>, one of the following names must be given.
104
105 'C' or 'NFC' for Normalization Form C
106 'D' or 'NFD' for Normalization Form D
107 'KC' or 'NFKC' for Normalization Form KC
108 'KD' or 'NFKD' for Normalization Form KD
109
110=back
111
112=head2 Character Data
113
114These functions provide an interface to the character data used internally.
d0ed0342 115If you only want to get Unicode normalization forms, you do not need
116to call them yourself.
2a204b45 117
118=over 4
119
120=item C<$canonical_decomposed = getCanon($codepoint)>
121
122=item C<$compatibility_decomposed = getCompat($codepoint)>
123
124If the character of the specified codepoint is canonically or
125compatibility decomposable (including Hangul Syllables),
126returns the B<completely decomposed> string equivalent to it.
127
128If it is not decomposable, returns undef.
129
130=item C<$uv_composite = getComposite($uv_here, $uv_next)>
131
d85850a7 132If two characters here and next (as codepoints) are composable
2a204b45 133(including Hangul Jamo/Syllables and Exclusions),
134returns the codepoint of the composite.
135
136If they are not composable, returns undef.
137
138=item C<$combining_class = getCombinClass($codepoint)>
139
140Returns the combining class of the character as an integer.
141
142=item C<$is_exclusion = isExclusion($codepoint)>
143
144Returns a boolean indicating whether the character of the specified
145codepoint is a composition exclusion.
146
147=back
148
149=head2 EXPORT
150
151C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
152
153C<normalize> and some other functions: on request.
154
4a2e806c 155=head2 TODO
156
157Unicode::Normalize has not been ported to EBCDIC. The code mostly
158would work just fine but a decision needs to be made: how should the
159module work in EBCDIC? Should the low 256 characters be understood as
160Unicode or as EBCDIC code points? Should one be chosen or should
161there be a way to do either? Or should such translation be left
162outside the module for the user to do, for example by using
163Encode::from_to()?
164
2a204b45 165=head1 AUTHOR
166
167SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
168
169 http://homepage1.nifty.com/nomenclator/perl/
170
171 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
172
173 This program is free software; you can redistribute it and/or
174 modify it under the same terms as Perl itself.
175
176=head1 SEE ALSO
177
178=over 4
179
180=item http://www.unicode.org/unicode/reports/tr15/
181
182Unicode Normalization Forms - UAX #15
183
184=back
185
186=cut
187