ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {    # 'A' has ordinal 193 on EBCDIC platforms (65 in ASCII)
   5         die "Unicode::Normalize not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;    # provides croak(), used by normalize()
  13
  14 our $VERSION = '0.14';
  15 our $PACKAGE = __PACKAGE__;    # package name, used as a prefix in error messages
  16
  17 require Exporter;
  18 require DynaLoader;
  19 require AutoLoader;
  20
  21 our @ISA = qw(Exporter DynaLoader);
  22 our @EXPORT = qw( NFC NFD NFKC NFKD );    # exported by default
  23 our @EXPORT_OK = qw(
  24     normalize decompose reorder compose
  25     getCanon getCompat getComposite getCombinClass isExclusion
  26 );    # exported only on request
  27 our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );    # ':all' imports everything listed above
  28
  29 bootstrap Unicode::Normalize $VERSION;    # DynaLoader bootstrap; presumably supplies decompose(), reorder(), compose(), etc., which are not defined in this file
  30
  31 use constant COMPAT => 1;    # passed as the second argument to decompose() by NFKD/NFKC to select compatibility decomposition
  32
  33 sub NFD  ($) { return reorder(decompose($_[0])); }    # Form D: decompose() the string, then reorder() the result
  34 sub NFKD ($) { return reorder(decompose($_[0], COMPAT)); }    # Form KD: as NFD, but decompose() is called with the COMPAT flag
  35
  36 sub NFC  ($) { return compose(reorder(decompose($_[0]))); }    # Form C: decompose(), reorder(), then compose()
  37 sub NFKC ($) { return compose(reorder(decompose($_[0], COMPAT))); }    # Form KC: as NFC, but decompose() is called with the COMPAT flag
  38
  39 sub normalize($$)    # normalize(FORM, STRING): returns STRING converted to the named normalization form
  40 {
  41     my $form = shift;    # form name exactly as passed; after the shift, $_[0] is the string
  42     (my $name = $form) =~ s/^NF//;    # strip the prefix from a copy so 'NFC' and 'C' select the same form
  43     return
  44         $name eq 'D'  ? NFD ($_[0]) :
  45         $name eq 'C'  ? NFC ($_[0]) :
  46         $name eq 'KD' ? NFKD($_[0]) :
  47         $name eq 'KC' ? NFKC($_[0]) :
  48       croak $PACKAGE."::normalize: invalid form name: $form";    # croaks on any other name, reporting it as the caller wrote it
  49 }
  50
  51 1;
  52 __END__
  53
  54 =head1 NAME
  55
  56 Unicode::Normalize - normalized forms of Unicode text
  57
  58 =head1 SYNOPSIS
  59
  60   use Unicode::Normalize;
  61
  62   $string_NFD  = NFD($raw_string);  # Normalization Form D
  63   $string_NFC  = NFC($raw_string);  # Normalization Form C
  64   $string_NFKD = NFKD($raw_string); # Normalization Form KD
  65   $string_NFKC = NFKC($raw_string); # Normalization Form KC
  66
  67    or
  68
  69   use Unicode::Normalize 'normalize';
  70
  71   $string_NFD  = normalize('D',  $raw_string);  # Normalization Form D
  72   $string_NFC  = normalize('C',  $raw_string);  # Normalization Form C
  73   $string_NFKD = normalize('KD', $raw_string);  # Normalization Form KD
  74   $string_NFKC = normalize('KC', $raw_string);  # Normalization Form KC
  75
  76 =head1 DESCRIPTION
  77
  78 =head2 Normalization Forms
  79
  80 =over 4
  81
  82 =item C<$string_NFD = NFD($raw_string)>
  83
  84 returns the Normalization Form D (formed by canonical decomposition).
  85
  86
  87 =item C<$string_NFC = NFC($raw_string)>
  88
  89 returns the Normalization Form C (formed by canonical decomposition
  90 followed by canonical composition).
  91
  92 =item C<$string_NFKD = NFKD($raw_string)>
  93
  94 returns the Normalization Form KD (formed by compatibility decomposition).
  95
  96 =item C<$string_NFKC = NFKC($raw_string)>
  97
  98 returns the Normalization Form KC (formed by compatibility decomposition
  99 followed by B<canonical> composition).
 100
 101 =item C<$normalized_string = normalize($form_name, $raw_string)>
 102
 103 As C<$form_name>, one of the following names must be given.
 104
 105   'C'  or 'NFC'  for Normalization Form C
 106   'D'  or 'NFD'  for Normalization Form D
 107   'KC' or 'NFKC' for Normalization Form KC
 108   'KD' or 'NFKD' for Normalization Form KD
 109
 110 =back
 111
 112 =head2 Character Data
 113
 114 These functions provide an interface to the character data used internally.
 115 If you only want to get Unicode normalization forms, you don't need
 116 to call them yourself.
 117
 118 =over 4
 119
 120 =item C<$canonical_decomposed = getCanon($codepoint)>
 121
 122 =item C<$compatibility_decomposed = getCompat($codepoint)>
 123
 124 If the character of the specified codepoint is canonically or
 125 compatibility decomposable (including Hangul Syllables),
 126 returns the B<completely decomposed> string equivalent to it.
 127
 128 If it is not decomposable, returns undef.
 129
 130 =item C<$uv_composite = getComposite($uv_here, $uv_next)>
 131
 132 If two characters here and next (as codepoints) are composable
 133 (including Hangul Jamo/Syllables and Exclusions),
 134 returns the codepoint of the composite.
 135
 136 If they are not composable, returns undef.
 137
 138 =item C<$combining_class = getCombinClass($codepoint)>
 139
 140 Returns the combining class of the character as an integer.
 141
 142 =item C<$is_exclusion = isExclusion($codepoint)>
 143
 144 Returns a boolean indicating whether the character of the specified
 145 codepoint is a composition exclusion.
 146
 147 =back
 148
 149 =head2 EXPORT
 150
 151 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 152
 153 C<normalize> and some other functions: on request.
 154
 155 =head2 TODO
 156
 157 Unicode::Normalize has not been ported to EBCDIC.  The code mostly
 158 would work just fine but a decision needs to be made: how should the
 159 module work in EBCDIC?  Should the low 256 characters be understood as
 160 Unicode or as EBCDIC code points?  Should one be chosen or should
 161 there be a way to do either?  Or should such translation be left
 162 outside the module for the user to do, for example by using
 163 Encode::from_to()?
 164
 165 =head1 AUTHOR
 166
 167 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 168
 169   http://homepage1.nifty.com/nomenclator/perl/
 170
 171   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
 172
 173   This program is free software; you can redistribute it and/or
 174   modify it under the same terms as Perl itself.
 175
 176 =head1 SEE ALSO
 177
 178 =over 4
 179
 180 =item http://www.unicode.org/unicode/reports/tr15/
 181
 182 Unicode Normalization Forms - UAX #15
 183
 184 =back
 185
 186 =cut
 187