ext/Unicode/Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 use 5.006;
   4 use strict;
   5 use warnings;
   6 use Carp;
   7
   8 our $VERSION = '0.13';        # also handed to bootstrap below
   9 our $PACKAGE = __PACKAGE__;   # prefix of the croak message in normalize()
  10
  11 require Exporter;
  12 require DynaLoader;
  13 require AutoLoader;           # NOTE(review): not in @ISA nor otherwise used in the code visible here
  14
  15 our @ISA = qw(Exporter DynaLoader);
  16 our @EXPORT = qw( NFC NFD NFKC NFKD );   # exported by default
  17 our @EXPORT_OK = qw(
  18     normalize decompose reorder compose
  19     getCanon getCompat getComposite getCombinClass isExclusion
  20 );                                       # exported only on request
  21 our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );   # ':all' covers both lists above
  22
  23 bootstrap Unicode::Normalize $VERSION;   # presumably loads the compiled part that supplies decompose/reorder/compose etc., which are not defined in this file -- confirm
  24
  25 use constant COMPAT => 1;   # passed as decompose()'s second argument for the compatibility (K) forms
  26
  27 sub NFD  ($) { return reorder(decompose($_[0])) }          # Form D: canonical decomposition
  28 sub NFKD ($) { return reorder(decompose($_[0], COMPAT)) }  # Form KD: compatibility decomposition
  29
  30 sub NFC  ($) { return compose(NFD($_[0])) }   # Form C: Form D followed by composition
  31 sub NFKC ($) { return compose(NFKD($_[0])) }  # Form KC: Form KD followed by composition
  32
  33 sub normalize($$)  # normalize($form_name, $string): croaks on an unknown form name
  34 {
  35     my $name = shift;               # form name exactly as the caller spelled it
  36     (my $form = $name) =~ s/^NF//;  # strip a copy, so 'NFC' and 'C' select the same form
  37     return
  38         $form eq 'D'  ? NFD ($_[0]) :
  39         $form eq 'C'  ? NFC ($_[0]) :
  40         $form eq 'KD' ? NFKD($_[0]) :
  41         $form eq 'KC' ? NFKC($_[0]) :
  42       croak $PACKAGE."::normalize: invalid form name: $name";  # report the unstripped name
  43 }
  44
  45 1;
  46 __END__
  47
  48 =head1 NAME
  49
  50 Unicode::Normalize - normalized forms of Unicode text
  51
  52 =head1 SYNOPSIS
  53
  54   use Unicode::Normalize;
  55
  56   $string_NFD  = NFD($raw_string);  # Normalization Form D
  57   $string_NFC  = NFC($raw_string);  # Normalization Form C
  58   $string_NFKD = NFKD($raw_string); # Normalization Form KD
  59   $string_NFKC = NFKC($raw_string); # Normalization Form KC
  60
  61    or
  62
  63   use Unicode::Normalize 'normalize';
  64
  65   $string_NFD  = normalize('D',  $raw_string);  # Normalization Form D
  66   $string_NFC  = normalize('C',  $raw_string);  # Normalization Form C
  67   $string_NFKD = normalize('KD', $raw_string);  # Normalization Form KD
  68   $string_NFKC = normalize('KC', $raw_string);  # Normalization Form KC
  69
  70 =head1 DESCRIPTION
  71
  72 =head2 Normalization Forms
  73
  74 =over 4
  75
  76 =item C<$string_NFD = NFD($raw_string)>
  77
  78 returns the Normalization Form D (formed by canonical decomposition).
  79
  80
  81 =item C<$string_NFC = NFC($raw_string)>
  82
  83 returns the Normalization Form C (formed by canonical decomposition
  84 followed by canonical composition).
  85
  86 =item C<$string_NFKD = NFKD($raw_string)>
  87
  88 returns the Normalization Form KD (formed by compatibility decomposition).
  89
  90 =item C<$string_NFKC = NFKC($raw_string)>
  91
  92 returns the Normalization Form KC (formed by compatibility decomposition
  93 followed by B<canonical> composition).
  94
  95 =item C<$normalized_string = normalize($form_name, $raw_string)>
  96
  97 C<$form_name> must be one of the following names.
  98
  99   'C'  or 'NFC'  for Normalization Form C
 100   'D'  or 'NFD'  for Normalization Form D
 101   'KC' or 'NFKC' for Normalization Form KC
 102   'KD' or 'NFKD' for Normalization Form KD
 103
 104 =back
 105
 106 =head2 Character Data
 107
 108 These functions are the interface to the character data used internally.
 109 If you only want to get Unicode normalization forms, you don't need to
 110 call them yourself.
 111
 112 =over 4
 113
 114 =item C<$canonical_decomposed = getCanon($codepoint)>
 115
 116 =item C<$compatibility_decomposed = getCompat($codepoint)>
 117
 118 If the character of the specified codepoint is canonically or
 119 compatibility decomposable (including Hangul Syllables),
 120 returns the B<completely decomposed> string equivalent to it.
 121
 122 If it is not decomposable, returns undef.
 123
 124 =item C<$uv_composite = getComposite($uv_here, $uv_next)>
 125
 126 If two characters here and next (as codepoints) are composable
 127 (including Hangul Jamo/Syllables and Exclusions),
 128 returns the codepoint of the composite.
 129
 130 If they are not composable, returns undef.
 131
 132 =item C<$combining_class = getCombinClass($codepoint)>
 133
 134 Returns the combining class of the character as an integer.
 135
 136 =item C<$is_exclusion = isExclusion($codepoint)>
 137
 138 Returns a boolean indicating whether the character of the specified
 139 codepoint is a composition exclusion.
 140
 141 =back
 142
 143 =head2 EXPORT
 144
 145 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 146
 147 C<normalize> and some other functions: on request.
 148
 149 =head1 AUTHOR
 150
 151 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 152
 153   http://homepage1.nifty.com/nomenclator/perl/
 154
 155   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 156
 157   This program is free software; you can redistribute it and/or
 158   modify it under the same terms as Perl itself.
 159
 160 =head1 SEE ALSO
 161
 162 =over 4
 163
 164 =item http://www.unicode.org/unicode/reports/tr15/
 165
 166 Unicode Normalization Forms - UAX #15
 167
 168 =back
 169
 170 =cut
 171