ext/Encode/Encode.pm

   1 package Encode;
   2 # Perl-side front end of the Encode extension; the API is described in the POD below.
   3 $VERSION = 0.01;
   4
   5 require DynaLoader;
   6 require Exporter;
   7 # Inherit from both modules loaded above; bootstrap (below) is invoked on this class.
   8 @ISA = qw(Exporter DynaLoader);
   9 # Only @EXPORT_OK is populated here, so callers must request each function by name.
  10 @EXPORT_OK =
  11     qw(
  12        bytes_to_utf8
  13        utf8_to_bytes
  14        chars_to_utf8
  15        utf8_to_chars
  16        utf8_to_chars_check
  17        bytes_to_chars
  18        chars_to_bytes
  19        from_to
  20        is_utf8
  21        on_utf8
  22        off_utf8
  23        utf_to_utf
  24       );
  25 # Load the compiled code; presumably it defines the _-prefixed subs used below (TODO confirm).
  26 bootstrap Encode ();
  27
  28 =pod
  29
  30 =head1 NAME
  31
  32 Encode - character encodings
  33
  34 =head2 TERMINOLOGY
  35
  36 =over
  37
  38 =item *
  39
  40 I<char>: a character in the range 0..maxint (at least 2**32-1)
  41
  42 =item *
  43
  44 I<byte>: a character in the range 0..255
  45
  46 =back
  47
  48 The marker [INTERNAL] marks Internal Implementation Details, in
  49 general meant only for those who think they know what they are doing,
  50 and such details may change in future releases.
  51
  52 =head2 bytes
  53
  54 =over 4
  55
  56 =item *
  57
  58         bytes_to_utf8(STRING [, FROM])
  59
  60 The bytes in STRING are recoded in-place into UTF-8.  If no FROM is
  61 specified the bytes are expected to be encoded in US-ASCII or ISO
  62 8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
  63 there's a failure.
  64
  65 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
  66
  67 =item *
  68
  69         utf8_to_bytes(STRING [, TO [, CHECK]])
  70
  71 The UTF-8 in STRING is decoded in-place into bytes.  If no TO encoding
  72 is specified the bytes are expected to be encoded in US-ASCII or ISO
  73 8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
  74 there's a failure.
  75
  76 What if there are characters > 255?  What if the UTF-8 in STRING is
  77 malformed?  See L</"Handling Malformed Data">.
  78
  79 [INTERNAL] The UTF-8 flag of STRING is not checked.
  80
  81 =back
  82
  83 =head2 chars
  84
  85 =over 4
  86
  87 =item *
  88
  89         chars_to_utf8(STRING)
  90
  91 The chars in STRING are encoded in-place into UTF-8.  Returns the new
  92 size of STRING, or C<undef> if there's a failure.
  93
  94 No assumptions are made on the encoding of the chars.  If you want to
  95 assume that the chars are Unicode and to trap illegal Unicode
  96 characters, you must use C<from_to('Unicode', ...)>.
  97
  98 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
  99
 100
 101
 102 =item *
 103
 104         utf8_to_chars(STRING)
 105
 106 The UTF-8 in STRING is decoded in-place into chars.  Returns the new
 107 size of STRING, or C<undef> if there's a failure.
 108
 109 If the UTF-8 in STRING is malformed C<undef> is returned, and also an
 110 optional lexical warning (category utf8) is given.
 111
 112 [INTERNAL] The UTF-8 flag of STRING is not checked.
 113
 114 =item *
 115
 116         utf8_to_chars_check(STRING [, CHECK])
 117
 118 (Note the special naming of this interface: a two-argument
 119 utf8_to_chars() has different semantics.)
 120
 121 The UTF-8 in STRING is decoded in-place into chars.  Returns the new
 122 size of STRING, or C<undef> if there is a failure.
 123
 124 What if the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.
 125
 126 [INTERNAL] The UTF-8 flag of STRING is not checked.
 127
 128 =back
 129
 130 =head2 chars With Encoding
 131
 132 =over 4
 133
 134 =item *
 135
 136         chars_to_utf8(STRING, FROM [, CHECK])
 137
 138 The chars in STRING encoded in FROM are recoded in-place into UTF-8.
 139 Returns the new size of STRING, or C<undef> if there's a failure.
 140
 141 No assumptions are made on the encoding of the chars.  If you want to
 142 assume that the chars are Unicode and to trap illegal Unicode
 143 characters, you must use C<from_to('Unicode', ...)>.
 144
 145 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
 146
 147 =item *
 148
 149         utf8_to_chars(STRING, TO [, CHECK])
 150
 151 The UTF-8 in STRING is decoded in-place into chars encoded in TO.
 152 Returns the new size of STRING, or C<undef> if there's a failure.
 153
 154 What if the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.
 155
 156 [INTERNAL] The UTF-8 flag of STRING is not checked.
 157
 158 =item *
 159
 160         bytes_to_chars(STRING, FROM [, CHECK])
 161
 162 The bytes in STRING encoded in FROM are recoded in-place into chars.
 163 Returns the new size of STRING in bytes, or C<undef> if there's a
 164 failure.
 165
 166 What if the mapping is impossible?  See L</"Handling Malformed Data">.
 167
 168 =item *
 169
 170         chars_to_bytes(STRING, TO [, CHECK])
 171
 172 The chars in STRING are recoded in-place to bytes encoded in TO.
 173 Returns the new size of STRING in bytes, or C<undef> if there's a
 174 failure.
 175
 176 What if the mapping is impossible?  See L</"Handling Malformed Data">.
 177
 178 =item *
 179
 180         from_to(STRING, FROM, TO [, CHECK])
 181
 182 The chars in STRING encoded in FROM are recoded in-place into TO.
 183 Returns the new size of STRING, or C<undef> if there's a failure.
 184
 185 What if mapping between the encodings is impossible?
 186 See L</"Handling Malformed Data">.
 187
 188 [INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
 189
 190 =back
 191
 192 =head2 Testing For UTF-8
 193
 194 =over 4
 195
 196 =item *
 197
 198         is_utf8(STRING [, CHECK])
 199
 200 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 201 If CHECK is true, also checks the data in STRING for being
 202 well-formed UTF-8.  Returns true if successful, false otherwise.
 203
 204 =back
 205
 206 =head2 Toggling UTF-8-ness
 207
 208 =over 4
 209
 210 =item *
 211
 212         on_utf8(STRING)
 213
 214 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 215 B<not> checked for being well-formed UTF-8.  Do not use unless you
 216 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 217 state of the UTF-8 flag (so please don't test the return value as
 218 I<not> success or failure), or C<undef> if STRING is not a string.
 219
 220 =item *
 221
 222         off_utf8(STRING)
 223
 224 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 225 Returns the previous state of the UTF-8 flag (so please don't test the
 226 return value as I<not> success or failure), or C<undef> if STRING is
 227 not a string.
 228
 229 =back
 230
 231 =head2 UTF-16 and UTF-32 Encodings
 232
 233 =over 4
 234
 235 =item *
 236
 237         utf_to_utf(STRING, FROM, TO [, CHECK])
 238
 239 The data in STRING is converted from Unicode Transfer Encoding FROM to
 240 Unicode Transfer Encoding TO.  Both FROM and TO may be any of the
 241 following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
 242
 243         tag             meaning
 244
 245         '7'             UTF-7
 246         '8'             UTF-8
 247         '16be'          UTF-16 big-endian
 248         '16le'          UTF-16 little-endian
 249         '16'            UTF-16 native-endian
 250         '32be'          UTF-32 big-endian
 251         '32le'          UTF-32 little-endian
 252         '32'            UTF-32 native-endian
 253
 254 UTF-16 is also known as UCS-2, 16-bit or 2-byte chunks, and UTF-32 as
 255 UCS-4, 32-bit or 4-byte chunks.  Returns the new size of STRING, or
 256 C<undef> if there's a failure.
 257
 258 What if FROM is UTF-8 and the UTF-8 in STRING is malformed?  See
 259 L</"Handling Malformed Data">.
 260
 261 [INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
 262 STRING is not checked.  If TO is UTF-8, also the UTF-8 flag of STRING is
 263 turned on.  Identical FROM and TO are fine.
 264
 265 =back
 266
 267 =head2 Handling Malformed Data
 268
 269 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 270 be UTF-8, an optional lexical warning (category utf8) is given.  If
 271 CHECK is true but not a code reference, dies.  If CHECK is a code
 272 reference, it is called with the arguments
 273
 274         (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
 275
 276 Two return values are expected from the call: the string to be used in
 277 the result string in place of the malformed section, and the length of
 278 the malformed section in bytes.
 279
 280 =cut
 281
 282 sub bytes_to_utf8 {    # exported entry point: bytes_to_utf8(STRING [, FROM]), see POD
 283     return &_bytes_to_utf8;    # bare &call hands this sub's own @_ to the callee
 284 }
 285
 286 sub utf8_to_bytes {    # exported entry point: utf8_to_bytes(STRING [, TO [, CHECK]]), see POD
 287     return &_utf8_to_bytes;    # bare &call hands this sub's own @_ to the callee
 288 }
 289
 290 sub chars_to_utf8 {    # exported entry point: chars_to_utf8(STRING [, FROM [, CHECK]]), see POD
 291     return &_chars_to_utf8;    # was &C_to_utf8, which matches no sibling's _<name> target; bare &call forwards @_
 292 }
 293
 294 sub utf8_to_chars {    # exported entry point: utf8_to_chars(STRING [, TO [, CHECK]]), see POD
 295     return &_utf8_to_chars;    # bare &call hands this sub's own @_ to the callee
 296 }
 297
 298 sub utf8_to_chars_check {    # exported entry point: utf8_to_chars_check(STRING [, CHECK]), see POD
 299     return &_utf8_to_chars_check;    # bare &call hands this sub's own @_ to the callee
 300 }
 301
 302 sub bytes_to_chars {    # exported entry point: bytes_to_chars(STRING, FROM [, CHECK]), see POD
 303     return &_bytes_to_chars;    # bare &call hands this sub's own @_ to the callee
 304 }
 305
 306 sub chars_to_bytes {    # exported entry point: chars_to_bytes(STRING, TO [, CHECK]), see POD
 307     return &_chars_to_bytes;    # bare &call hands this sub's own @_ to the callee
 308 }
 309
 310 sub from_to {    # exported entry point: from_to(STRING, FROM, TO [, CHECK]), see POD
 311     return &_from_to;    # bare &call hands this sub's own @_ to the callee
 312 }
 313
 314 sub is_utf8 {    # exported entry point: is_utf8(STRING [, CHECK]), see POD
 315     return &_is_utf8;    # bare &call hands this sub's own @_ to the callee
 316 }
 317
 318 sub on_utf8 {    # exported entry point: on_utf8(STRING), see POD
 319     return &_on_utf8;    # bare &call hands this sub's own @_ to the callee
 320 }
 321
 322 sub off_utf8 {    # exported entry point: off_utf8(STRING), see POD
 323     return &_off_utf8;    # bare &call hands this sub's own @_ to the callee
 324 }
 325
 326 sub utf_to_utf {    # exported entry point: utf_to_utf(STRING, FROM, TO [, CHECK]), see POD
 327     return &_utf_to_utf;    # bare &call hands this sub's own @_ to the callee
 328 }
 329