[p5sagit/p5-mst-13.2.git] / ext / Encode / encengine.c

/*
Data structures for encoding transformations.

Perl works internally in either a native 'byte' encoding or
in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
representation. When we do we can use utf8_to_uv().

Most character encodings are either simple byte mappings or
variable length multi-byte encodings. UTF-8 can be viewed as a
rather extreme case of the latter.

So to solve an important part of perl's encode needs we need to solve the
"multi-byte -> multi-byte" case. The simple byte forms are then just a
degenerate case. (Where one of the multi-byte encodings will usually be UTF-8.)

The other type of encoding is a shift encoding where a prefix sequence
determines what subsequent bytes mean. Such encodings have state.

We also need to handle the case where a character in one encoding has to be
represented as multiple characters in the other, e.g. letter+diacritic.

The process can be considered as pseudo perl:

my $dst = '';
while (length($src))
 {
  my $size    = src_count($src);
  my $in_seq  = substr($src,0,$size,'');
  my $out_seq = $s2d_hash{$in_seq};
  if (defined $out_seq)
   {
    $dst .= $out_seq;
   }
  else
   {
    # an error condition
   }
 }
return $dst;

That has the following components:
 &src_count - a "rule" for how many bytes make up the next character in the
              source.
 %s2d_hash  - a mapping from input sequences to output sequences

The problem with that scheme is that it does not allow the output
character repertoire to affect the characters considered from the
input.

So we use a "trie" representation which can also be considered
a state machine:

my $dst   = '';
my $seq   = \@s2d_seq;
my $next  = \@s2d_next;
while (length($src))
 {
  my $byte    = substr($src,0,1,'');
  my $out_seq = $seq->[$byte];
  if (defined $out_seq)
   {
    $dst .= $out_seq;
   }
  else
   {
    # an error condition
   }
  ($next,$seq) = @$next->[$byte] if $next;
 }
return $dst;

There is now a pair of data structures to represent everything.
It is valid for output sequence at a particular point to
be defined but zero length, that just means "don't know yet".
For the single byte case there is no 'next' so the new tables will be the same as
the original tables. For a multi-byte case a prefix byte will flip to the tables
for the next page (adding nothing to the output), then the tables for that page
will provide the actual output and set the tables back to the original base page.

This scheme can also handle shift encodings.

A slight enhancement to the scheme also allows for look-ahead - if
we add a flag to re-add the removed byte to the source we could handle
  a" -> ä
  ab -> a (and take b back please)

*/

#include <EXTERN.h>
#include <perl.h>
#define U8 U8
#include "encode.h"

/*
 * do_encode - push source bytes through an encoding page table.
 *
 *  enc    - first entry of the page to start in.  Entries of a page are
 *           scanned in order until one with max >= the current byte is
 *           found, so each page must end with an entry that covers the
 *           highest byte value that can occur.
 *  src    - source bytes.
 *  slen   - in:  number of bytes available at src.
 *           out: number of source bytes making up whole characters that
 *                were consumed (anything after that was not accepted).
 *  dst    - destination buffer, or NULL to only count the output bytes
 *           (no NOSPACE check is made in that mode).
 *  dlen   - capacity of dst in bytes (ignored when dst is NULL).
 *  dout   - out: number of output bytes produced (or counted).
 *  approx - when zero, entries whose slen has the 0x80 bit set are treated
 *           as unrepresentable; when non-zero they are used, and finishing
 *           a character through such an entry sets the result to
 *           ENCODE_FALLBACK.
 *
 * Returns 0, or the last of ENCODE_FALLBACK / ENCODE_NOSPACE /
 * ENCODE_PARTIAL / ENCODE_NOREP that was raised.  ENCODE_FALLBACK does not
 * stop the loop; the other three do.
 *
 * Output is tracked as a byte count rather than a moving pointer so that
 * no pointer arithmetic is ever done on a NULL dst, and the remaining
 * source length is compared as an integer so that no pointer beyond
 * one-past-the-end of src is ever formed.
 */
int
do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx)
{
 const U8 *s    = src;
 const U8 *send = s+*slen;
 const U8 *last = s;       /* end of the last completely consumed character */
 STRLEN used    = 0;       /* output bytes written (or counted) so far */
 int code       = 0;
 while (s < send)
  {
   encpage_t *e = enc;
   U8 byte = *s;
   STRLEN need;
   STRLEN n;
   /* Locate the entry whose [min,max] range could hold this byte */
   while (byte > e->max)
    e++;
   if (byte < e->min || !e->slen || (!approx && (e->slen & 0x80)))
    {
     /* Cannot represent */
     code = ENCODE_NOREP;
     break;
    }
   /* Low 7 bits: source bytes, counting this one, left in the character */
   need = e->slen & 0x7f;
   if (need > (STRLEN)(send - s))
    {
     /* partial source character */
     code = ENCODE_PARTIAL;
     break;
    }
   if ((n = e->dlen))
    {
     if (dst)
      {
       /* seq holds one n-byte output sequence per byte value in the range */
       const U8 *out = e->seq+n*(byte - e->min);
       U8 *d = dst+used;
       STRLEN i;
       if (n > dlen - used)
        {
         /* Out of space */
         code = ENCODE_NOSPACE;
         break;
        }
       for (i = 0; i < n; i++)
        d[i] = out[i];
      }
     used += n;
    }
   enc = e->next;
   s++;
   if (need == 1)
    {
     /* That byte completed a source character */
     if (approx && (e->slen & 0x80))
      code = ENCODE_FALLBACK;
     last = s;
    }
  }
 *slen = last - src;
 *dout = used;
 return code;
}
Commit	Line	Data
017e2add	1	/*
	2	Data structures for encoding transformations.
	3
	4	Perl works internally in either a native 'byte' encoding or
	5	in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
	6	representation. When we do we can use utf8_to_uv().
	7
	8	Most character encodings are either simple byte mappings or
	9	variable length multi-byte encodings. UTF-8 can be viewed as a
	10	rather extreme case of the latter.
	11
	12	So to solve an important part of perl's encode needs we need to solve the
	13	"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
	14	case. (Where one of multi-bytes will usually be UTF-8.)
	15
	16	The other type of encoding is a shift encoding where a prefix sequence
	17	determines what subsequent bytes mean. Such encodings have state.
	18
	19	We also need to handle case where a character in one encoding has to be
	20	represented as multiple characters in the other. e.g. letter+diacritic.
	21
	22	The process can be considered as pseudo perl:
	23
	24	my $dst = '';
	25	while (length($src))
	26	{
	27	my $size = $count($src);
	28	my $in_seq = substr($src,0,$size,'');
	29	my $out_seq = $s2d_hash{$in_seq};
	30	if (defined $out_seq)
	31	{
	32	$dst .= $out_seq;
	33	}
	34	else
	35	{
	36	# an error condition
	37	}
	38	}
	39	return $dst;
	40
	41	That has the following components:
	42	&src_count - a "rule" for how many bytes make up the next character in the
	43	source.
	44	%s2d_hash - a mapping from input sequences to output sequences
	45
	46	The problem with that scheme is that it does not allow the output
	47	character repertoire to affect the characters considered from the
	48	input.
	49
	50	So we use a "trie" representation which can also be considered
	51	a state machine:
	52
	53	my $dst = '';
	54	my $seq = \@s2d_seq;
	55	my $next = \@s2d_next;
	56	while (length($src))
	57	{
	58	my $byte = $substr($src,0,1,'');
	59	my $out_seq = $seq->[$byte];
	60	if (defined $out_seq)
	61	{
	62	$dst .= $out_seq;
	63	}
	64	else
65	{
66	# an error condition
67	}
68	($next,$seq) = @$next->[$byte] if $next;
69	}
70	return $dst;
71
72	There is now a pair of data structures to represent everything.
73	It is valid for output sequence at a particular point to
74	be defined but zero length, that just means "don't know yet".
75	For the single byte case there is no 'next' so new tables will be the same as
76	the original tables. For a multi-byte case a prefix byte will flip to the tables
77	for the next page (adding nothing to the output), then the tables for the page
78	will provide the actual output and set tables back to original base page.
79
80	This scheme can also handle shift encodings.
81
82	A slight enhancement to the scheme also allows for look-ahead - if
83	we add a flag to re-add the removed byte to the source we could handle
84	a" -> ä
85	ab -> a (and take b back please)
86
87	*/
88
89	#include <EXTERN.h>
90	#include <perl.h>
91	#define U8 U8
92	#include "encode.h"
93
2f2b4ff2	94	int
9b37254d	95	do_encode(encpage_t enc, const U8 src, STRLEN slen, U8 dst, STRLEN dlen, STRLEN *dout, int approx)
017e2add	96	{
2f2b4ff2	97	const U8 *s = src;
	98	const U8 send = s+slen;
	99	const U8 *last = s;
	100	U8 *d = dst;
	101	U8 *dend = d+dlen;
	102	int code = 0;
	103	while (s < send)
017e2add	104	{
017e2add	105	encpage_t *e = enc;
2f2b4ff2	106	U8 byte = *s;
017e2add	107	while (byte > e->max)
017e2add	108	e++;
c8991b40	109	if (byte >= e->min && e->slen && (approx \|\| !(e->slen & 0x80)))
017e2add	110	{
9b37254d	111	const U8 *cend = s + (e->slen & 0x7f);
2f2b4ff2	112	if (cend <= send)
017e2add	113	{
2f2b4ff2	114	STRLEN n;
2f2b4ff2	115	if ((n = e->dlen))
017e2add	116	{
2f2b4ff2	117	const U8 out = e->seq+n(byte - e->min);
2f2b4ff2	118	U8 *oend = d+n;
017e2add	119	if (dst)
2f2b4ff2	120	{
	121	if (oend <= dend)
	122	{
	123	while (d < oend)
	124	d++ = out++;
	125	}
	126	else
	127	{
	128	/* Out of space */
	129	code = ENCODE_NOSPACE;
	130	break;
	131	}
	132	}
	133	else
	134	d = oend;
017e2add	135	}
2f2b4ff2	136	enc = e->next;
	137	s++;
	138	if (s == cend)
9b37254d	139	{
	140	if (approx && (e->slen & 0x80))
	141	code = ENCODE_FALLBACK;
	142	last = s;
	143	}
2f2b4ff2	144	}
	145	else
	146	{
	147	/* partial source character */
	148	code = ENCODE_PARTIAL;
	149	break;
017e2add	150	}
017e2add	151	}
	152	else
	153	{
	154	/* Cannot represent */
2f2b4ff2	155	code = ENCODE_NOREP;
2f2b4ff2	156	break;
017e2add	157	}
017e2add	158	}
2f2b4ff2	159	*slen = last - src;
	160	*dout = d - dst;
	161	return code;
017e2add	162	}
	163
	164