Commit | Line | Data |
017e2add |
1 | /* |
2 | Data structures for encoding transformations. |
3 | |
4 | Perl works internally in either a native 'byte' encoding or |
5 | in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" |
6 | representation. When we do we can use utf8_to_uv(). |
7 | |
8 | Most character encodings are either simple byte mappings or |
9 | variable length multi-byte encodings. UTF-8 can be viewed as a |
10 | rather extreme case of the latter. |
11 | |
12 | So to solve an important part of perl's encode needs we need to solve the |
13 | "multi-byte -> multi-byte" case. The simple byte forms are then just degenerate |
14 | case. (Where one of multi-bytes will usually be UTF-8.) |
15 | |
16 | The other type of encoding is a shift encoding where a prefix sequence |
17 | determines what subsequent bytes mean. Such encodings have state. |
18 | |
19 | We also need to handle case where a character in one encoding has to be |
20 | represented as multiple characters in the other. e.g. letter+diacritic. |
21 | |
22 | The process can be considered as pseudo perl: |
23 | |
24 | my $dst = ''; |
25 | while (length($src)) |
26 | { |
27 | my $size = $count($src); |
28 | my $in_seq = substr($src,0,$size,''); |
29 | my $out_seq = $s2d_hash{$in_seq}; |
30 | if (defined $out_seq) |
31 | { |
32 | $dst .= $out_seq; |
33 | } |
34 | else |
35 | { |
36 | # an error condition |
37 | } |
38 | } |
39 | return $dst; |
40 | |
41 | That has the following components: |
42 | &src_count - a "rule" for how many bytes make up the next character in the |
43 | source. |
44 | %s2d_hash - a mapping from input sequences to output sequences |
45 | |
46 | The problem with that scheme is that it does not allow the output |
47 | character repertoire to affect the characters considered from the |
48 | input. |
49 | |
50 | So we use a "trie" representation which can also be considered |
51 | a state machine: |
52 | |
53 | my $dst = ''; |
54 | my $seq = \@s2d_seq; |
55 | my $next = \@s2d_next; |
56 | while (length($src)) |
57 | { |
58 | my $byte = $substr($src,0,1,''); |
59 | my $out_seq = $seq->[$byte]; |
60 | if (defined $out_seq) |
61 | { |
62 | $dst .= $out_seq; |
63 | } |
64 | else |
65 | { |
66 | # an error condition |
67 | } |
68 | ($next,$seq) = @$next->[$byte] if $next; |
69 | } |
70 | return $dst; |
71 | |
72 | There is now a pair of data structures to represent everything. |
73 | It is valid for output sequence at a particular point to |
74 | be defined but zero length, that just means "don't know yet". |
75 | For the single byte case there is no 'next' so new tables will be the same as |
76 | the original tables. For a multi-byte case a prefix byte will flip to the tables |
77 | for the next page (adding nothing to the output), then the tables for the page |
78 | will provide the actual output and set tables back to original base page. |
79 | |
80 | This scheme can also handle shift encodings. |
81 | |
82 | A slight enhancement to the scheme also allows for look-ahead - if |
83 | we add a flag to re-add the removed byte to the source we could handle |
84 | a" -> รค |
85 | ab -> a (and take b back please) |
86 | |
87 | */ |
88 | |
89 | #include <EXTERN.h> |
90 | #include <perl.h> |
91 | #define U8 U8 |
92 | #include "encode.h" |
93 | |
2f2b4ff2 |
94 | int |
dd88d393 |
95 | do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, |
96 | STRLEN dlen, STRLEN * dout, int approx) |
017e2add |
97 | { |
dd88d393 |
98 | const U8 *s = src; |
99 | const U8 *send = s + *slen; |
100 | const U8 *last = s; |
101 | U8 *d = dst; |
102 | U8 *dend = d + dlen; |
103 | int code = 0; |
104 | while (s < send) { |
105 | encpage_t *e = enc; |
106 | U8 byte = *s; |
107 | while (byte > e->max) |
108 | e++; |
109 | if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { |
110 | const U8 *cend = s + (e->slen & 0x7f); |
111 | if (cend <= send) { |
112 | STRLEN n; |
113 | if ((n = e->dlen)) { |
114 | const U8 *out = e->seq + n * (byte - e->min); |
115 | U8 *oend = d + n; |
116 | if (dst) { |
117 | if (oend <= dend) { |
118 | while (d < oend) |
119 | *d++ = *out++; |
120 | } |
121 | else { |
122 | /* Out of space */ |
123 | code = ENCODE_NOSPACE; |
124 | break; |
125 | } |
126 | } |
127 | else |
128 | d = oend; |
129 | } |
130 | enc = e->next; |
131 | s++; |
132 | if (s == cend) { |
133 | if (approx && (e->slen & 0x80)) |
134 | code = ENCODE_FALLBACK; |
135 | last = s; |
136 | } |
137 | } |
138 | else { |
139 | /* partial source character */ |
140 | code = ENCODE_PARTIAL; |
141 | break; |
142 | } |
143 | } |
144 | else { |
145 | /* Cannot represent */ |
146 | code = ENCODE_NOREP; |
147 | break; |
148 | } |
017e2add |
149 | } |
dd88d393 |
150 | *slen = last - src; |
151 | *dout = d - dst; |
152 | return code; |
017e2add |
153 | } |