Commit | Line | Data |
017e2add |
1 | /* |
2 | Data structures for encoding transformations. |
3 | |
4 | Perl works internally in either a native 'byte' encoding or |
5 | in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" |
6 | representation. When we do we can use utf8_to_uv(). |
7 | |
8 | Most character encodings are either simple byte mappings or |
9 | variable length multi-byte encodings. UTF-8 can be viewed as a |
10 | rather extreme case of the latter. |
11 | |
12 | So to solve an important part of perl's encode needs we need to solve the |
13 | "multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate |
14 | case. (One of the multi-byte encodings will usually be UTF-8.) |
15 | |
16 | The other type of encoding is a shift encoding where a prefix sequence |
17 | determines what subsequent bytes mean. Such encodings have state. |
18 | |
19 | We also need to handle the case where a character in one encoding has to be |
20 | represented as multiple characters in the other. e.g. letter+diacritic. |
21 | |
22 | The process can be considered as pseudo perl: |
23 | |
24 | my $dst = ''; |
25 | while (length($src)) |
26 | { |
27 | my $size = $count($src); |
28 | my $in_seq = substr($src,0,$size,''); |
29 | my $out_seq = $s2d_hash{$in_seq}; |
30 | if (defined $out_seq) |
31 | { |
32 | $dst .= $out_seq; |
33 | } |
34 | else |
35 | { |
36 | # an error condition |
37 | } |
38 | } |
39 | return $dst; |
40 | |
41 | That has the following components: |
42 | $count - a "rule" for how many bytes make up the next character in the |
43 | source. |
44 | %s2d_hash - a mapping from input sequences to output sequences |
45 | |
46 | The problem with that scheme is that it does not allow the output |
47 | character repertoire to affect the characters considered from the |
48 | input. |
49 | |
50 | So we use a "trie" representation which can also be considered |
51 | a state machine: |
52 | |
53 | my $dst = ''; |
54 | my $seq = \@s2d_seq; |
55 | my $next = \@s2d_next; |
56 | while (length($src)) |
57 | { |
58 | my $byte = substr($src,0,1,''); |
59 | my $out_seq = $seq->[$byte]; |
60 | if (defined $out_seq) |
61 | { |
62 | $dst .= $out_seq; |
63 | } |
64 | else |
65 | { |
66 | # an error condition |
67 | } |
68 | ($next,$seq) = @$next->[$byte] if $next; |
69 | } |
70 | return $dst; |
71 | |
72 | There is now a pair of data structures to represent everything. |
73 | It is valid for the output sequence at a particular point to |
74 | be defined but zero length, that just means "don't know yet". |
75 | For the single byte case there is no 'next' so new tables will be the same as |
76 | the original tables. For a multi-byte case a prefix byte will flip to the tables |
77 | for the next page (adding nothing to the output), then the tables for the page |
78 | will provide the actual output and set tables back to original base page. |
79 | |
80 | This scheme can also handle shift encodings. |
81 | |
82 | A slight enhancement to the scheme also allows for look-ahead - if |
83 | we add a flag to re-add the removed byte to the source we could handle |
84 | a" -> ä |
85 | ab -> a (and take b back please) |
86 | |
87 | */ |
88 | |
89 | #include <EXTERN.h> |
90 | #include <perl.h> |
91 | #define U8 U8 |
92 | #include "encode.h" |
93 | |
2f2b4ff2 |
94 | int |
9b37254d |
95 | do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx) |
017e2add |
96 | { |
2f2b4ff2 |
97 | const U8 *s = src; |
98 | const U8 *send = s+*slen; |
99 | const U8 *last = s; |
100 | U8 *d = dst; |
101 | U8 *dend = d+dlen; |
102 | int code = 0; |
103 | while (s < send) |
017e2add |
104 | { |
105 | encpage_t *e = enc; |
2f2b4ff2 |
106 | U8 byte = *s; |
017e2add |
107 | while (byte > e->max) |
108 | e++; |
c8991b40 |
109 | if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) |
017e2add |
110 | { |
9b37254d |
111 | const U8 *cend = s + (e->slen & 0x7f); |
2f2b4ff2 |
112 | if (cend <= send) |
017e2add |
113 | { |
2f2b4ff2 |
114 | STRLEN n; |
115 | if ((n = e->dlen)) |
017e2add |
116 | { |
2f2b4ff2 |
117 | const U8 *out = e->seq+n*(byte - e->min); |
118 | U8 *oend = d+n; |
017e2add |
119 | if (dst) |
2f2b4ff2 |
120 | { |
121 | if (oend <= dend) |
122 | { |
123 | while (d < oend) |
124 | *d++ = *out++; |
125 | } |
126 | else |
127 | { |
128 | /* Out of space */ |
129 | code = ENCODE_NOSPACE; |
130 | break; |
131 | } |
132 | } |
133 | else |
134 | d = oend; |
017e2add |
135 | } |
2f2b4ff2 |
136 | enc = e->next; |
137 | s++; |
138 | if (s == cend) |
9b37254d |
139 | { |
140 | if (approx && (e->slen & 0x80)) |
141 | code = ENCODE_FALLBACK; |
142 | last = s; |
143 | } |
2f2b4ff2 |
144 | } |
145 | else |
146 | { |
147 | /* partial source character */ |
148 | code = ENCODE_PARTIAL; |
149 | break; |
017e2add |
150 | } |
017e2add |
151 | } |
152 | else |
153 | { |
154 | /* Cannot represent */ |
2f2b4ff2 |
155 | code = ENCODE_NOREP; |
156 | break; |
017e2add |
157 | } |
158 | } |
2f2b4ff2 |
159 | *slen = last - src; |
160 | *dout = d - dst; |
161 | return code; |
017e2add |
162 | } |
163 | |
164 | |