Commit | Line | Data |
ac5ea531 |
1 | |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
5 | |
6 | /* These 5 files are prepared by mkheader */ |
7 | #include "unfcmb.h" |
8 | #include "unfcan.h" |
9 | #include "unfcpt.h" |
10 | #include "unfcmp.h" |
11 | #include "unfexc.h" |
12 | |
13 | /* Perl 5.6.1 ? */ |
14 | #ifndef uvuni_to_utf8 |
15 | #define uvuni_to_utf8 uv_to_utf8 |
6c941e0c |
16 | #endif /* uvuni_to_utf8 */ |
ac5ea531 |
17 | |
18 | /* Perl 5.6.1 ? */ |
ab8fe378 |
19 | #ifndef utf8n_to_uvuni |
20 | #define utf8n_to_uvuni utf8_to_uv |
6c941e0c |
21 | #endif /* utf8n_to_uvuni */ |
ac5ea531 |
22 | |
e524f5b2 |
23 | /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ |
24 | #ifdef UTF8_ALLOW_BOM |
25 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF) |
628bbff0 |
26 | #else |
e524f5b2 |
27 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) |
28 | #endif |
29 | |
30 | /* if utf8n_to_uvuni() sets retlen to 0 (?) */ |
fe067ad9 |
31 | #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" |
82e740b6 |
32 | |
33 | /* utf8_hop() hops back before start. Maybe broken UTF-8 */ |
34 | #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" |
35 | |
51683ce6 |
36 | /* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC. |
37 | If Unicode would add a new composition of A + B to C |
38 | where bytes::length(A) + bytes::length(B) < bytes::length(C), |
39 | this code should be fixed. |
40 | In this case, mkheader will prevent Unicode::Normalize from building. */ |
fe067ad9 |
41 | #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source" |
42 | |
43 | /* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */ |
44 | #define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough" |
45 | |
ac5ea531 |
46 | /* At present, char > 0x10ffff are unaffected without complaint, right? */ |
47 | #define VALID_UTF_MAX (0x10ffff) |
48 | #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) |
49 | |
fe067ad9 |
50 | /* size of array for combining characters */ |
51 | /* enough as an initial value? */ |
52 | #define CC_SEQ_SIZE (10) |
53 | #define CC_SEQ_STEP (5) |
54 | |
55 | /* HANGUL begin */ |
ac5ea531 |
56 | #define Hangul_SBase 0xAC00 |
57 | #define Hangul_SFinal 0xD7A3 |
58 | #define Hangul_SCount 11172 |
59 | |
60 | #define Hangul_NCount 588 |
61 | |
62 | #define Hangul_LBase 0x1100 |
63 | #define Hangul_LFinal 0x1112 |
64 | #define Hangul_LCount 19 |
65 | |
66 | #define Hangul_VBase 0x1161 |
67 | #define Hangul_VFinal 0x1175 |
68 | #define Hangul_VCount 21 |
69 | |
70 | #define Hangul_TBase 0x11A7 |
71 | #define Hangul_TFinal 0x11C2 |
72 | #define Hangul_TCount 28 |
73 | |
74 | #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) |
2a204b45 |
75 | #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) |
ac5ea531 |
76 | #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) |
77 | #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) |
78 | #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) |
79 | #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) |
fe067ad9 |
80 | /* HANGUL end */ |
ac5ea531 |
81 | |
82 | /* this is used for canonical ordering of combining characters (c.c.). */ |
83 | typedef struct { |
84 | U8 cc; /* combining class */ |
85 | UV uv; /* codepoint */ |
86 | STRLEN pos; /* position */ |
87 | } UNF_cc; |
88 | |
fe067ad9 |
89 | static int compare_cc(const void *a, const void *b) |
ac5ea531 |
90 | { |
91 | int ret_cc; |
6c941e0c |
92 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
8f118dcd |
93 | if (ret_cc) |
94 | return ret_cc; |
6c941e0c |
95 | |
96 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) |
97 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); |
ac5ea531 |
98 | } |
99 | |
fe067ad9 |
100 | static U8* dec_canonical(UV uv) |
ac5ea531 |
101 | { |
102 | U8 ***plane, **row; |
8f118dcd |
103 | if (OVER_UTF_MAX(uv)) |
104 | return NULL; |
ac5ea531 |
105 | plane = (U8***)UNF_canon[uv >> 16]; |
8f118dcd |
106 | if (! plane) |
107 | return NULL; |
ac5ea531 |
108 | row = plane[(uv >> 8) & 0xff]; |
109 | return row ? row[uv & 0xff] : NULL; |
110 | } |
111 | |
fe067ad9 |
112 | static U8* dec_compat(UV uv) |
ac5ea531 |
113 | { |
114 | U8 ***plane, **row; |
8f118dcd |
115 | if (OVER_UTF_MAX(uv)) |
116 | return NULL; |
ac5ea531 |
117 | plane = (U8***)UNF_compat[uv >> 16]; |
8f118dcd |
118 | if (! plane) |
119 | return NULL; |
ac5ea531 |
120 | row = plane[(uv >> 8) & 0xff]; |
121 | return row ? row[uv & 0xff] : NULL; |
122 | } |
123 | |
fe067ad9 |
124 | static UV composite_uv(UV uv, UV uv2) |
ac5ea531 |
125 | { |
126 | UNF_complist ***plane, **row, *cell, *i; |
127 | |
fe067ad9 |
128 | if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
8f118dcd |
129 | return 0; |
ac5ea531 |
130 | |
8f118dcd |
131 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
fe067ad9 |
132 | UV lindex = uv - Hangul_LBase; |
133 | UV vindex = uv2 - Hangul_VBase; |
134 | return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * |
135 | Hangul_TCount); |
ac5ea531 |
136 | } |
8f118dcd |
137 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
fe067ad9 |
138 | UV tindex = uv2 - Hangul_TBase; |
139 | return(uv + tindex); |
ac5ea531 |
140 | } |
141 | plane = UNF_compos[uv >> 16]; |
8f118dcd |
142 | if (! plane) |
143 | return 0; |
ac5ea531 |
144 | row = plane[(uv >> 8) & 0xff]; |
8f118dcd |
145 | if (! row) |
146 | return 0; |
ac5ea531 |
147 | cell = row[uv & 0xff]; |
8f118dcd |
148 | if (! cell) |
149 | return 0; |
150 | for (i = cell; i->nextchar; i++) { |
151 | if (uv2 == i->nextchar) |
152 | return i->composite; |
ac5ea531 |
153 | } |
154 | return 0; |
155 | } |
156 | |
fe067ad9 |
157 | static U8 getCombinClass(UV uv) |
ac5ea531 |
158 | { |
159 | U8 **plane, *row; |
8f118dcd |
160 | if (OVER_UTF_MAX(uv)) |
161 | return 0; |
ac5ea531 |
162 | plane = (U8**)UNF_combin[uv >> 16]; |
8f118dcd |
163 | if (! plane) |
164 | return 0; |
ac5ea531 |
165 | row = plane[(uv >> 8) & 0xff]; |
166 | return row ? row[uv & 0xff] : 0; |
167 | } |
168 | |
fe067ad9 |
169 | static U8* pv_cat_decompHangul(U8* d, UV uv) |
ac5ea531 |
170 | { |
fe067ad9 |
171 | UV sindex = uv - Hangul_SBase; |
172 | UV lindex = sindex / Hangul_NCount; |
173 | UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
174 | UV tindex = sindex % Hangul_TCount; |
ac5ea531 |
175 | |
8f118dcd |
176 | if (! Hangul_IsS(uv)) |
fe067ad9 |
177 | return d; |
ac5ea531 |
178 | |
fe067ad9 |
179 | d = uvuni_to_utf8(d, (lindex + Hangul_LBase)); |
180 | d = uvuni_to_utf8(d, (vindex + Hangul_VBase)); |
8f118dcd |
181 | if (tindex) |
fe067ad9 |
182 | d = uvuni_to_utf8(d, (tindex + Hangul_TBase)); |
183 | return d; |
ac5ea531 |
184 | } |
185 | |
39f4556f |
186 | static char* sv_2pvunicode(SV *sv, STRLEN *lp) |
a092bcfd |
187 | { |
188 | char *s; |
189 | STRLEN len; |
39f4556f |
190 | s = SvPV(sv,len); |
a092bcfd |
191 | if (!SvUTF8(sv)) { |
39f4556f |
192 | SV* tmpsv = sv_2mortal(newSVpvn(s, len)); |
a092bcfd |
193 | if (!SvPOK(tmpsv)) |
39f4556f |
194 | s = SvPV_force(tmpsv,len); |
a092bcfd |
195 | sv_utf8_upgrade(tmpsv); |
39f4556f |
196 | s = SvPV(tmpsv,len); |
a092bcfd |
197 | } |
fe067ad9 |
198 | if (lp) |
199 | *lp = len; |
a092bcfd |
200 | return s; |
201 | } |
202 | |
fe067ad9 |
203 | static |
204 | U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) |
205 | { |
206 | U8* p = s; |
207 | U8* e = s + slen; |
208 | U8* dstart = *dp; |
209 | U8* d = dstart; |
210 | |
211 | while (p < e) { |
212 | STRLEN retlen; |
213 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
214 | if (!retlen) |
fe067ad9 |
215 | croak(ErrRetlenIsZero, "decompose"); |
216 | p += retlen; |
217 | |
218 | if (Hangul_IsS(uv)) { |
219 | STRLEN cur = d - dstart; |
82e740b6 |
220 | |
fe067ad9 |
221 | if (dlen < cur + UTF8_MAXLEN * 3) { |
222 | dlen += UTF8_MAXLEN * 3; |
223 | Renew(dstart, dlen+1, U8); |
224 | d = dstart + cur; |
225 | } |
226 | d = pv_cat_decompHangul(d, uv); |
227 | } |
ac5ea531 |
228 | else { |
fe067ad9 |
229 | U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
230 | |
231 | if (r) { |
232 | STRLEN len = (STRLEN)strlen((char *)r); |
233 | STRLEN cur = d - dstart; |
234 | if (dlen < cur + len) { |
235 | dlen += len; |
236 | Renew(dstart, dlen+1, U8); |
237 | d = dstart + cur; |
238 | } |
239 | while (len--) |
240 | *d++ = *r++; |
241 | } |
242 | else { |
243 | STRLEN cur = d - dstart; |
244 | |
245 | if (dlen < cur + UTF8_MAXLEN) { |
246 | dlen += UTF8_MAXLEN; |
247 | Renew(dstart, dlen+1, U8); |
248 | d = dstart + cur; |
249 | } |
250 | d = uvuni_to_utf8(d, uv); |
251 | } |
ac5ea531 |
252 | } |
253 | } |
fe067ad9 |
254 | *dp = dstart; |
255 | return d; |
256 | } |
ac5ea531 |
257 | |
fe067ad9 |
258 | static |
259 | U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) |
260 | { |
261 | U8* p = s; |
262 | U8* e = s + slen; |
263 | U8* dend = d + dlen; |
264 | |
265 | UNF_cc seq_ary[CC_SEQ_SIZE]; |
266 | UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ |
267 | UNF_cc* seq_ext = NULL; /* extend if need */ |
268 | STRLEN seq_max = CC_SEQ_SIZE; |
269 | STRLEN cc_pos = 0; |
270 | |
271 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) |
272 | croak(ErrTargetNotEnough, "reorder"); |
273 | dend -= UTF8_MAXLEN; /* safety */ |
274 | |
275 | while (p < e) { |
276 | U8 curCC; |
277 | STRLEN retlen; |
278 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
279 | if (!retlen) |
280 | croak(ErrRetlenIsZero, "reorder"); |
281 | p += retlen; |
ac5ea531 |
282 | |
fe067ad9 |
283 | curCC = getCombinClass(uv); |
ac5ea531 |
284 | |
fe067ad9 |
285 | if (curCC != 0) { |
286 | if (seq_max < cc_pos + 1) { /* extend if need */ |
287 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
288 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
289 | STRLEN i; |
290 | New(0, seq_ext, seq_max, UNF_cc); |
291 | for (i = 0; i < cc_pos; i++) |
292 | seq_ext[i] = seq_ary[i]; |
293 | } |
294 | else { |
295 | Renew(seq_ext, seq_max, UNF_cc); |
296 | } |
39f4556f |
297 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 |
298 | } |
a092bcfd |
299 | |
fe067ad9 |
300 | seq_ptr[cc_pos].cc = curCC; |
301 | seq_ptr[cc_pos].uv = uv; |
302 | seq_ptr[cc_pos].pos = cc_pos; |
303 | ++cc_pos; |
ac5ea531 |
304 | |
fe067ad9 |
305 | if (p < e) |
306 | continue; |
307 | } |
ac5ea531 |
308 | |
fe067ad9 |
309 | if (cc_pos) { |
310 | STRLEN i; |
311 | |
312 | if (cc_pos > 1) /* reordered if there are two c.c.'s */ |
313 | qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); |
314 | |
315 | for (i = 0; i < cc_pos; i++) { |
316 | d = uvuni_to_utf8(d, seq_ptr[i].uv); |
317 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
318 | croak(ErrLongerThanSrc, "reorder"); |
319 | } |
320 | cc_pos = 0; |
321 | } |
2a204b45 |
322 | |
e524f5b2 |
323 | if (curCC == 0) { |
324 | d = uvuni_to_utf8(d, uv); |
fe067ad9 |
325 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
326 | croak(ErrLongerThanSrc, "reorder"); |
e524f5b2 |
327 | } |
fe067ad9 |
328 | } |
329 | if (seq_ext) |
330 | Safefree(seq_ext); |
331 | return d; |
332 | } |
ac5ea531 |
333 | |
fe067ad9 |
334 | static |
335 | U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) |
336 | { |
337 | U8* p = s; |
338 | U8* e = s + slen; |
339 | U8* dend = d + dlen; |
340 | |
39f4556f |
341 | UV uvS = 0; /* code point of the starter */ |
fe067ad9 |
342 | bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ |
343 | U8 preCC = 0; |
344 | |
345 | UV seq_ary[CC_SEQ_SIZE]; |
346 | UV* seq_ptr = seq_ary; /* use array at the beginning */ |
347 | UV* seq_ext = NULL; /* extend if need */ |
348 | STRLEN seq_max = CC_SEQ_SIZE; |
349 | STRLEN cc_pos = 0; |
350 | |
351 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) |
352 | croak(ErrTargetNotEnough, "compose"); |
353 | dend -= UTF8_MAXLEN; /* safety */ |
354 | |
355 | while (p < e) { |
356 | U8 curCC; |
357 | STRLEN retlen; |
358 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
359 | if (!retlen) |
360 | croak(ErrRetlenIsZero, "compose"); |
361 | p += retlen; |
ac5ea531 |
362 | |
fe067ad9 |
363 | curCC = getCombinClass(uv); |
82e740b6 |
364 | |
fe067ad9 |
365 | if (!valid_uvS) { |
e524f5b2 |
366 | if (curCC == 0) { |
fe067ad9 |
367 | uvS = uv; /* the first Starter is found */ |
368 | valid_uvS = TRUE; |
369 | if (p < e) |
370 | continue; |
e524f5b2 |
371 | } |
fe067ad9 |
372 | else { |
373 | d = uvuni_to_utf8(d, uv); |
374 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
375 | croak(ErrLongerThanSrc, "compose"); |
376 | continue; |
377 | } |
378 | } |
379 | else { |
380 | bool composed; |
381 | |
382 | /* blocked */ |
383 | if (iscontig && cc_pos || /* discontiguous combination */ |
384 | curCC != 0 && preCC == curCC || /* blocked by same CC */ |
385 | preCC > curCC) /* blocked by higher CC: revised D2 */ |
386 | composed = FALSE; |
387 | |
388 | /* not blocked: |
389 | iscontig && cc_pos == 0 -- contiguous combination |
390 | curCC == 0 && preCC == 0 -- starter + starter |
391 | curCC != 0 && preCC < curCC -- lower CC */ |
392 | else { |
393 | /* try composition */ |
394 | UV uvComp = composite_uv(uvS, uv); |
395 | |
396 | if (uvComp && !isExclusion(uvComp)) { |
397 | uvS = uvComp; |
398 | composed = TRUE; |
82e740b6 |
399 | |
fe067ad9 |
400 | /* preCC should not be changed to curCC */ |
401 | /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ |
402 | if (p < e) |
403 | continue; |
404 | } |
405 | else |
406 | composed = FALSE; |
407 | } |
408 | |
409 | if (!composed) { |
410 | preCC = curCC; |
411 | if (curCC != 0 || !(p < e)) { |
412 | if (seq_max < cc_pos + 1) { /* extend if need */ |
413 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
414 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
415 | New(0, seq_ext, seq_max, UV); |
416 | Copy(seq_ary, seq_ext, cc_pos, UV); |
417 | } |
418 | else { |
419 | Renew(seq_ext, seq_max, UV); |
420 | } |
2b8d773d |
421 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 |
422 | } |
423 | seq_ptr[cc_pos] = uv; |
424 | ++cc_pos; |
425 | } |
426 | if (curCC != 0 && p < e) |
427 | continue; |
ac5ea531 |
428 | } |
ac5ea531 |
429 | } |
430 | |
fe067ad9 |
431 | d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ |
432 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
433 | croak(ErrLongerThanSrc, "compose"); |
434 | |
e524f5b2 |
435 | if (cc_pos) { |
fe067ad9 |
436 | STRLEN i; |
ac5ea531 |
437 | |
fe067ad9 |
438 | for (i = 0; i < cc_pos; i++) { |
439 | d = uvuni_to_utf8(d, seq_ptr[i]); |
440 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
441 | croak(ErrLongerThanSrc, "compose"); |
442 | } |
443 | cc_pos = 0; |
ac5ea531 |
444 | } |
fe067ad9 |
445 | |
446 | uvS = uv; |
ac5ea531 |
447 | } |
fe067ad9 |
448 | if (seq_ext) |
449 | Safefree(seq_ext); |
450 | return d; |
451 | } |
452 | |
453 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
454 | |
455 | SV* |
456 | decompose(src, compat = &PL_sv_no) |
457 | SV * src |
458 | SV * compat |
459 | PROTOTYPE: $;$ |
460 | PREINIT: |
461 | SV* dst; |
462 | U8 *s, *d, *dend; |
463 | STRLEN slen, dlen; |
464 | CODE: |
465 | s = (U8*)sv_2pvunicode(src,&slen); |
466 | dst = newSVpvn("", 0); |
467 | dlen = slen; |
468 | New(0, d, dlen+1, U8); |
39f4556f |
469 | dend = pv_utf8_decompose(s, slen, &d, dlen, (bool)SvTRUE(compat)); |
9b374906 |
470 | sv_setpvn(dst, (char *)d, dend - d); |
fe067ad9 |
471 | SvUTF8_on(dst); |
472 | Safefree(d); |
8f118dcd |
473 | RETVAL = dst; |
ac5ea531 |
474 | OUTPUT: |
475 | RETVAL |
476 | |
fe067ad9 |
477 | SV* |
478 | reorder(src) |
479 | SV * src |
480 | PROTOTYPE: $ |
481 | PREINIT: |
482 | SV* dst; |
483 | U8 *s, *d, *dend; |
484 | STRLEN slen, dlen; |
485 | CODE: |
486 | s = (U8*)sv_2pvunicode(src,&slen); |
487 | dst = newSVpvn("", 0); |
488 | dlen = slen + UTF8_MAXLEN; |
489 | d = (U8*)SvGROW(dst,dlen+1); |
490 | SvUTF8_on(dst); |
491 | dend = pv_utf8_reorder(s, slen, d, dlen); |
492 | *dend = '\0'; |
493 | SvCUR_set(dst, dend - d); |
494 | RETVAL = dst; |
495 | OUTPUT: |
496 | RETVAL |
ac5ea531 |
497 | |
2a204b45 |
498 | SV* |
a092bcfd |
499 | compose(src) |
500 | SV * src |
ac5ea531 |
501 | PROTOTYPE: $ |
82e740b6 |
502 | ALIAS: |
503 | composeContiguous = 1 |
ac5ea531 |
504 | PREINIT: |
fe067ad9 |
505 | SV* dst; |
506 | U8 *s, *d, *dend; |
507 | STRLEN slen, dlen; |
2a204b45 |
508 | CODE: |
fe067ad9 |
509 | s = (U8*)sv_2pvunicode(src,&slen); |
510 | dst = newSVpvn("", 0); |
511 | dlen = slen + UTF8_MAXLEN; |
512 | d = (U8*)SvGROW(dst,dlen+1); |
ac5ea531 |
513 | SvUTF8_on(dst); |
fe067ad9 |
514 | dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix); |
515 | *dend = '\0'; |
516 | SvCUR_set(dst, dend - d); |
517 | RETVAL = dst; |
518 | OUTPUT: |
519 | RETVAL |
ac5ea531 |
520 | |
fe067ad9 |
521 | SV* |
522 | NFD(src) |
523 | SV * src |
524 | PROTOTYPE: $ |
525 | ALIAS: |
526 | NFKD = 1 |
527 | PREINIT: |
528 | SV *dst; |
529 | U8 *s, *t, *tend, *d, *dend; |
530 | STRLEN slen, tlen, dlen; |
531 | CODE: |
532 | /* decompose */ |
533 | s = (U8*)sv_2pvunicode(src,&slen); |
534 | tlen = slen; |
535 | New(0, t, tlen+1, U8); |
536 | tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix); |
537 | *tend = '\0'; |
538 | tlen = tend - t; /* no longer know real tlen */ |
539 | |
540 | /* reorder */ |
541 | dst = newSVpvn("", 0); |
542 | dlen = tlen + UTF8_MAXLEN; |
543 | d = (U8*)SvGROW(dst,dlen+1); |
544 | SvUTF8_on(dst); |
545 | dend = pv_utf8_reorder(t, tlen, d, dlen); |
546 | *dend = '\0'; |
547 | SvCUR_set(dst, dend - d); |
ac5ea531 |
548 | |
fe067ad9 |
549 | /* return */ |
550 | Safefree(t); |
551 | RETVAL = dst; |
552 | OUTPUT: |
553 | RETVAL |
82e740b6 |
554 | |
fe067ad9 |
555 | SV* |
556 | NFC(src) |
557 | SV * src |
558 | PROTOTYPE: $ |
559 | ALIAS: |
560 | NFKC = 1 |
561 | FCC = 2 |
562 | PREINIT: |
563 | SV *dst; |
564 | U8 *s, *t, *tend, *u, *uend, *d, *dend; |
565 | STRLEN slen, tlen, ulen, dlen; |
566 | CODE: |
567 | /* decompose */ |
568 | s = (U8*)sv_2pvunicode(src,&slen); |
569 | tlen = slen; |
570 | New(0, t, tlen+1, U8); |
571 | tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1)); |
572 | *tend = '\0'; |
573 | tlen = tend - t; /* no longer know real tlen */ |
574 | |
575 | /* reorder */ |
576 | ulen = tlen + UTF8_MAXLEN; |
577 | New(0, u, ulen+1, U8); |
578 | uend = pv_utf8_reorder(t, tlen, u, ulen); |
579 | *uend = '\0'; |
580 | ulen = uend - u; |
581 | |
582 | /* compose */ |
583 | dst = newSVpvn("", 0); |
584 | dlen = ulen + UTF8_MAXLEN; |
585 | d = (U8*)SvGROW(dst,dlen+1); |
586 | SvUTF8_on(dst); |
587 | dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2)); |
588 | *dend = '\0'; |
589 | SvCUR_set(dst, dend - d); |
ac5ea531 |
590 | |
fe067ad9 |
591 | /* return */ |
592 | Safefree(t); |
593 | Safefree(u); |
2a204b45 |
594 | RETVAL = dst; |
595 | OUTPUT: |
596 | RETVAL |
ac5ea531 |
597 | |
2b8d773d |
598 | SV* |
a092bcfd |
599 | checkNFD(src) |
600 | SV * src |
8f118dcd |
601 | PROTOTYPE: $ |
602 | ALIAS: |
603 | checkNFKD = 1 |
604 | PREINIT: |
8f118dcd |
605 | STRLEN srclen, retlen; |
606 | U8 *s, *e, *p, curCC, preCC; |
2b8d773d |
607 | bool result = TRUE; |
82e740b6 |
608 | CODE: |
a092bcfd |
609 | s = (U8*)sv_2pvunicode(src,&srclen); |
8f118dcd |
610 | e = s + srclen; |
611 | |
612 | preCC = 0; |
613 | for (p = s; p < e; p += retlen) { |
fe067ad9 |
614 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
615 | if (!retlen) |
fe067ad9 |
616 | croak(ErrRetlenIsZero, "checkNFD or -NFKD"); |
82e740b6 |
617 | |
8f118dcd |
618 | curCC = getCombinClass(uv); |
2b8d773d |
619 | if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
620 | result = FALSE; |
621 | break; |
622 | } |
623 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { |
624 | result = FALSE; |
625 | break; |
626 | } |
8f118dcd |
627 | preCC = curCC; |
628 | } |
2b8d773d |
629 | RETVAL = boolSV(result); |
630 | OUTPUT: |
631 | RETVAL |
8f118dcd |
632 | |
633 | |
2b8d773d |
634 | SV* |
a092bcfd |
635 | checkNFC(src) |
636 | SV * src |
8f118dcd |
637 | PROTOTYPE: $ |
638 | ALIAS: |
639 | checkNFKC = 1 |
640 | PREINIT: |
8f118dcd |
641 | STRLEN srclen, retlen; |
642 | U8 *s, *e, *p, curCC, preCC; |
2b8d773d |
643 | bool result = TRUE; |
644 | bool isMAYBE = FALSE; |
82e740b6 |
645 | CODE: |
a092bcfd |
646 | s = (U8*)sv_2pvunicode(src,&srclen); |
8f118dcd |
647 | e = s + srclen; |
648 | |
649 | preCC = 0; |
8f118dcd |
650 | for (p = s; p < e; p += retlen) { |
fe067ad9 |
651 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
652 | if (!retlen) |
fe067ad9 |
653 | croak(ErrRetlenIsZero, "checkNFC or -NFKC"); |
82e740b6 |
654 | |
8f118dcd |
655 | curCC = getCombinClass(uv); |
2b8d773d |
656 | if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
657 | result = FALSE; |
658 | break; |
659 | } |
8f118dcd |
660 | |
661 | /* get NFC/NFKC property */ |
662 | if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ |
663 | ; /* YES */ |
2b8d773d |
664 | else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
665 | result = FALSE; |
666 | break; |
667 | } |
8f118dcd |
668 | else if (isComp2nd(uv)) |
669 | isMAYBE = TRUE; |
670 | else if (ix) { |
671 | char *canon, *compat; |
6c941e0c |
672 | /* NFKC_NO when having compatibility mapping. */ |
8f118dcd |
673 | canon = (char *) dec_canonical(uv); |
674 | compat = (char *) dec_compat(uv); |
2b8d773d |
675 | if (compat && !(canon && strEQ(canon, compat))) { |
676 | result = FALSE; |
677 | break; |
678 | } |
8f118dcd |
679 | } /* end of get NFC/NFKC property */ |
680 | |
681 | preCC = curCC; |
682 | } |
2b8d773d |
683 | if (isMAYBE && result) /* NO precedes MAYBE */ |
8f118dcd |
684 | XSRETURN_UNDEF; |
2b8d773d |
685 | RETVAL = boolSV(result); |
686 | OUTPUT: |
687 | RETVAL |
8f118dcd |
688 | |
689 | |
2b8d773d |
690 | SV* |
a092bcfd |
691 | checkFCD(src) |
692 | SV * src |
82e740b6 |
693 | PROTOTYPE: $ |
694 | ALIAS: |
695 | checkFCC = 1 |
696 | PREINIT: |
fe067ad9 |
697 | STRLEN srclen, retlen; |
82e740b6 |
698 | U8 *s, *e, *p, curCC, preCC; |
2b8d773d |
699 | bool result = TRUE; |
700 | bool isMAYBE = FALSE; |
82e740b6 |
701 | CODE: |
a092bcfd |
702 | s = (U8*)sv_2pvunicode(src,&srclen); |
82e740b6 |
703 | e = s + srclen; |
82e740b6 |
704 | preCC = 0; |
82e740b6 |
705 | for (p = s; p < e; p += retlen) { |
fe067ad9 |
706 | U8 *sCan; |
707 | UV uvLead; |
39f4556f |
708 | STRLEN canlen = 0; |
fe067ad9 |
709 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
710 | if (!retlen) |
fe067ad9 |
711 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
82e740b6 |
712 | |
713 | sCan = (U8*) dec_canonical(uv); |
714 | |
715 | if (sCan) { |
39f4556f |
716 | STRLEN canret; |
82e740b6 |
717 | canlen = (STRLEN)strlen((char *) sCan); |
e524f5b2 |
718 | uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF); |
fe067ad9 |
719 | if (!canret) |
720 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
82e740b6 |
721 | } |
722 | else { |
723 | uvLead = uv; |
724 | } |
725 | |
726 | curCC = getCombinClass(uvLead); |
727 | |
2b8d773d |
728 | if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ |
729 | result = FALSE; |
730 | break; |
731 | } |
82e740b6 |
732 | |
733 | if (ix) { |
2b8d773d |
734 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
735 | result = FALSE; |
736 | break; |
737 | } |
82e740b6 |
738 | else if (isComp2nd(uv)) |
739 | isMAYBE = TRUE; |
740 | } |
741 | |
742 | if (sCan) { |
39f4556f |
743 | STRLEN canret; |
fe067ad9 |
744 | UV uvTrail; |
745 | U8* eCan = sCan + canlen; |
746 | U8* pCan = utf8_hop(eCan, -1); |
82e740b6 |
747 | if (pCan < sCan) |
748 | croak(ErrHopBeforeStart); |
e524f5b2 |
749 | uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF); |
fe067ad9 |
750 | if (!canret) |
751 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
82e740b6 |
752 | preCC = getCombinClass(uvTrail); |
753 | } |
754 | else { |
755 | preCC = curCC; |
756 | } |
757 | } |
2b8d773d |
758 | if (isMAYBE && result) /* NO precedes MAYBE */ |
82e740b6 |
759 | XSRETURN_UNDEF; |
2b8d773d |
760 | RETVAL = boolSV(result); |
761 | OUTPUT: |
762 | RETVAL |
82e740b6 |
763 | |
764 | |
ac5ea531 |
765 | U8 |
766 | getCombinClass(uv) |
767 | UV uv |
8f118dcd |
768 | PROTOTYPE: $ |
ac5ea531 |
769 | |
770 | bool |
2a204b45 |
771 | isExclusion(uv) |
ac5ea531 |
772 | UV uv |
8f118dcd |
773 | PROTOTYPE: $ |
774 | |
775 | bool |
776 | isSingleton(uv) |
777 | UV uv |
778 | PROTOTYPE: $ |
779 | |
780 | bool |
781 | isNonStDecomp(uv) |
782 | UV uv |
783 | PROTOTYPE: $ |
784 | |
785 | bool |
786 | isComp2nd(uv) |
787 | UV uv |
788 | PROTOTYPE: $ |
789 | ALIAS: |
790 | isNFC_MAYBE = 1 |
791 | isNFKC_MAYBE = 2 |
792 | |
793 | |
794 | |
2b8d773d |
795 | SV* |
8f118dcd |
796 | isNFD_NO(uv) |
797 | UV uv |
798 | PROTOTYPE: $ |
799 | ALIAS: |
800 | isNFKD_NO = 1 |
2b8d773d |
801 | PREINIT: |
802 | bool result = FALSE; |
82e740b6 |
803 | CODE: |
8f118dcd |
804 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) |
2b8d773d |
805 | result = TRUE; /* NFD_NO or NFKD_NO */ |
806 | RETVAL = boolSV(result); |
807 | OUTPUT: |
808 | RETVAL |
8f118dcd |
809 | |
810 | |
2b8d773d |
811 | SV* |
8f118dcd |
812 | isComp_Ex(uv) |
813 | UV uv |
814 | PROTOTYPE: $ |
815 | ALIAS: |
816 | isNFC_NO = 0 |
817 | isNFKC_NO = 1 |
2b8d773d |
818 | PREINIT: |
819 | bool result = FALSE; |
82e740b6 |
820 | CODE: |
8f118dcd |
821 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) |
2b8d773d |
822 | result = TRUE; /* NFC_NO or NFKC_NO */ |
8f118dcd |
823 | else if (ix) { |
824 | char *canon, *compat; |
825 | canon = (char *) dec_canonical(uv); |
826 | compat = (char *) dec_compat(uv); |
827 | if (compat && (!canon || strNE(canon, compat))) |
2b8d773d |
828 | result = TRUE; /* NFC_NO or NFKC_NO */ |
8f118dcd |
829 | } |
2b8d773d |
830 | RETVAL = boolSV(result); |
831 | OUTPUT: |
832 | RETVAL |
ac5ea531 |
833 | |
2a204b45 |
834 | SV* |
ac5ea531 |
835 | getComposite(uv, uv2) |
836 | UV uv |
837 | UV uv2 |
2a204b45 |
838 | PROTOTYPE: $$ |
839 | PREINIT: |
bcdb689b |
840 | UV composite; |
2a204b45 |
841 | CODE: |
bcdb689b |
842 | composite = composite_uv(uv, uv2); |
843 | RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; |
2a204b45 |
844 | OUTPUT: |
845 | RETVAL |
ac5ea531 |
846 | |
8f118dcd |
847 | |
848 | |
ac5ea531 |
849 | SV* |
850 | getCanon(uv) |
851 | UV uv |
852 | PROTOTYPE: $ |
853 | ALIAS: |
854 | getCompat = 1 |
ac5ea531 |
855 | CODE: |
8f118dcd |
856 | if (Hangul_IsS(uv)) { |
fe067ad9 |
857 | U8 tmp[3 * UTF8_MAXLEN + 1]; |
858 | U8 *t = tmp; |
859 | U8 *e = pv_cat_decompHangul(t, uv); |
860 | RETVAL = newSVpvn((char *)t, e - t); |
ac5ea531 |
861 | } else { |
fe067ad9 |
862 | U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); |
8f118dcd |
863 | if (!rstr) |
864 | XSRETURN_UNDEF; |
ac5ea531 |
865 | RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); |
866 | } |
867 | SvUTF8_on(RETVAL); |
868 | OUTPUT: |
869 | RETVAL |
870 | |
82e740b6 |
871 | |
872 | void |
a092bcfd |
873 | splitOnLastStarter(src) |
874 | SV * src |
82e740b6 |
875 | PREINIT: |
a092bcfd |
876 | SV *svp; |
fe067ad9 |
877 | STRLEN srclen; |
82e740b6 |
878 | U8 *s, *e, *p; |
879 | PPCODE: |
a092bcfd |
880 | s = (U8*)sv_2pvunicode(src,&srclen); |
82e740b6 |
881 | e = s + srclen; |
fe067ad9 |
882 | p = e; |
883 | while (s < p) { |
884 | UV uv; |
82e740b6 |
885 | p = utf8_hop(p, -1); |
886 | if (p < s) |
887 | croak(ErrHopBeforeStart); |
fe067ad9 |
888 | uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF); |
82e740b6 |
889 | if (getCombinClass(uv) == 0) /* Last Starter found */ |
890 | break; |
891 | } |
892 | |
893 | svp = sv_2mortal(newSVpvn((char*)s, p - s)); |
894 | SvUTF8_on(svp); |
895 | XPUSHs(svp); |
896 | |
897 | svp = sv_2mortal(newSVpvn((char*)p, e - p)); |
898 | SvUTF8_on(svp); |
899 | XPUSHs(svp); |
900 | |