Commit | Line | Data |
ac5ea531 |
1 | |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
5 | |
6 | /* These 5 files are prepared by mkheader */ |
7 | #include "unfcmb.h" |
8 | #include "unfcan.h" |
9 | #include "unfcpt.h" |
10 | #include "unfcmp.h" |
11 | #include "unfexc.h" |
12 | |
13 | /* Perl 5.6.1 ? */ |
14 | #ifndef uvuni_to_utf8 |
15 | #define uvuni_to_utf8 uv_to_utf8 |
6c941e0c |
16 | #endif /* uvuni_to_utf8 */ |
ac5ea531 |
17 | |
18 | /* Perl 5.6.1 ? */ |
ab8fe378 |
19 | #ifndef utf8n_to_uvuni |
20 | #define utf8n_to_uvuni utf8_to_uv |
6c941e0c |
21 | #endif /* utf8n_to_uvuni */ |
ac5ea531 |
22 | |
e524f5b2 |
23 | /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ |
24 | #ifdef UTF8_ALLOW_BOM |
25 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF) |
628bbff0 |
26 | #else |
e524f5b2 |
27 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) |
28 | #endif |
29 | |
30 | /* if utf8n_to_uvuni() sets retlen to 0 (?) */ |
fe067ad9 |
31 | #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" |
82e740b6 |
32 | |
33 | /* utf8_hop() hops back before start. Maybe broken UTF-8 */ |
34 | #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" |
35 | |
fe067ad9 |
36 | /* This should never happen: there is no case in UTF-8 or UTF-EBCDIC |
37 | where the result is longer than the source, and according to "Versioning |
38 | and Stability" in UAX#15 no new composition should be added in future. */ |
39 | #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source" |
40 | |
41 | /* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */ |
42 | #define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough" |
43 | |
ac5ea531 |
44 | /* Code points above 0x10ffff are left unaffected, without any complaint. */ |
45 | #define VALID_UTF_MAX (0x10ffff) |
46 | #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) |
47 | |
fe067ad9 |
48 | /* size of array for combining characters */ |
49 | /* enough as an initial value? */ |
50 | #define CC_SEQ_SIZE (10) |
51 | #define CC_SEQ_STEP (5) |
52 | |
53 | /* HANGUL begin */ |
ac5ea531 |
/* Precomposed Hangul syllables: Hangul_SCount consecutive code points,
   Hangul_SBase .. Hangul_SFinal inclusive. */
54 | #define Hangul_SBase 0xAC00 |
55 | #define Hangul_SFinal 0xD7A3 |
56 | #define Hangul_SCount 11172 |
57 | |
/* Number of syllables per leading consonant: Hangul_VCount * Hangul_TCount. */
58 | #define Hangul_NCount 588 |
59 | |
/* Range of the first component (L) of a syllable. */
60 | #define Hangul_LBase 0x1100 |
61 | #define Hangul_LFinal 0x1112 |
62 | #define Hangul_LCount 19 |
63 | |
/* Range of the second component (V) of a syllable. */
64 | #define Hangul_VBase 0x1161 |
65 | #define Hangul_VFinal 0x1175 |
66 | #define Hangul_VCount 21 |
67 | |
/* Range of the optional third component (T).  Index 0, i.e. Hangul_TBase
   itself, means "no third component": pv_cat_decompHangul() emits nothing
   for it, so the first real T is Hangul_TBase + 1. */
68 | #define Hangul_TBase 0x11A7 |
69 | #define Hangul_TFinal 0x11C2 |
70 | #define Hangul_TCount 28 |
71 | |
/* Hangul_IsS: u is a precomposed syllable. */
72 | #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) |
2a204b45 |
/* Hangul_IsN: offset from Hangul_SBase is a multiple of Hangul_TCount.
   Only meaningful together with Hangul_IsS(), as in Hangul_IsLV(). */
73 | #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) |
ac5ea531 |
/* Hangul_IsLV: a syllable whose T index is 0, so a T may still be added. */
74 | #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) |
75 | #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) |
76 | #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) |
/* Strict '<' on purpose: Hangul_TBase itself is not a T (see above). */
77 | #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) |
fe067ad9 |
78 | /* HANGUL end */ |
ac5ea531 |
79 | |
80 | /* this is used for canonical ordering of combining characters (c.c.). */ |
/* One buffered non-starter, collected by pv_utf8_reorder() until the current
   run of combining characters ends and can be sorted. */
81 | typedef struct { |
82 | U8 cc; /* combining class: primary sort key in compare_cc() */ |
83 | UV uv; /* codepoint, re-encoded once the run has been sorted */ |
84 | STRLEN pos; /* position within the run: tie-breaker in compare_cc() */ |
85 | } UNF_cc; |
86 | |
fe067ad9 |
87 | static int compare_cc(const void *a, const void *b) |
ac5ea531 |
88 | { |
89 | int ret_cc; |
6c941e0c |
90 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
8f118dcd |
91 | if (ret_cc) |
92 | return ret_cc; |
6c941e0c |
93 | |
94 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) |
95 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); |
ac5ea531 |
96 | } |
97 | |
fe067ad9 |
98 | static U8* dec_canonical(UV uv) |
ac5ea531 |
99 | { |
100 | U8 ***plane, **row; |
8f118dcd |
101 | if (OVER_UTF_MAX(uv)) |
102 | return NULL; |
ac5ea531 |
103 | plane = (U8***)UNF_canon[uv >> 16]; |
8f118dcd |
104 | if (! plane) |
105 | return NULL; |
ac5ea531 |
106 | row = plane[(uv >> 8) & 0xff]; |
107 | return row ? row[uv & 0xff] : NULL; |
108 | } |
109 | |
fe067ad9 |
110 | static U8* dec_compat(UV uv) |
ac5ea531 |
111 | { |
112 | U8 ***plane, **row; |
8f118dcd |
113 | if (OVER_UTF_MAX(uv)) |
114 | return NULL; |
ac5ea531 |
115 | plane = (U8***)UNF_compat[uv >> 16]; |
8f118dcd |
116 | if (! plane) |
117 | return NULL; |
ac5ea531 |
118 | row = plane[(uv >> 8) & 0xff]; |
119 | return row ? row[uv & 0xff] : NULL; |
120 | } |
121 | |
fe067ad9 |
122 | static UV composite_uv(UV uv, UV uv2) |
ac5ea531 |
123 | { |
124 | UNF_complist ***plane, **row, *cell, *i; |
125 | |
fe067ad9 |
126 | if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
8f118dcd |
127 | return 0; |
ac5ea531 |
128 | |
8f118dcd |
129 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
fe067ad9 |
130 | UV lindex = uv - Hangul_LBase; |
131 | UV vindex = uv2 - Hangul_VBase; |
132 | return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * |
133 | Hangul_TCount); |
ac5ea531 |
134 | } |
8f118dcd |
135 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
fe067ad9 |
136 | UV tindex = uv2 - Hangul_TBase; |
137 | return(uv + tindex); |
ac5ea531 |
138 | } |
139 | plane = UNF_compos[uv >> 16]; |
8f118dcd |
140 | if (! plane) |
141 | return 0; |
ac5ea531 |
142 | row = plane[(uv >> 8) & 0xff]; |
8f118dcd |
143 | if (! row) |
144 | return 0; |
ac5ea531 |
145 | cell = row[uv & 0xff]; |
8f118dcd |
146 | if (! cell) |
147 | return 0; |
148 | for (i = cell; i->nextchar; i++) { |
149 | if (uv2 == i->nextchar) |
150 | return i->composite; |
ac5ea531 |
151 | } |
152 | return 0; |
153 | } |
154 | |
fe067ad9 |
155 | static U8 getCombinClass(UV uv) |
ac5ea531 |
156 | { |
157 | U8 **plane, *row; |
8f118dcd |
158 | if (OVER_UTF_MAX(uv)) |
159 | return 0; |
ac5ea531 |
160 | plane = (U8**)UNF_combin[uv >> 16]; |
8f118dcd |
161 | if (! plane) |
162 | return 0; |
ac5ea531 |
163 | row = plane[(uv >> 8) & 0xff]; |
164 | return row ? row[uv & 0xff] : 0; |
165 | } |
166 | |
fe067ad9 |
167 | static U8* pv_cat_decompHangul(U8* d, UV uv) |
ac5ea531 |
168 | { |
fe067ad9 |
169 | UV sindex = uv - Hangul_SBase; |
170 | UV lindex = sindex / Hangul_NCount; |
171 | UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
172 | UV tindex = sindex % Hangul_TCount; |
ac5ea531 |
173 | |
8f118dcd |
174 | if (! Hangul_IsS(uv)) |
fe067ad9 |
175 | return d; |
ac5ea531 |
176 | |
fe067ad9 |
177 | d = uvuni_to_utf8(d, (lindex + Hangul_LBase)); |
178 | d = uvuni_to_utf8(d, (vindex + Hangul_VBase)); |
8f118dcd |
179 | if (tindex) |
fe067ad9 |
180 | d = uvuni_to_utf8(d, (tindex + Hangul_TBase)); |
181 | return d; |
ac5ea531 |
182 | } |
183 | |
39f4556f |
184 | static char* sv_2pvunicode(SV *sv, STRLEN *lp) |
a092bcfd |
185 | { |
186 | char *s; |
187 | STRLEN len; |
39f4556f |
188 | s = SvPV(sv,len); |
a092bcfd |
189 | if (!SvUTF8(sv)) { |
39f4556f |
190 | SV* tmpsv = sv_2mortal(newSVpvn(s, len)); |
a092bcfd |
191 | if (!SvPOK(tmpsv)) |
39f4556f |
192 | s = SvPV_force(tmpsv,len); |
a092bcfd |
193 | sv_utf8_upgrade(tmpsv); |
39f4556f |
194 | s = SvPV(tmpsv,len); |
a092bcfd |
195 | } |
fe067ad9 |
196 | if (lp) |
197 | *lp = len; |
a092bcfd |
198 | return s; |
199 | } |
200 | |
fe067ad9 |
201 | static |
202 | U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) |
203 | { |
204 | U8* p = s; |
205 | U8* e = s + slen; |
206 | U8* dstart = *dp; |
207 | U8* d = dstart; |
208 | |
209 | while (p < e) { |
210 | STRLEN retlen; |
211 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
212 | if (!retlen) |
fe067ad9 |
213 | croak(ErrRetlenIsZero, "decompose"); |
214 | p += retlen; |
215 | |
216 | if (Hangul_IsS(uv)) { |
217 | STRLEN cur = d - dstart; |
82e740b6 |
218 | |
fe067ad9 |
219 | if (dlen < cur + UTF8_MAXLEN * 3) { |
220 | dlen += UTF8_MAXLEN * 3; |
221 | Renew(dstart, dlen+1, U8); |
222 | d = dstart + cur; |
223 | } |
224 | d = pv_cat_decompHangul(d, uv); |
225 | } |
ac5ea531 |
226 | else { |
fe067ad9 |
227 | U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
228 | |
229 | if (r) { |
230 | STRLEN len = (STRLEN)strlen((char *)r); |
231 | STRLEN cur = d - dstart; |
232 | if (dlen < cur + len) { |
233 | dlen += len; |
234 | Renew(dstart, dlen+1, U8); |
235 | d = dstart + cur; |
236 | } |
237 | while (len--) |
238 | *d++ = *r++; |
239 | } |
240 | else { |
241 | STRLEN cur = d - dstart; |
242 | |
243 | if (dlen < cur + UTF8_MAXLEN) { |
244 | dlen += UTF8_MAXLEN; |
245 | Renew(dstart, dlen+1, U8); |
246 | d = dstart + cur; |
247 | } |
248 | d = uvuni_to_utf8(d, uv); |
249 | } |
ac5ea531 |
250 | } |
251 | } |
fe067ad9 |
252 | *dp = dstart; |
253 | return d; |
254 | } |
ac5ea531 |
255 | |
fe067ad9 |
256 | static |
257 | U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) |
258 | { |
259 | U8* p = s; |
260 | U8* e = s + slen; |
261 | U8* dend = d + dlen; |
262 | |
263 | UNF_cc seq_ary[CC_SEQ_SIZE]; |
264 | UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ |
265 | UNF_cc* seq_ext = NULL; /* extend if need */ |
266 | STRLEN seq_max = CC_SEQ_SIZE; |
267 | STRLEN cc_pos = 0; |
268 | |
269 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) |
270 | croak(ErrTargetNotEnough, "reorder"); |
271 | dend -= UTF8_MAXLEN; /* safety */ |
272 | |
273 | while (p < e) { |
274 | U8 curCC; |
275 | STRLEN retlen; |
276 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
277 | if (!retlen) |
278 | croak(ErrRetlenIsZero, "reorder"); |
279 | p += retlen; |
ac5ea531 |
280 | |
fe067ad9 |
281 | curCC = getCombinClass(uv); |
ac5ea531 |
282 | |
fe067ad9 |
283 | if (curCC != 0) { |
284 | if (seq_max < cc_pos + 1) { /* extend if need */ |
285 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
286 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
287 | STRLEN i; |
288 | New(0, seq_ext, seq_max, UNF_cc); |
289 | for (i = 0; i < cc_pos; i++) |
290 | seq_ext[i] = seq_ary[i]; |
291 | } |
292 | else { |
293 | Renew(seq_ext, seq_max, UNF_cc); |
294 | } |
39f4556f |
295 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 |
296 | } |
a092bcfd |
297 | |
fe067ad9 |
298 | seq_ptr[cc_pos].cc = curCC; |
299 | seq_ptr[cc_pos].uv = uv; |
300 | seq_ptr[cc_pos].pos = cc_pos; |
301 | ++cc_pos; |
ac5ea531 |
302 | |
fe067ad9 |
303 | if (p < e) |
304 | continue; |
305 | } |
ac5ea531 |
306 | |
fe067ad9 |
307 | if (cc_pos) { |
308 | STRLEN i; |
309 | |
310 | if (cc_pos > 1) /* reordered if there are two c.c.'s */ |
311 | qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); |
312 | |
313 | for (i = 0; i < cc_pos; i++) { |
314 | d = uvuni_to_utf8(d, seq_ptr[i].uv); |
315 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
316 | croak(ErrLongerThanSrc, "reorder"); |
317 | } |
318 | cc_pos = 0; |
319 | } |
2a204b45 |
320 | |
e524f5b2 |
321 | if (curCC == 0) { |
322 | d = uvuni_to_utf8(d, uv); |
fe067ad9 |
323 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
324 | croak(ErrLongerThanSrc, "reorder"); |
e524f5b2 |
325 | } |
fe067ad9 |
326 | } |
327 | if (seq_ext) |
328 | Safefree(seq_ext); |
329 | return d; |
330 | } |
ac5ea531 |
331 | |
fe067ad9 |
332 | static |
333 | U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) |
334 | { |
335 | U8* p = s; |
336 | U8* e = s + slen; |
337 | U8* dend = d + dlen; |
338 | |
39f4556f |
339 | UV uvS = 0; /* code point of the starter */ |
fe067ad9 |
340 | bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ |
341 | U8 preCC = 0; |
342 | |
343 | UV seq_ary[CC_SEQ_SIZE]; |
344 | UV* seq_ptr = seq_ary; /* use array at the beginning */ |
345 | UV* seq_ext = NULL; /* extend if need */ |
346 | STRLEN seq_max = CC_SEQ_SIZE; |
347 | STRLEN cc_pos = 0; |
348 | |
349 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) |
350 | croak(ErrTargetNotEnough, "compose"); |
351 | dend -= UTF8_MAXLEN; /* safety */ |
352 | |
353 | while (p < e) { |
354 | U8 curCC; |
355 | STRLEN retlen; |
356 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
357 | if (!retlen) |
358 | croak(ErrRetlenIsZero, "compose"); |
359 | p += retlen; |
ac5ea531 |
360 | |
fe067ad9 |
361 | curCC = getCombinClass(uv); |
82e740b6 |
362 | |
fe067ad9 |
363 | if (!valid_uvS) { |
e524f5b2 |
364 | if (curCC == 0) { |
fe067ad9 |
365 | uvS = uv; /* the first Starter is found */ |
366 | valid_uvS = TRUE; |
367 | if (p < e) |
368 | continue; |
e524f5b2 |
369 | } |
fe067ad9 |
370 | else { |
371 | d = uvuni_to_utf8(d, uv); |
372 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
373 | croak(ErrLongerThanSrc, "compose"); |
374 | continue; |
375 | } |
376 | } |
377 | else { |
378 | bool composed; |
379 | |
380 | /* blocked */ |
381 | if (iscontig && cc_pos || /* discontiguous combination */ |
382 | curCC != 0 && preCC == curCC || /* blocked by same CC */ |
383 | preCC > curCC) /* blocked by higher CC: revised D2 */ |
384 | composed = FALSE; |
385 | |
386 | /* not blocked: |
387 | iscontig && cc_pos == 0 -- contiguous combination |
388 | curCC == 0 && preCC == 0 -- starter + starter |
389 | curCC != 0 && preCC < curCC -- lower CC */ |
390 | else { |
391 | /* try composition */ |
392 | UV uvComp = composite_uv(uvS, uv); |
393 | |
394 | if (uvComp && !isExclusion(uvComp)) { |
395 | uvS = uvComp; |
396 | composed = TRUE; |
82e740b6 |
397 | |
fe067ad9 |
398 | /* preCC should not be changed to curCC */ |
399 | /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ |
400 | if (p < e) |
401 | continue; |
402 | } |
403 | else |
404 | composed = FALSE; |
405 | } |
406 | |
407 | if (!composed) { |
408 | preCC = curCC; |
409 | if (curCC != 0 || !(p < e)) { |
410 | if (seq_max < cc_pos + 1) { /* extend if need */ |
411 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
412 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
413 | New(0, seq_ext, seq_max, UV); |
414 | Copy(seq_ary, seq_ext, cc_pos, UV); |
415 | } |
416 | else { |
417 | Renew(seq_ext, seq_max, UV); |
418 | } |
419 | seq_ptr = seq_ext; /* till now use seq_ext */ |
420 | } |
421 | seq_ptr[cc_pos] = uv; |
422 | ++cc_pos; |
423 | } |
424 | if (curCC != 0 && p < e) |
425 | continue; |
ac5ea531 |
426 | } |
ac5ea531 |
427 | } |
428 | |
fe067ad9 |
429 | d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ |
430 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
431 | croak(ErrLongerThanSrc, "compose"); |
432 | |
e524f5b2 |
433 | if (cc_pos) { |
fe067ad9 |
434 | STRLEN i; |
ac5ea531 |
435 | |
fe067ad9 |
436 | for (i = 0; i < cc_pos; i++) { |
437 | d = uvuni_to_utf8(d, seq_ptr[i]); |
438 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
439 | croak(ErrLongerThanSrc, "compose"); |
440 | } |
441 | cc_pos = 0; |
ac5ea531 |
442 | } |
fe067ad9 |
443 | |
444 | uvS = uv; |
ac5ea531 |
445 | } |
fe067ad9 |
446 | if (seq_ext) |
447 | Safefree(seq_ext); |
448 | return d; |
449 | } |
450 | |
451 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
452 | |
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    U8 *src_pv, *buf, *buf_end;
    STRLEN src_len, buf_len;
  CODE:
    /* Return a new UTF-8 string holding the decomposition of src (no
       reordering).  A true compat selects compatibility mappings,
       otherwise canonical mappings are used. */
    src_pv = (U8*)sv_2pvunicode(src, &src_len);
    RETVAL = newSVpvn("", 0);

    /* pv_utf8_decompose() grows buf as needed and updates the pointer */
    buf_len = src_len;
    New(0, buf, buf_len + 1, U8);
    buf_end = pv_utf8_decompose(src_pv, src_len, &buf, buf_len,
                                (bool)SvTRUE(compat));

    sv_setpvn(RETVAL, (char *)buf, buf_end - buf);
    SvUTF8_on(RETVAL);
    Safefree(buf);
  OUTPUT:
    RETVAL
474 | |
fe067ad9 |
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    U8 *src_pv, *out, *out_end;
    STRLEN src_len, out_cap;
  CODE:
    /* Return a new UTF-8 string: src with its combining characters put
       into canonical order by pv_utf8_reorder(). */
    src_pv = (U8*)sv_2pvunicode(src, &src_len);

    /* write straight into the result's own buffer */
    RETVAL = newSVpvn("", 0);
    out_cap = src_len + UTF8_MAXLEN;
    out = (U8*)SvGROW(RETVAL, out_cap + 1);
    SvUTF8_on(RETVAL);

    out_end = pv_utf8_reorder(src_pv, src_len, out, out_cap);
    *out_end = '\0';
    SvCUR_set(RETVAL, out_end - out);
  OUTPUT:
    RETVAL
ac5ea531 |
495 | |
2a204b45 |
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    U8 *src_pv, *out, *out_end;
    STRLEN src_len, out_cap;
  CODE:
    /* Return a new UTF-8 string: src composed by pv_utf8_compose().
       Called as composeContiguous (ix == 1), only contiguous
       composition is performed. */
    src_pv = (U8*)sv_2pvunicode(src, &src_len);

    /* write straight into the result's own buffer */
    RETVAL = newSVpvn("", 0);
    out_cap = src_len + UTF8_MAXLEN;
    out = (U8*)SvGROW(RETVAL, out_cap + 1);
    SvUTF8_on(RETVAL);

    out_end = pv_utf8_compose(src_pv, src_len, out, out_cap, (bool)ix);
    *out_end = '\0';
    SvCUR_set(RETVAL, out_end - out);
  OUTPUT:
    RETVAL
ac5ea531 |
518 | |
fe067ad9 |
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    U8 *src_pv, *dec, *dec_end, *out, *out_end;
    STRLEN src_len, dec_len, out_cap;
  CODE:
    /* Decompose, then reorder.  Called as NFKD (ix == 1), compatibility
       mappings are used instead of canonical ones. */

    /* step 1: decompose into a temporary buffer */
    src_pv = (U8*)sv_2pvunicode(src, &src_len);
    dec_len = src_len;
    New(0, dec, dec_len + 1, U8);
    dec_end = pv_utf8_decompose(src_pv, src_len, &dec, dec_len, (bool)ix);
    *dec_end = '\0';
    dec_len = dec_end - dec;    /* bytes used; the capacity is not tracked */

    /* step 2: reorder straight into the result's own buffer */
    RETVAL = newSVpvn("", 0);
    out_cap = dec_len + UTF8_MAXLEN;
    out = (U8*)SvGROW(RETVAL, out_cap + 1);
    SvUTF8_on(RETVAL);
    out_end = pv_utf8_reorder(dec, dec_len, out, out_cap);
    *out_end = '\0';
    SvCUR_set(RETVAL, out_end - out);

    Safefree(dec);
  OUTPUT:
    RETVAL
82e740b6 |
552 | |
fe067ad9 |
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC = 2
  PREINIT:
    U8 *src_pv, *dec, *dec_end, *ord, *ord_end, *out, *out_end;
    STRLEN src_len, dec_len, ord_len, out_cap;
  CODE:
    /* Decompose, reorder, then compose.
         NFC  (ix == 0): canonical mappings, normal composition
         NFKC (ix == 1): compatibility mappings, normal composition
         FCC  (ix == 2): canonical mappings, contiguous composition */

    /* step 1: decompose into a temporary buffer */
    src_pv = (U8*)sv_2pvunicode(src, &src_len);
    dec_len = src_len;
    New(0, dec, dec_len + 1, U8);
    dec_end = pv_utf8_decompose(src_pv, src_len, &dec, dec_len,
                                (bool)(ix==1));
    *dec_end = '\0';
    dec_len = dec_end - dec;    /* bytes used; the capacity is not tracked */

    /* step 2: reorder into a second temporary buffer */
    ord_len = dec_len + UTF8_MAXLEN;
    New(0, ord, ord_len + 1, U8);
    ord_end = pv_utf8_reorder(dec, dec_len, ord, ord_len);
    *ord_end = '\0';
    ord_len = ord_end - ord;

    /* step 3: compose straight into the result's own buffer */
    RETVAL = newSVpvn("", 0);
    out_cap = ord_len + UTF8_MAXLEN;
    out = (U8*)SvGROW(RETVAL, out_cap + 1);
    SvUTF8_on(RETVAL);
    out_end = pv_utf8_compose(ord, ord_len, out, out_cap, (bool)(ix==2));
    *out_end = '\0';
    SvCUR_set(RETVAL, out_end - out);

    Safefree(dec);
    Safefree(ord);
  OUTPUT:
    RETVAL
ac5ea531 |
595 | |
8f118dcd |
void
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN total, step;
    U8 *cur, *end;
    U8 thisCC, lastCC;
  CODE:
    /* Quick check: returns YES if src is already in NFD (NFKD when called
       as checkNFKD, ix == 1), NO otherwise. */
    cur = (U8*)sv_2pvunicode(src, &total);
    end = cur + total;
    lastCC = 0;

    while (cur < end) {
        UV uv = utf8n_to_uvuni(cur, end - cur, &step, AllowAnyUTF);
        if (!step)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        thisCC = getCombinClass(uv);
        if (thisCC != 0 && thisCC < lastCC) /* canonical ordering violated */
            XSRETURN_NO;

        /* anything that still decomposes is not in decomposed form */
        if (Hangul_IsS(uv))
            XSRETURN_NO;
        if ((ix ? dec_compat(uv) : dec_canonical(uv)) != NULL)
            XSRETURN_NO;

        lastCC = thisCC;
        cur += step;
    }
    XSRETURN_YES;
623 | |
624 | |
625 | |
void
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN total, step;
    U8 *cur, *end;
    U8 thisCC, lastCC;
    bool maybe;
  CODE:
    /* Quick check for NFC (NFKC when called as checkNFKC, ix == 1):
       returns YES, NO, or undef for MAYBE. */
    cur = (U8*)sv_2pvunicode(src, &total);
    end = cur + total;
    lastCC = 0;
    maybe = FALSE;

    while (cur < end) {
        UV uv = utf8n_to_uvuni(cur, end - cur, &step, AllowAnyUTF);
        if (!step)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        thisCC = getCombinClass(uv);
        if (thisCC != 0 && thisCC < lastCC) /* canonical ordering violated */
            XSRETURN_NO;

        /* Hangul syllables are canonical composites: always YES, so only
           the other characters need their NFC/NFKC property checked */
        if (!Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                XSRETURN_NO;

            if (isComp2nd(uv)) {
                maybe = TRUE;
            }
            else if (ix) {
                /* NFKC_NO when having a compatibility mapping that
                   differs from the canonical one */
                char *canon  = (char *) dec_canonical(uv);
                char *compat = (char *) dec_compat(uv);
                if (compat && (!canon || strNE(canon, compat)))
                    XSRETURN_NO;
            }
        }

        lastCC = thisCC;
        cur += step;
    }

    if (maybe)
        XSRETURN_UNDEF;
    XSRETURN_YES;
673 | |
674 | |
675 | |
82e740b6 |
void
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN total, step;
    U8 *cur, *end;
    U8 leadCC, trailCC;
    bool maybe;
  CODE:
    /* Quick check for FCD (FCC when called as checkFCC, ix == 1): returns
       YES, NO, or undef for MAYBE (FCC only).  For each character the
       combining class of the first character of its canonical
       decomposition is compared with that of the last character of the
       previous character's decomposition. */
    cur = (U8*)sv_2pvunicode(src, &total);
    end = cur + total;
    trailCC = 0;
    maybe = FALSE;

    while (cur < end) {
        U8 *canon;
        UV first;
        STRLEN canon_len = 0;
        UV uv = utf8n_to_uvuni(cur, end - cur, &step, AllowAnyUTF);
        if (!step)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        canon = (U8*) dec_canonical(uv);

        /* leading character: of the decomposition if any, else uv itself */
        if (canon == NULL) {
            first = uv;
        }
        else {
            STRLEN got;
            canon_len = (STRLEN)strlen((char *) canon);
            first = utf8n_to_uvuni(canon, canon_len, &got, AllowAnyUTF);
            if (!got)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }

        leadCC = getCombinClass(first);

        if (leadCC != 0 && leadCC < trailCC) /* canonical ordering violated */
            XSRETURN_NO;

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                XSRETURN_NO;
            if (isComp2nd(uv))
                maybe = TRUE;
        }

        /* trailing character: of the decomposition if any, else uv itself */
        if (canon == NULL) {
            trailCC = leadCC;
        }
        else {
            STRLEN got;
            UV last;
            U8* canon_end = canon + canon_len;
            U8* last_pv = utf8_hop(canon_end, -1);
            if (last_pv < canon)
                croak(ErrHopBeforeStart);
            last = utf8n_to_uvuni(last_pv, canon_end - last_pv, &got,
                                  AllowAnyUTF);
            if (!got)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            trailCC = getCombinClass(last);
        }

        cur += step;
    }

    if (maybe)
        XSRETURN_UNDEF;
    XSRETURN_YES;
744 | |
745 | |
746 | |
ac5ea531 |
    # The XSUBs below have no CODE: section; per perlxs, xsubpp then
    # generates a direct call to the C function of the same name.
    #
    # getCombinClass: calls the static C getCombinClass() defined earlier in
    # this file and returns the combining class of the code point uv.
747 | U8 |
748 | getCombinClass(uv) |
749 | UV uv |
8f118dcd |
750 | PROTOTYPE: $ |
ac5ea531 |
751 | |
    # isExclusion is not defined in this file; presumably it comes from one
    # of the mkheader-generated headers included at the top -- TODO confirm.
752 | bool |
2a204b45 |
753 | isExclusion(uv) |
ac5ea531 |
754 | UV uv |
8f118dcd |
755 | PROTOTYPE: $ |
756 | |
    # isSingleton: same remark as for isExclusion.
757 | bool |
758 | isSingleton(uv) |
759 | UV uv |
760 | PROTOTYPE: $ |
761 | |
    # isNonStDecomp: same remark as for isExclusion.
762 | bool |
763 | isNonStDecomp(uv) |
764 | UV uv |
765 | PROTOTYPE: $ |
766 | |
    # isComp2nd: same remark as for isExclusion.  The aliases are declared
    # without a CODE: section, so the alias index is not examined here.
767 | bool |
768 | isComp2nd(uv) |
769 | UV uv |
770 | PROTOTYPE: $ |
771 | ALIAS: |
772 | isNFC_MAYBE = 1 |
773 | isNFKC_MAYBE = 2 |
774 | |
775 | |
776 | |
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /* YES if uv decomposes: it is a Hangul syllable or has a canonical
       mapping (a compatibility mapping when called as isNFKD_NO). */
    if (Hangul_IsS(uv))
        XSRETURN_YES;       /* NFD_NO or NFKD_NO */
    if ((ix ? dec_compat(uv) : dec_canonical(uv)) != NULL)
        XSRETURN_YES;       /* NFD_NO or NFKD_NO */
    XSRETURN_NO;
788 | |
789 | |
790 | |
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  CODE:
    /* YES if uv is an exclusion, a singleton or a non-starter
       decomposition.  Called as isNFKC_NO (ix == 1), also YES if uv has a
       compatibility mapping that differs from its canonical mapping. */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
        XSRETURN_YES;       /* NFC_NO or NFKC_NO */

    if (ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);
        if (compat && (!canon || strNE(canon, compat)))
            XSRETURN_YES;   /* NFKC_NO */
    }
    XSRETURN_NO;
812 | |
813 | |
ac5ea531 |
814 | |
2a204b45 |
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV combined;
  CODE:
    /* Return the code point that uv followed by uv2 composes to, or undef
       when composite_uv() reports no composite (0). */
    combined = composite_uv(uv, uv2);
    if (combined)
        RETVAL = newSVuv(combined);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 |
827 | |
8f118dcd |
828 | |
829 | |
ac5ea531 |
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Return the decomposition of uv as a UTF-8 string, or undef if uv has
       none.  getCanon uses the canonical mapping, getCompat (ix == 1) the
       compatibility mapping; Hangul syllables are decomposed
       arithmetically in both cases. */
    if (!Hangul_IsS(uv)) {
        U8* mapping = ix ? dec_compat(uv) : dec_canonical(uv);
        if (mapping == NULL)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)mapping, strlen((char *)mapping));
    }
    else {
        /* room for up to three encoded characters */
        U8 jamo[3 * UTF8_MAXLEN + 1];
        U8 *jamo_end = pv_cat_decompHangul(jamo, uv);
        RETVAL = newSVpvn((char *)jamo, jamo_end - jamo);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
851 | |
82e740b6 |
852 | |
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *part;
    STRLEN total;
    U8 *start, *end, *cut;
  PPCODE:
    /* Split src in front of its last starter (a character of combining
       class 0) and push two new mortal UTF-8 strings: everything before
       that starter, and the starter with everything after it.  If src
       holds no starter, the first string is empty and the second is the
       whole of src. */
    start = (U8*)sv_2pvunicode(src, &total);
    end = start + total;

    /* walk backwards one character at a time until a starter is met */
    for (cut = end; start < cut; ) {
        UV uv;
        cut = utf8_hop(cut, -1);
        if (cut < start)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(cut, end - cut, NULL, AllowAnyUTF);
        if (getCombinClass(uv) == 0)    /* last starter found */
            break;
    }

    part = sv_2mortal(newSVpvn((char*)start, cut - start));
    SvUTF8_on(part);
    XPUSHs(part);

    part = sv_2mortal(newSVpvn((char*)cut, end - cut));
    SvUTF8_on(part);
    XPUSHs(part);
881 | |