Commit | Line | Data |
ac5ea531 |
1 | |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
5 | |
6 | /* These 5 files are prepared by mkheader */ |
7 | #include "unfcmb.h" |
8 | #include "unfcan.h" |
9 | #include "unfcpt.h" |
10 | #include "unfcmp.h" |
11 | #include "unfexc.h" |
12 | |
13 | /* Fallback for older perls (5.6.1?) that do not provide uvuni_to_utf8 */ |
14 | #ifndef uvuni_to_utf8 |
15 | #define uvuni_to_utf8 uv_to_utf8 |
6c941e0c |
16 | #endif /* uvuni_to_utf8 */ |
ac5ea531 |
17 | |
18 | /* Fallback for older perls (5.6.1?) that do not provide utf8n_to_uvuni */ |
ab8fe378 |
19 | #ifndef utf8n_to_uvuni |
20 | #define utf8n_to_uvuni utf8_to_uv |
6c941e0c |
21 | #endif /* utf8n_to_uvuni */ |
ac5ea531 |
22 | |
e524f5b2 |
23 | /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ |
24 | #ifdef UTF8_ALLOW_BOM |
25 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF) |
628bbff0 |
26 | #else |
e524f5b2 |
27 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) |
28 | #endif |
29 | |
30 | /* raised if utf8n_to_uvuni() sets retlen to 0 (can this really happen?) */ |
fe067ad9 |
31 | #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" |
82e740b6 |
32 | |
33 | /* utf8_hop() hops back before start. Maybe broken UTF-8 */ |
34 | #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" |
35 | |
fe067ad9 |
36 | /* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC; |
37 | according to Versioning and Stability in UAX#15, no new composition |
38 | should come in future. */ |
39 | #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source" |
40 | |
41 | /* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */ |
42 | #define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough" |
43 | |
ac5ea531 |
44 | /* At present, code points above 0x10ffff are passed through unchanged, without complaint. */ |
45 | #define VALID_UTF_MAX (0x10ffff) |
46 | #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) |
47 | |
fe067ad9 |
48 | /* size of array for combining characters */ |
49 | /* enough as an initial value? */ |
50 | #define CC_SEQ_SIZE (10) |
51 | #define CC_SEQ_STEP (5) |
52 | |
/* HANGUL begin */
/* Constants for the arithmetic (de)composition of Hangul syllables.
   S = precomposed syllable, L = leading consonant, V = vowel,
   T = trailing consonant.  *Base and *Final bound each code point range;
   *Count is the number of index values of that kind. */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172

/* number of syllables that share one leading consonant */
#define Hangul_NCount 588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* cp is a precomposed syllable */
#define Hangul_IsS(cp)  ((Hangul_SBase <= (cp)) && ((cp) <= Hangul_SFinal))
/* the offset of cp from SBase has trailing index 0; only meaningful
   together with Hangul_IsS(), see Hangul_IsLV() */
#define Hangul_IsN(cp)  (((cp) - Hangul_SBase) % Hangul_TCount == 0)
/* cp is a syllable without a trailing consonant */
#define Hangul_IsLV(cp) (Hangul_IsS(cp) && Hangul_IsN(cp))
#define Hangul_IsL(cp)  ((Hangul_LBase <= (cp)) && ((cp) <= Hangul_LFinal))
#define Hangul_IsV(cp)  ((Hangul_VBase <= (cp)) && ((cp) <= Hangul_VFinal))
/* strict lower bound: TBase itself corresponds to trailing index 0,
   i.e. "no trailing consonant", and is not a T */
#define Hangul_IsT(cp)  ((Hangul_TBase < (cp)) && ((cp) <= Hangul_TFinal))
/* HANGUL end */
ac5ea531 |
79 | |
80 | /* this is used for canonical ordering of combining characters (c.c.). */ |
81 | typedef struct { |
82 | U8 cc; /* combining class */ |
83 | UV uv; /* codepoint */ |
84 | STRLEN pos; /* position */ |
85 | } UNF_cc; |
86 | |
fe067ad9 |
87 | static int compare_cc(const void *a, const void *b) |
ac5ea531 |
88 | { |
89 | int ret_cc; |
6c941e0c |
90 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
8f118dcd |
91 | if (ret_cc) |
92 | return ret_cc; |
6c941e0c |
93 | |
94 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) |
95 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); |
ac5ea531 |
96 | } |
97 | |
fe067ad9 |
98 | static U8* dec_canonical(UV uv) |
ac5ea531 |
99 | { |
100 | U8 ***plane, **row; |
8f118dcd |
101 | if (OVER_UTF_MAX(uv)) |
102 | return NULL; |
ac5ea531 |
103 | plane = (U8***)UNF_canon[uv >> 16]; |
8f118dcd |
104 | if (! plane) |
105 | return NULL; |
ac5ea531 |
106 | row = plane[(uv >> 8) & 0xff]; |
107 | return row ? row[uv & 0xff] : NULL; |
108 | } |
109 | |
fe067ad9 |
110 | static U8* dec_compat(UV uv) |
ac5ea531 |
111 | { |
112 | U8 ***plane, **row; |
8f118dcd |
113 | if (OVER_UTF_MAX(uv)) |
114 | return NULL; |
ac5ea531 |
115 | plane = (U8***)UNF_compat[uv >> 16]; |
8f118dcd |
116 | if (! plane) |
117 | return NULL; |
ac5ea531 |
118 | row = plane[(uv >> 8) & 0xff]; |
119 | return row ? row[uv & 0xff] : NULL; |
120 | } |
121 | |
fe067ad9 |
122 | static UV composite_uv(UV uv, UV uv2) |
ac5ea531 |
123 | { |
124 | UNF_complist ***plane, **row, *cell, *i; |
125 | |
fe067ad9 |
126 | if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
8f118dcd |
127 | return 0; |
ac5ea531 |
128 | |
8f118dcd |
129 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
fe067ad9 |
130 | UV lindex = uv - Hangul_LBase; |
131 | UV vindex = uv2 - Hangul_VBase; |
132 | return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * |
133 | Hangul_TCount); |
ac5ea531 |
134 | } |
8f118dcd |
135 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
fe067ad9 |
136 | UV tindex = uv2 - Hangul_TBase; |
137 | return(uv + tindex); |
ac5ea531 |
138 | } |
139 | plane = UNF_compos[uv >> 16]; |
8f118dcd |
140 | if (! plane) |
141 | return 0; |
ac5ea531 |
142 | row = plane[(uv >> 8) & 0xff]; |
8f118dcd |
143 | if (! row) |
144 | return 0; |
ac5ea531 |
145 | cell = row[uv & 0xff]; |
8f118dcd |
146 | if (! cell) |
147 | return 0; |
148 | for (i = cell; i->nextchar; i++) { |
149 | if (uv2 == i->nextchar) |
150 | return i->composite; |
ac5ea531 |
151 | } |
152 | return 0; |
153 | } |
154 | |
fe067ad9 |
155 | static U8 getCombinClass(UV uv) |
ac5ea531 |
156 | { |
157 | U8 **plane, *row; |
8f118dcd |
158 | if (OVER_UTF_MAX(uv)) |
159 | return 0; |
ac5ea531 |
160 | plane = (U8**)UNF_combin[uv >> 16]; |
8f118dcd |
161 | if (! plane) |
162 | return 0; |
ac5ea531 |
163 | row = plane[(uv >> 8) & 0xff]; |
164 | return row ? row[uv & 0xff] : 0; |
165 | } |
166 | |
fe067ad9 |
167 | static U8* pv_cat_decompHangul(U8* d, UV uv) |
ac5ea531 |
168 | { |
fe067ad9 |
169 | UV sindex = uv - Hangul_SBase; |
170 | UV lindex = sindex / Hangul_NCount; |
171 | UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
172 | UV tindex = sindex % Hangul_TCount; |
ac5ea531 |
173 | |
8f118dcd |
174 | if (! Hangul_IsS(uv)) |
fe067ad9 |
175 | return d; |
ac5ea531 |
176 | |
fe067ad9 |
177 | d = uvuni_to_utf8(d, (lindex + Hangul_LBase)); |
178 | d = uvuni_to_utf8(d, (vindex + Hangul_VBase)); |
8f118dcd |
179 | if (tindex) |
fe067ad9 |
180 | d = uvuni_to_utf8(d, (tindex + Hangul_TBase)); |
181 | return d; |
ac5ea531 |
182 | } |
183 | |
39f4556f |
184 | static char* sv_2pvunicode(SV *sv, STRLEN *lp) |
a092bcfd |
185 | { |
186 | char *s; |
187 | STRLEN len; |
39f4556f |
188 | s = SvPV(sv,len); |
a092bcfd |
189 | if (!SvUTF8(sv)) { |
39f4556f |
190 | SV* tmpsv = sv_2mortal(newSVpvn(s, len)); |
a092bcfd |
191 | if (!SvPOK(tmpsv)) |
39f4556f |
192 | s = SvPV_force(tmpsv,len); |
a092bcfd |
193 | sv_utf8_upgrade(tmpsv); |
39f4556f |
194 | s = SvPV(tmpsv,len); |
a092bcfd |
195 | } |
fe067ad9 |
196 | if (lp) |
197 | *lp = len; |
a092bcfd |
198 | return s; |
199 | } |
200 | |
fe067ad9 |
201 | static |
202 | U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) |
203 | { |
204 | U8* p = s; |
205 | U8* e = s + slen; |
206 | U8* dstart = *dp; |
207 | U8* d = dstart; |
208 | |
209 | while (p < e) { |
210 | STRLEN retlen; |
211 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
212 | if (!retlen) |
fe067ad9 |
213 | croak(ErrRetlenIsZero, "decompose"); |
214 | p += retlen; |
215 | |
216 | if (Hangul_IsS(uv)) { |
217 | STRLEN cur = d - dstart; |
82e740b6 |
218 | |
fe067ad9 |
219 | if (dlen < cur + UTF8_MAXLEN * 3) { |
220 | dlen += UTF8_MAXLEN * 3; |
221 | Renew(dstart, dlen+1, U8); |
222 | d = dstart + cur; |
223 | } |
224 | d = pv_cat_decompHangul(d, uv); |
225 | } |
ac5ea531 |
226 | else { |
fe067ad9 |
227 | U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
228 | |
229 | if (r) { |
230 | STRLEN len = (STRLEN)strlen((char *)r); |
231 | STRLEN cur = d - dstart; |
232 | if (dlen < cur + len) { |
233 | dlen += len; |
234 | Renew(dstart, dlen+1, U8); |
235 | d = dstart + cur; |
236 | } |
237 | while (len--) |
238 | *d++ = *r++; |
239 | } |
240 | else { |
241 | STRLEN cur = d - dstart; |
242 | |
243 | if (dlen < cur + UTF8_MAXLEN) { |
244 | dlen += UTF8_MAXLEN; |
245 | Renew(dstart, dlen+1, U8); |
246 | d = dstart + cur; |
247 | } |
248 | d = uvuni_to_utf8(d, uv); |
249 | } |
ac5ea531 |
250 | } |
251 | } |
fe067ad9 |
252 | *dp = dstart; |
253 | return d; |
254 | } |
ac5ea531 |
255 | |
fe067ad9 |
256 | static |
257 | U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) |
258 | { |
259 | U8* p = s; |
260 | U8* e = s + slen; |
261 | U8* dend = d + dlen; |
262 | |
263 | UNF_cc seq_ary[CC_SEQ_SIZE]; |
264 | UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ |
265 | UNF_cc* seq_ext = NULL; /* extend if need */ |
266 | STRLEN seq_max = CC_SEQ_SIZE; |
267 | STRLEN cc_pos = 0; |
268 | |
269 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) |
270 | croak(ErrTargetNotEnough, "reorder"); |
271 | dend -= UTF8_MAXLEN; /* safety */ |
272 | |
273 | while (p < e) { |
274 | U8 curCC; |
275 | STRLEN retlen; |
276 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
277 | if (!retlen) |
278 | croak(ErrRetlenIsZero, "reorder"); |
279 | p += retlen; |
ac5ea531 |
280 | |
fe067ad9 |
281 | curCC = getCombinClass(uv); |
ac5ea531 |
282 | |
fe067ad9 |
283 | if (curCC != 0) { |
284 | if (seq_max < cc_pos + 1) { /* extend if need */ |
285 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
286 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
287 | STRLEN i; |
288 | New(0, seq_ext, seq_max, UNF_cc); |
289 | for (i = 0; i < cc_pos; i++) |
290 | seq_ext[i] = seq_ary[i]; |
291 | } |
292 | else { |
293 | Renew(seq_ext, seq_max, UNF_cc); |
294 | } |
39f4556f |
295 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 |
296 | } |
a092bcfd |
297 | |
fe067ad9 |
298 | seq_ptr[cc_pos].cc = curCC; |
299 | seq_ptr[cc_pos].uv = uv; |
300 | seq_ptr[cc_pos].pos = cc_pos; |
301 | ++cc_pos; |
ac5ea531 |
302 | |
fe067ad9 |
303 | if (p < e) |
304 | continue; |
305 | } |
ac5ea531 |
306 | |
fe067ad9 |
307 | if (cc_pos) { |
308 | STRLEN i; |
309 | |
310 | if (cc_pos > 1) /* reordered if there are two c.c.'s */ |
311 | qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); |
312 | |
313 | for (i = 0; i < cc_pos; i++) { |
314 | d = uvuni_to_utf8(d, seq_ptr[i].uv); |
315 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
316 | croak(ErrLongerThanSrc, "reorder"); |
317 | } |
318 | cc_pos = 0; |
319 | } |
2a204b45 |
320 | |
e524f5b2 |
321 | if (curCC == 0) { |
322 | d = uvuni_to_utf8(d, uv); |
fe067ad9 |
323 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
324 | croak(ErrLongerThanSrc, "reorder"); |
e524f5b2 |
325 | } |
fe067ad9 |
326 | } |
327 | if (seq_ext) |
328 | Safefree(seq_ext); |
329 | return d; |
330 | } |
ac5ea531 |
331 | |
fe067ad9 |
332 | static |
333 | U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) |
334 | { |
335 | U8* p = s; |
336 | U8* e = s + slen; |
337 | U8* dend = d + dlen; |
338 | |
39f4556f |
339 | UV uvS = 0; /* code point of the starter */ |
fe067ad9 |
340 | bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ |
341 | U8 preCC = 0; |
342 | |
343 | UV seq_ary[CC_SEQ_SIZE]; |
344 | UV* seq_ptr = seq_ary; /* use array at the beginning */ |
345 | UV* seq_ext = NULL; /* extend if need */ |
346 | STRLEN seq_max = CC_SEQ_SIZE; |
347 | STRLEN cc_pos = 0; |
348 | |
349 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) |
350 | croak(ErrTargetNotEnough, "compose"); |
351 | dend -= UTF8_MAXLEN; /* safety */ |
352 | |
353 | while (p < e) { |
354 | U8 curCC; |
355 | STRLEN retlen; |
356 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
357 | if (!retlen) |
358 | croak(ErrRetlenIsZero, "compose"); |
359 | p += retlen; |
ac5ea531 |
360 | |
fe067ad9 |
361 | curCC = getCombinClass(uv); |
82e740b6 |
362 | |
fe067ad9 |
363 | if (!valid_uvS) { |
e524f5b2 |
364 | if (curCC == 0) { |
fe067ad9 |
365 | uvS = uv; /* the first Starter is found */ |
366 | valid_uvS = TRUE; |
367 | if (p < e) |
368 | continue; |
e524f5b2 |
369 | } |
fe067ad9 |
370 | else { |
371 | d = uvuni_to_utf8(d, uv); |
372 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
373 | croak(ErrLongerThanSrc, "compose"); |
374 | continue; |
375 | } |
376 | } |
377 | else { |
378 | bool composed; |
379 | |
380 | /* blocked */ |
381 | if (iscontig && cc_pos || /* discontiguous combination */ |
382 | curCC != 0 && preCC == curCC || /* blocked by same CC */ |
383 | preCC > curCC) /* blocked by higher CC: revised D2 */ |
384 | composed = FALSE; |
385 | |
386 | /* not blocked: |
387 | iscontig && cc_pos == 0 -- contiguous combination |
388 | curCC == 0 && preCC == 0 -- starter + starter |
389 | curCC != 0 && preCC < curCC -- lower CC */ |
390 | else { |
391 | /* try composition */ |
392 | UV uvComp = composite_uv(uvS, uv); |
393 | |
394 | if (uvComp && !isExclusion(uvComp)) { |
395 | uvS = uvComp; |
396 | composed = TRUE; |
82e740b6 |
397 | |
fe067ad9 |
398 | /* preCC should not be changed to curCC */ |
399 | /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ |
400 | if (p < e) |
401 | continue; |
402 | } |
403 | else |
404 | composed = FALSE; |
405 | } |
406 | |
407 | if (!composed) { |
408 | preCC = curCC; |
409 | if (curCC != 0 || !(p < e)) { |
410 | if (seq_max < cc_pos + 1) { /* extend if need */ |
411 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ |
412 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ |
413 | New(0, seq_ext, seq_max, UV); |
414 | Copy(seq_ary, seq_ext, cc_pos, UV); |
415 | } |
416 | else { |
417 | Renew(seq_ext, seq_max, UV); |
418 | } |
2b8d773d |
419 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 |
420 | } |
421 | seq_ptr[cc_pos] = uv; |
422 | ++cc_pos; |
423 | } |
424 | if (curCC != 0 && p < e) |
425 | continue; |
ac5ea531 |
426 | } |
ac5ea531 |
427 | } |
428 | |
fe067ad9 |
429 | d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ |
430 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
431 | croak(ErrLongerThanSrc, "compose"); |
432 | |
e524f5b2 |
433 | if (cc_pos) { |
fe067ad9 |
434 | STRLEN i; |
ac5ea531 |
435 | |
fe067ad9 |
436 | for (i = 0; i < cc_pos; i++) { |
437 | d = uvuni_to_utf8(d, seq_ptr[i]); |
438 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
439 | croak(ErrLongerThanSrc, "compose"); |
440 | } |
441 | cc_pos = 0; |
ac5ea531 |
442 | } |
fe067ad9 |
443 | |
444 | uvS = uv; |
ac5ea531 |
445 | } |
fe067ad9 |
446 | if (seq_ext) |
447 | Safefree(seq_ext); |
448 | return d; |
449 | } |
450 | |
451 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
452 | |
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *in, *buf, *buf_end;
    STRLEN in_len, buf_len;
  CODE:
    /* Returns the decomposition of src (compatibility mappings if compat
       is true, canonical ones otherwise) as a new UTF-8 flagged string.
       Combining characters are not reordered here. */
    in = (U8*)sv_2pvunicode(src,&in_len);

    /* start with a buffer as long as the source;
       pv_utf8_decompose() enlarges it as needed */
    buf_len = in_len;
    New(0, buf, buf_len+1, U8);
    buf_end = pv_utf8_decompose(in, in_len, &buf, buf_len, (bool)SvTRUE(compat));

    dst = newSVpvn((char *)buf, buf_end - buf);
    SvUTF8_on(dst);
    Safefree(buf);
    RETVAL = dst;
  OUTPUT:
    RETVAL
474 | |
fe067ad9 |
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *in, *out, *out_end;
    STRLEN in_len, out_len;
  CODE:
    /* Returns a new UTF-8 flagged string holding src with its combining
       characters put into canonical order. */
    in = (U8*)sv_2pvunicode(src,&in_len);

    /* pv_utf8_reorder() wants UTF8_MAXLEN bytes more than the source */
    out_len = in_len + UTF8_MAXLEN;
    dst = newSVpvn("", 0);
    out = (U8*)SvGROW(dst,out_len+1);
    SvUTF8_on(dst);

    out_end = pv_utf8_reorder(in, in_len, out, out_len);
    *out_end = '\0';
    SvCUR_set(dst, out_end - out);
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 |
495 | |
2a204b45 |
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *in, *out, *out_end;
    STRLEN in_len, out_len;
  CODE:
    /* Returns a new UTF-8 flagged string holding the composition of src.
       Called as composeContiguous (ix == 1), only contiguous
       compositions are made. */
    in = (U8*)sv_2pvunicode(src,&in_len);

    /* pv_utf8_compose() wants UTF8_MAXLEN bytes more than the source */
    out_len = in_len + UTF8_MAXLEN;
    dst = newSVpvn("", 0);
    out = (U8*)SvGROW(dst,out_len+1);
    SvUTF8_on(dst);

    out_end = pv_utf8_compose(in, in_len, out, out_len, (bool)ix);
    *out_end = '\0';
    SvCUR_set(dst, out_end - out);
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 |
518 | |
fe067ad9 |
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    SV *dst;
    U8 *in, *decomp, *decomp_end, *out, *out_end;
    STRLEN in_len, decomp_len, out_len;
  CODE:
    /* NFD (ix == 0) or NFKD (ix == 1): decompose, then reorder.
       Returns the result as a new UTF-8 flagged string. */
    in = (U8*)sv_2pvunicode(src,&in_len);

    /* step 1: decompose into a temporary buffer, which may be enlarged */
    decomp_len = in_len;
    New(0, decomp, decomp_len+1, U8);
    decomp_end = pv_utf8_decompose(in, in_len, &decomp, decomp_len, (bool)ix);
    *decomp_end = '\0';
    decomp_len = decomp_end - decomp; /* bytes used, not the allocated size */

    /* step 2: reorder straight into the buffer of the result SV */
    dst = newSVpvn("", 0);
    out_len = decomp_len + UTF8_MAXLEN;
    out = (U8*)SvGROW(dst,out_len+1);
    SvUTF8_on(dst);
    out_end = pv_utf8_reorder(decomp, decomp_len, out, out_len);
    *out_end = '\0';
    SvCUR_set(dst, out_end - out);

    /* return */
    Safefree(decomp);
    RETVAL = dst;
  OUTPUT:
    RETVAL
82e740b6 |
552 | |
fe067ad9 |
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC = 2
  PREINIT:
    SV *dst;
    U8 *in, *decomp, *decomp_end, *sorted, *sorted_end, *out, *out_end;
    STRLEN in_len, decomp_len, sorted_len, out_len;
  CODE:
    /* NFC (ix == 0), NFKC (ix == 1) or FCC (ix == 2):
       decompose, reorder, then compose.  Only NFKC uses compatibility
       mappings; only FCC restricts itself to contiguous composition.
       Returns the result as a new UTF-8 flagged string. */
    in = (U8*)sv_2pvunicode(src,&in_len);

    /* step 1: decompose into a temporary buffer, which may be enlarged */
    decomp_len = in_len;
    New(0, decomp, decomp_len+1, U8);
    decomp_end = pv_utf8_decompose(in, in_len, &decomp, decomp_len, (bool)(ix==1));
    *decomp_end = '\0';
    decomp_len = decomp_end - decomp; /* bytes used, not the allocated size */

    /* step 2: reorder into a second temporary buffer */
    sorted_len = decomp_len + UTF8_MAXLEN;
    New(0, sorted, sorted_len+1, U8);
    sorted_end = pv_utf8_reorder(decomp, decomp_len, sorted, sorted_len);
    *sorted_end = '\0';
    sorted_len = sorted_end - sorted;

    /* step 3: compose straight into the buffer of the result SV */
    dst = newSVpvn("", 0);
    out_len = sorted_len + UTF8_MAXLEN;
    out = (U8*)SvGROW(dst,out_len+1);
    SvUTF8_on(dst);
    out_end = pv_utf8_compose(sorted, sorted_len, out, out_len, (bool)(ix==2));
    *out_end = '\0';
    SvCUR_set(dst, out_end - out);

    /* return */
    Safefree(decomp);
    Safefree(sorted);
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 |
595 | |
2b8d773d |
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    /* Returns true if src is already in NFD (checkNFKD: NFKD), false
       otherwise.  The answer is false as soon as a character breaks the
       canonical ordering, is a Hangul syllable, or has a canonical
       (checkNFKD: compatibility) decomposition. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if ((curCC != 0 && curCC < preCC) /* canonical ordering violated */
            || Hangul_IsS(uv)
            || (ix ? dec_compat(uv) : dec_canonical(uv))) {
            result = FALSE;
            break;
        }

        preCC = curCC;
        p += retlen;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
8f118dcd |
630 | |
631 | |
2b8d773d |
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Quick check for NFC (checkNFKC: NFKC).
       Returns false (NO) when some character rules the form out, undef
       (MAYBE) when no character says NO but at least one satisfies
       isComp2nd(), and true (YES) otherwise. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property */
        if (Hangul_IsS(uv)) {
            /* YES: Hangul syllables are canonical composites */
        }
        else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
            result = FALSE;
            break;
        }
        else if (isComp2nd(uv)) {
            isMAYBE = TRUE;
        }
        else if (ix) {
            /* NFKC_NO when having compatibility mapping. */
            char *canon  = (char *) dec_canonical(uv);
            char *compat = (char *) dec_compat(uv);
            if (compat && !(canon && strEQ(canon, compat))) {
                result = FALSE;
                break;
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
        p += retlen;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
8f118dcd |
686 | |
687 | |
2b8d773d |
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Check for FCD (checkFCC: FCC).
       For every character, the combining class of the first character of
       its canonical decomposition must not fall below the class of the
       last character of the preceding character's decomposition.
       checkFCC additionally answers NO for exclusions, singletons and
       non-starter decompositions, and MAYBE (undef) for isComp2nd()
       characters. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    preCC = 0;
    p = s;
    while (p < e) {
        U8 *sCan;
        UV uvLead;
        STRLEN canlen = 0;
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        sCan = (U8*) dec_canonical(uv);

        /* leading character: first of the decomposition, or uv itself */
        if (sCan) {
            STRLEN canret;
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        /* trailing character: last of the decomposition, or uv itself */
        if (sCan) {
            STRLEN canret;
            UV uvTrail;
            U8* eCan = sCan + canlen;
            U8* pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }

        p += retlen;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
82e740b6 |
761 | |
762 | |
ac5ea531 |
# Perl-visible binding for the C function getCombinClass() defined above:
# takes a code point and returns its combining class.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
ac5ea531 |
767 | |
# Perl-visible binding for isExclusion(), which is not defined in this
# part of the file (presumably in one of the mkheader-generated headers).
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
772 | |
# Perl-visible binding for isSingleton(), which is not defined in this
# part of the file (presumably in one of the mkheader-generated headers).
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
777 | |
# Perl-visible binding for isNonStDecomp(), which is not defined in this
# part of the file (presumably in one of the mkheader-generated headers).
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
782 | |
# Perl-visible binding for isComp2nd(), which is not defined in this part
# of the file (presumably in one of the mkheader-generated headers).
# The two aliases are plain synonyms: there is no CODE section here that
# looks at ix.
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
790 | |
791 | |
792 | |
2b8d773d |
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /* True if uv is a Hangul syllable or has a canonical (isNFKD_NO:
       compatibility) decomposition, i.e. NFD_NO or NFKD_NO. */
    RETVAL = boolSV(Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)));
  OUTPUT:
    RETVAL
8f118dcd |
807 | |
808 | |
2b8d773d |
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool result;
  CODE:
    /* True if uv is an exclusion, a singleton or a non-starter
       decomposition (NFC_NO).  Called as isNFKC_NO, it is also true when
       uv has a compatibility mapping that differs from its canonical
       one (or has no canonical one at all). */
    result = (bool)((isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                    ? TRUE : FALSE);
    if (!result && ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);
        if (compat && (!canon || strNE(canon, compat)))
            result = TRUE; /* NFKC_NO */
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
ac5ea531 |
831 | |
2a204b45 |
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composed;
  CODE:
    /* Returns the composite of the pair (uv, uv2) as found by
       composite_uv(), or undef if there is none. */
    composed = composite_uv(uv, uv2);
    if (composed)
        RETVAL = newSVuv(composed);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 |
844 | |
8f118dcd |
845 | |
846 | |
ac5ea531 |
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Returns the canonical (getCompat: compatibility) decomposition of
       uv as a UTF-8 flagged string, or undef if uv has none.  Hangul
       syllables are decomposed arithmetically. */
    if (!Hangul_IsS(uv)) {
        U8* mapped = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!mapped)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)mapped, strlen((char *)mapped));
    }
    else {
        U8 buf[3 * UTF8_MAXLEN + 1]; /* room for L, V and T */
        U8 *buf_end = pv_cat_decompHangul(buf, uv);
        RETVAL = newSVpvn((char *)buf, buf_end - buf);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
868 | |
82e740b6 |
869 | |
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *part;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    /* Splits src in front of its last character with combining class 0
       and pushes two UTF-8 flagged mortal strings: everything before
       that character, and the rest starting with it.  Without such a
       character the first string is empty and the second is all of src. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    part = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(part);
    XPUSHs(part);

    part = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(part);
    XPUSHs(part);
898 | |