Commit | Line | Data |
db81d362 |
1 | /* |
2 | * This code was copied from perl/toke.c and subsequently butchered |
3 | * by Lukas Mai (2012). |
4 | */ |
7dd35535 |
5 | /* vi: set ft=c: */ |
db81d362 |
6 | |
7 | /* vvvvvvvvvvvvvvvvvvvvv I HAVE NO IDEA WHAT I'M DOING vvvvvvvvvvvvvvvvvvvv */ |
8 | #define PL_linestr (PL_parser->linestr) |
9 | #define PL_copline (PL_parser->copline) |
10 | #define PL_bufptr (PL_parser->bufptr) |
11 | #define PL_bufend (PL_parser->bufend) |
12 | #define PL_multi_start (PL_parser->multi_start) |
13 | #define PL_multi_open (PL_parser->multi_open) |
14 | #define PL_multi_close (PL_parser->multi_close) |
15 | #define PL_multi_end (PL_parser->multi_end) |
16 | #define PL_rsfp (PL_parser->rsfp) |
17 | |
18 | #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline)) |
19 | |
20 | #ifdef USE_UTF8_SCRIPTS |
21 | # define UTF (!IN_BYTES) |
22 | #else |
23 | # define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8)) |
24 | #endif |
25 | |
7dd35535 |
26 | static STRLEN S_scan_word(pTHX_ const char *start, int allow_package) { |
db81d362 |
27 | const char *s = start; |
28 | for (;;) { |
29 | if (isALNUM(*s) || (!UTF && isALNUMC_L1(*s))) { /* UTF handled below */ |
30 | s++; |
31 | } else if (allow_package && s > start && *s == '\'' && isIDFIRST_lazy_if(s+1, UTF)) { |
32 | s++; |
33 | } else if (allow_package && s[0] == ':' && s[1] == ':' && isIDFIRST_lazy_if(s+2, UTF)) { |
34 | s += 2; |
35 | } else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { |
36 | do { |
37 | s += UTF8SKIP(s); |
38 | } while (UTF8_IS_CONTINUED(*s) && is_utf8_mark((U8*)s)); |
39 | } else { |
40 | return s - start; |
41 | } |
42 | } |
43 | } |
44 | |
45 | static char *S_scan_str(pTHX_ SV *sv, int keep_quoted, int keep_delims) { |
46 | dVAR; |
47 | char *start = PL_bufptr; |
48 | const char *tmps; /* temp string, used for delimiter matching */ |
49 | char *s = start; /* current position in the buffer */ |
50 | char term; /* terminating character */ |
51 | char *to; /* current position in the sv's data */ |
52 | I32 brackets = 1; /* bracket nesting level */ |
53 | bool has_utf8 = FALSE; /* is there any utf8 content? */ |
54 | I32 termcode; /* terminating char. code */ |
55 | U8 termstr[UTF8_MAXBYTES]; /* terminating string */ |
56 | STRLEN termlen; /* length of terminating string */ |
57 | int last_off = 0; /* last position for nesting bracket */ |
58 | |
59 | /* XXX ATTENTION: we don't skip whitespace! */ |
60 | |
61 | /* mark where we are, in case we need to report errors */ |
62 | CLINE; |
63 | |
64 | /* after skipping whitespace, the next character is the terminator */ |
65 | term = *s; |
66 | if (!UTF) { |
67 | termcode = termstr[0] = term; |
68 | termlen = 1; |
69 | } |
70 | else { |
7dd35535 |
71 | #if HAVE_PERL_VERSION(5, 16, 0) |
db81d362 |
72 | termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen); |
7dd35535 |
73 | #else |
74 | termcode = utf8_to_uvchr((U8*)s, &termlen); |
75 | #endif |
db81d362 |
76 | Copy(s, termstr, termlen, U8); |
77 | if (!UTF8_IS_INVARIANT(term)) |
78 | has_utf8 = TRUE; |
79 | } |
80 | |
81 | /* mark where we are */ |
82 | PL_multi_start = CopLINE(PL_curcop); |
83 | PL_multi_open = term; |
84 | |
85 | /* find corresponding closing delimiter */ |
86 | if (term && (tmps = strchr("([{< )]}> )]}>",term))) |
87 | termcode = termstr[0] = term = tmps[5]; |
88 | |
89 | PL_multi_close = term; |
90 | |
91 | { |
92 | STRLEN dummy; |
93 | SvPV_force(sv, dummy); |
94 | sv_setpvs(sv, ""); |
95 | SvGROW(sv, 80); |
96 | } |
97 | |
98 | /* move past delimiter and try to read a complete string */ |
99 | if (keep_delims) |
100 | sv_catpvn(sv, s, termlen); |
101 | s += termlen; |
102 | for (;;) { |
103 | if (PL_encoding && !UTF) { |
104 | bool cont = TRUE; |
105 | |
106 | while (cont) { |
107 | int offset = s - SvPVX_const(PL_linestr); |
108 | const bool found = sv_cat_decode(sv, PL_encoding, PL_linestr, |
109 | &offset, (char*)termstr, termlen); |
110 | const char * const ns = SvPVX_const(PL_linestr) + offset; |
111 | char * const svlast = SvEND(sv) - 1; |
112 | |
113 | for (; s < ns; s++) { |
7dd35535 |
114 | if (*s == '\n' && !PL_rsfp |
115 | #if HAVE_PERL_VERSION(5, 16, 0) |
116 | && !PL_parser->filtered |
117 | #endif |
118 | ) |
db81d362 |
119 | CopLINE_inc(PL_curcop); |
120 | } |
121 | if (!found) |
122 | goto read_more_line; |
123 | else { |
124 | /* handle quoted delimiters */ |
125 | if (SvCUR(sv) > 1 && *(svlast-1) == '\\') { |
126 | const char *t; |
127 | for (t = svlast-2; t >= SvPVX_const(sv) && *t == '\\';) |
128 | t--; |
129 | if ((svlast-1 - t) % 2) { |
130 | if (!keep_quoted) { |
131 | *(svlast-1) = term; |
132 | *svlast = '\0'; |
133 | SvCUR_set(sv, SvCUR(sv) - 1); |
134 | } |
135 | continue; |
136 | } |
137 | } |
138 | if (PL_multi_open == PL_multi_close) { |
139 | cont = FALSE; |
140 | } |
141 | else { |
142 | const char *t; |
143 | char *w; |
144 | for (t = w = SvPVX(sv)+last_off; t < svlast; w++, t++) { |
145 | /* At here, all closes are "was quoted" one, |
146 | so we don't check PL_multi_close. */ |
147 | if (*t == '\\') { |
148 | if (!keep_quoted && *(t+1) == PL_multi_open) |
149 | t++; |
150 | else |
151 | *w++ = *t++; |
152 | } |
153 | else if (*t == PL_multi_open) |
154 | brackets++; |
155 | |
156 | *w = *t; |
157 | } |
158 | if (w < t) { |
159 | *w++ = term; |
160 | *w = '\0'; |
161 | SvCUR_set(sv, w - SvPVX_const(sv)); |
162 | } |
163 | last_off = w - SvPVX(sv); |
164 | if (--brackets <= 0) |
165 | cont = FALSE; |
166 | } |
167 | } |
168 | } |
169 | if (!keep_delims) { |
170 | SvCUR_set(sv, SvCUR(sv) - 1); |
171 | *SvEND(sv) = '\0'; |
172 | } |
173 | break; |
174 | } |
175 | |
176 | /* extend sv if need be */ |
177 | SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1); |
178 | /* set 'to' to the next character in the sv's string */ |
179 | to = SvPVX(sv)+SvCUR(sv); |
180 | |
181 | /* if open delimiter is the close delimiter read unbridle */ |
182 | if (PL_multi_open == PL_multi_close) { |
183 | for (; s < PL_bufend; s++,to++) { |
184 | /* embedded newlines increment the current line number */ |
7dd35535 |
185 | if (*s == '\n' && !PL_rsfp |
186 | #if HAVE_PERL_VERSION(5, 16, 0) |
187 | && !PL_parser->filtered |
188 | #endif |
189 | ) |
db81d362 |
190 | CopLINE_inc(PL_curcop); |
191 | /* handle quoted delimiters */ |
192 | if (*s == '\\' && s+1 < PL_bufend && term != '\\') { |
193 | if (!keep_quoted && s[1] == term) |
194 | s++; |
195 | /* any other quotes are simply copied straight through */ |
196 | else |
197 | *to++ = *s++; |
198 | } |
199 | /* terminate when run out of buffer (the for() condition), or |
200 | have found the terminator */ |
201 | else if (*s == term) { |
202 | if (termlen == 1) |
203 | break; |
204 | if (s+termlen <= PL_bufend && memEQ(s, (char*)termstr, termlen)) |
205 | break; |
206 | } |
207 | else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) |
208 | has_utf8 = TRUE; |
209 | *to = *s; |
210 | } |
211 | } |
212 | |
213 | /* if the terminator isn't the same as the start character (e.g., |
214 | matched brackets), we have to allow more in the quoting, and |
215 | be prepared for nested brackets. |
216 | */ |
217 | else { |
218 | /* read until we run out of string, or we find the terminator */ |
219 | for (; s < PL_bufend; s++,to++) { |
220 | /* embedded newlines increment the line count */ |
7dd35535 |
221 | if (*s == '\n' && !PL_rsfp |
222 | #if HAVE_PERL_VERSION(5, 16, 0) |
223 | && !PL_parser->filtered |
224 | #endif |
225 | ) |
db81d362 |
226 | CopLINE_inc(PL_curcop); |
227 | /* backslashes can escape the open or closing characters */ |
228 | if (*s == '\\' && s+1 < PL_bufend) { |
229 | if (!keep_quoted && |
230 | ((s[1] == PL_multi_open) || (s[1] == PL_multi_close))) |
231 | s++; |
232 | else |
233 | *to++ = *s++; |
234 | } |
235 | /* allow nested opens and closes */ |
236 | else if (*s == PL_multi_close && --brackets <= 0) |
237 | break; |
238 | else if (*s == PL_multi_open) |
239 | brackets++; |
240 | else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) |
241 | has_utf8 = TRUE; |
242 | *to = *s; |
243 | } |
244 | } |
245 | /* terminate the copied string and update the sv's end-of-string */ |
246 | *to = '\0'; |
247 | SvCUR_set(sv, to - SvPVX_const(sv)); |
248 | |
249 | /* |
250 | * this next chunk reads more into the buffer if we're not done yet |
251 | */ |
252 | |
253 | if (s < PL_bufend) |
254 | break; /* handle case where we are done yet :-) */ |
255 | |
256 | #ifndef PERL_STRICT_CR |
257 | if (to - SvPVX_const(sv) >= 2) { |
258 | if ((to[-2] == '\r' && to[-1] == '\n') || |
259 | (to[-2] == '\n' && to[-1] == '\r')) |
260 | { |
261 | to[-2] = '\n'; |
262 | to--; |
263 | SvCUR_set(sv, to - SvPVX_const(sv)); |
264 | } |
265 | else if (to[-1] == '\r') |
266 | to[-1] = '\n'; |
267 | } |
268 | else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r') |
269 | to[-1] = '\n'; |
270 | #endif |
271 | |
272 | read_more_line: |
273 | /* if we're out of file, or a read fails, bail and reset the current |
274 | line marker so we can report where the unterminated string began |
275 | */ |
276 | CopLINE_inc(PL_curcop); |
277 | PL_bufptr = PL_bufend; |
278 | if (!lex_next_chunk(0)) { |
279 | CopLINE_set(PL_curcop, (line_t)PL_multi_start); |
280 | return NULL; |
281 | } |
282 | s = PL_bufptr; |
283 | } |
284 | |
285 | /* at this point, we have successfully read the delimited string */ |
286 | |
287 | if (!PL_encoding || UTF) { |
288 | if (keep_delims) |
289 | sv_catpvn(sv, s, termlen); |
290 | s += termlen; |
291 | } |
292 | if (has_utf8 || PL_encoding) |
293 | SvUTF8_on(sv); |
294 | |
295 | PL_multi_end = CopLINE(PL_curcop); |
296 | |
297 | /* if we allocated too much space, give some back */ |
298 | if (SvCUR(sv) + 5 < SvLEN(sv)) { |
299 | SvLEN_set(sv, SvCUR(sv) + 1); |
300 | SvPV_renew(sv, SvLEN(sv)); |
301 | } |
302 | |
303 | PL_bufptr = s; |
304 | return s; |
305 | } |
306 | /* ^^^^^^^^^^^^^^^^^^^^^ I HAVE NO IDEA WHAT I'M DOING ^^^^^^^^^^^^^^^^^^^^ */ |