Commit | Line | Data |
db81d362 |
1 | /* |
2 | * This code was copied from perl/toke.c and subsequently butchered |
3 | * by Lukas Mai (2012). |
4 | */ |
5 | |
6 | /* vvvvvvvvvvvvvvvvvvvvv I HAVE NO IDEA WHAT I'M DOING vvvvvvvvvvvvvvvvvvvv */ |
7 | #define PL_linestr (PL_parser->linestr) |
8 | #define PL_copline (PL_parser->copline) |
9 | #define PL_bufptr (PL_parser->bufptr) |
10 | #define PL_bufend (PL_parser->bufend) |
11 | #define PL_multi_start (PL_parser->multi_start) |
12 | #define PL_multi_open (PL_parser->multi_open) |
13 | #define PL_multi_close (PL_parser->multi_close) |
14 | #define PL_multi_end (PL_parser->multi_end) |
15 | #define PL_rsfp (PL_parser->rsfp) |
16 | |
17 | #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline)) |
18 | |
19 | #ifdef USE_UTF8_SCRIPTS |
20 | # define UTF (!IN_BYTES) |
21 | #else |
22 | # define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8)) |
23 | #endif |
24 | |
25 | static STRLEN S_scan_word(const char *start, int allow_package) { |
26 | const char *s = start; |
27 | for (;;) { |
28 | if (isALNUM(*s) || (!UTF && isALNUMC_L1(*s))) { /* UTF handled below */ |
29 | s++; |
30 | } else if (allow_package && s > start && *s == '\'' && isIDFIRST_lazy_if(s+1, UTF)) { |
31 | s++; |
32 | } else if (allow_package && s[0] == ':' && s[1] == ':' && isIDFIRST_lazy_if(s+2, UTF)) { |
33 | s += 2; |
34 | } else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { |
35 | do { |
36 | s += UTF8SKIP(s); |
37 | } while (UTF8_IS_CONTINUED(*s) && is_utf8_mark((U8*)s)); |
38 | } else { |
39 | return s - start; |
40 | } |
41 | } |
42 | } |
43 | |
44 | static char *S_scan_str(pTHX_ SV *sv, int keep_quoted, int keep_delims) { |
45 | dVAR; |
46 | char *start = PL_bufptr; |
47 | const char *tmps; /* temp string, used for delimiter matching */ |
48 | char *s = start; /* current position in the buffer */ |
49 | char term; /* terminating character */ |
50 | char *to; /* current position in the sv's data */ |
51 | I32 brackets = 1; /* bracket nesting level */ |
52 | bool has_utf8 = FALSE; /* is there any utf8 content? */ |
53 | I32 termcode; /* terminating char. code */ |
54 | U8 termstr[UTF8_MAXBYTES]; /* terminating string */ |
55 | STRLEN termlen; /* length of terminating string */ |
56 | int last_off = 0; /* last position for nesting bracket */ |
57 | |
58 | /* XXX ATTENTION: we don't skip whitespace! */ |
59 | |
60 | /* mark where we are, in case we need to report errors */ |
61 | CLINE; |
62 | |
63 | /* after skipping whitespace, the next character is the terminator */ |
64 | term = *s; |
65 | if (!UTF) { |
66 | termcode = termstr[0] = term; |
67 | termlen = 1; |
68 | } |
69 | else { |
70 | termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen); |
71 | Copy(s, termstr, termlen, U8); |
72 | if (!UTF8_IS_INVARIANT(term)) |
73 | has_utf8 = TRUE; |
74 | } |
75 | |
76 | /* mark where we are */ |
77 | PL_multi_start = CopLINE(PL_curcop); |
78 | PL_multi_open = term; |
79 | |
80 | /* find corresponding closing delimiter */ |
81 | if (term && (tmps = strchr("([{< )]}> )]}>",term))) |
82 | termcode = termstr[0] = term = tmps[5]; |
83 | |
84 | PL_multi_close = term; |
85 | |
86 | { |
87 | STRLEN dummy; |
88 | SvPV_force(sv, dummy); |
89 | sv_setpvs(sv, ""); |
90 | SvGROW(sv, 80); |
91 | } |
92 | |
93 | /* move past delimiter and try to read a complete string */ |
94 | if (keep_delims) |
95 | sv_catpvn(sv, s, termlen); |
96 | s += termlen; |
97 | for (;;) { |
98 | if (PL_encoding && !UTF) { |
99 | bool cont = TRUE; |
100 | |
101 | while (cont) { |
102 | int offset = s - SvPVX_const(PL_linestr); |
103 | const bool found = sv_cat_decode(sv, PL_encoding, PL_linestr, |
104 | &offset, (char*)termstr, termlen); |
105 | const char * const ns = SvPVX_const(PL_linestr) + offset; |
106 | char * const svlast = SvEND(sv) - 1; |
107 | |
108 | for (; s < ns; s++) { |
109 | if (*s == '\n' && !PL_rsfp && !PL_parser->filtered) |
110 | CopLINE_inc(PL_curcop); |
111 | } |
112 | if (!found) |
113 | goto read_more_line; |
114 | else { |
115 | /* handle quoted delimiters */ |
116 | if (SvCUR(sv) > 1 && *(svlast-1) == '\\') { |
117 | const char *t; |
118 | for (t = svlast-2; t >= SvPVX_const(sv) && *t == '\\';) |
119 | t--; |
120 | if ((svlast-1 - t) % 2) { |
121 | if (!keep_quoted) { |
122 | *(svlast-1) = term; |
123 | *svlast = '\0'; |
124 | SvCUR_set(sv, SvCUR(sv) - 1); |
125 | } |
126 | continue; |
127 | } |
128 | } |
129 | if (PL_multi_open == PL_multi_close) { |
130 | cont = FALSE; |
131 | } |
132 | else { |
133 | const char *t; |
134 | char *w; |
135 | for (t = w = SvPVX(sv)+last_off; t < svlast; w++, t++) { |
136 | /* At here, all closes are "was quoted" one, |
137 | so we don't check PL_multi_close. */ |
138 | if (*t == '\\') { |
139 | if (!keep_quoted && *(t+1) == PL_multi_open) |
140 | t++; |
141 | else |
142 | *w++ = *t++; |
143 | } |
144 | else if (*t == PL_multi_open) |
145 | brackets++; |
146 | |
147 | *w = *t; |
148 | } |
149 | if (w < t) { |
150 | *w++ = term; |
151 | *w = '\0'; |
152 | SvCUR_set(sv, w - SvPVX_const(sv)); |
153 | } |
154 | last_off = w - SvPVX(sv); |
155 | if (--brackets <= 0) |
156 | cont = FALSE; |
157 | } |
158 | } |
159 | } |
160 | if (!keep_delims) { |
161 | SvCUR_set(sv, SvCUR(sv) - 1); |
162 | *SvEND(sv) = '\0'; |
163 | } |
164 | break; |
165 | } |
166 | |
167 | /* extend sv if need be */ |
168 | SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1); |
169 | /* set 'to' to the next character in the sv's string */ |
170 | to = SvPVX(sv)+SvCUR(sv); |
171 | |
172 | /* if open delimiter is the close delimiter read unbridle */ |
173 | if (PL_multi_open == PL_multi_close) { |
174 | for (; s < PL_bufend; s++,to++) { |
175 | /* embedded newlines increment the current line number */ |
176 | if (*s == '\n' && !PL_rsfp && !PL_parser->filtered) |
177 | CopLINE_inc(PL_curcop); |
178 | /* handle quoted delimiters */ |
179 | if (*s == '\\' && s+1 < PL_bufend && term != '\\') { |
180 | if (!keep_quoted && s[1] == term) |
181 | s++; |
182 | /* any other quotes are simply copied straight through */ |
183 | else |
184 | *to++ = *s++; |
185 | } |
186 | /* terminate when run out of buffer (the for() condition), or |
187 | have found the terminator */ |
188 | else if (*s == term) { |
189 | if (termlen == 1) |
190 | break; |
191 | if (s+termlen <= PL_bufend && memEQ(s, (char*)termstr, termlen)) |
192 | break; |
193 | } |
194 | else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) |
195 | has_utf8 = TRUE; |
196 | *to = *s; |
197 | } |
198 | } |
199 | |
200 | /* if the terminator isn't the same as the start character (e.g., |
201 | matched brackets), we have to allow more in the quoting, and |
202 | be prepared for nested brackets. |
203 | */ |
204 | else { |
205 | /* read until we run out of string, or we find the terminator */ |
206 | for (; s < PL_bufend; s++,to++) { |
207 | /* embedded newlines increment the line count */ |
208 | if (*s == '\n' && !PL_rsfp && !PL_parser->filtered) |
209 | CopLINE_inc(PL_curcop); |
210 | /* backslashes can escape the open or closing characters */ |
211 | if (*s == '\\' && s+1 < PL_bufend) { |
212 | if (!keep_quoted && |
213 | ((s[1] == PL_multi_open) || (s[1] == PL_multi_close))) |
214 | s++; |
215 | else |
216 | *to++ = *s++; |
217 | } |
218 | /* allow nested opens and closes */ |
219 | else if (*s == PL_multi_close && --brackets <= 0) |
220 | break; |
221 | else if (*s == PL_multi_open) |
222 | brackets++; |
223 | else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) |
224 | has_utf8 = TRUE; |
225 | *to = *s; |
226 | } |
227 | } |
228 | /* terminate the copied string and update the sv's end-of-string */ |
229 | *to = '\0'; |
230 | SvCUR_set(sv, to - SvPVX_const(sv)); |
231 | |
232 | /* |
233 | * this next chunk reads more into the buffer if we're not done yet |
234 | */ |
235 | |
236 | if (s < PL_bufend) |
237 | break; /* handle case where we are done yet :-) */ |
238 | |
239 | #ifndef PERL_STRICT_CR |
240 | if (to - SvPVX_const(sv) >= 2) { |
241 | if ((to[-2] == '\r' && to[-1] == '\n') || |
242 | (to[-2] == '\n' && to[-1] == '\r')) |
243 | { |
244 | to[-2] = '\n'; |
245 | to--; |
246 | SvCUR_set(sv, to - SvPVX_const(sv)); |
247 | } |
248 | else if (to[-1] == '\r') |
249 | to[-1] = '\n'; |
250 | } |
251 | else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r') |
252 | to[-1] = '\n'; |
253 | #endif |
254 | |
255 | read_more_line: |
256 | /* if we're out of file, or a read fails, bail and reset the current |
257 | line marker so we can report where the unterminated string began |
258 | */ |
259 | CopLINE_inc(PL_curcop); |
260 | PL_bufptr = PL_bufend; |
261 | if (!lex_next_chunk(0)) { |
262 | CopLINE_set(PL_curcop, (line_t)PL_multi_start); |
263 | return NULL; |
264 | } |
265 | s = PL_bufptr; |
266 | } |
267 | |
268 | /* at this point, we have successfully read the delimited string */ |
269 | |
270 | if (!PL_encoding || UTF) { |
271 | if (keep_delims) |
272 | sv_catpvn(sv, s, termlen); |
273 | s += termlen; |
274 | } |
275 | if (has_utf8 || PL_encoding) |
276 | SvUTF8_on(sv); |
277 | |
278 | PL_multi_end = CopLINE(PL_curcop); |
279 | |
280 | /* if we allocated too much space, give some back */ |
281 | if (SvCUR(sv) + 5 < SvLEN(sv)) { |
282 | SvLEN_set(sv, SvCUR(sv) + 1); |
283 | SvPV_renew(sv, SvLEN(sv)); |
284 | } |
285 | |
286 | PL_bufptr = s; |
287 | return s; |
288 | } |
289 | /* ^^^^^^^^^^^^^^^^^^^^^ I HAVE NO IDEA WHAT I'M DOING ^^^^^^^^^^^^^^^^^^^^ */ |