Commit | Line | Data |
3270c621 |
1 | # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. |
2 | # FOR FULL DOCUMENTATION SEE Balanced.pod |
3 | |
4 | use 5.005; |
5 | use strict; |
6 | |
7 | package Text::Balanced; |
8 | |
9 | use Exporter; |
10 | use SelfLoader; |
11 | use vars qw { $VERSION @ISA %EXPORT_TAGS }; |
12 | |
2f250b7c |
13 | $VERSION = '1.85'; |
3270c621 |
14 | @ISA = qw ( Exporter ); |
15 | |
16 | %EXPORT_TAGS = ( ALL => [ qw( |
17 | &extract_delimited |
18 | &extract_bracketed |
19 | &extract_quotelike |
20 | &extract_codeblock |
21 | &extract_variable |
22 | &extract_tagged |
23 | &extract_multiple |
24 | |
25 | &gen_delimited_pat |
26 | &gen_extract_tagged |
27 | |
28 | &delimited_pat |
29 | ) ] ); |
30 | |
31 | Exporter::export_ok_tags('ALL'); |
32 | |
33 | # PROTOTYPES |
34 | |
35 | sub _match_bracketed($$$$$$); |
36 | sub _match_variable($$); |
37 | sub _match_codeblock($$$$$$$); |
38 | sub _match_quotelike($$$$); |
39 | |
40 | # HANDLE RETURN VALUES IN VARIOUS CONTEXTS |
41 | |
42 | sub _failmsg { |
43 | my ($message, $pos) = @_; |
44 | $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg"; |
45 | } |
46 | |
47 | sub _fail |
48 | { |
49 | my ($wantarray, $textref, $message, $pos) = @_; |
50 | _failmsg $message, $pos if $message; |
51 | return ("",$$textref,"") if $wantarray; |
52 | return undef; |
53 | } |
54 | |
55 | sub _succeed |
56 | { |
57 | $@ = undef; |
58 | my ($wantarray,$textref) = splice @_, 0, 2; |
59 | my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0); |
60 | my ($startlen) = $_[5]; |
61 | my $remainderpos = $_[2]; |
62 | if ($wantarray) |
63 | { |
64 | my @res; |
65 | while (my ($from, $len) = splice @_, 0, 2) |
66 | { |
67 | push @res, substr($$textref,$from,$len); |
68 | } |
69 | if ($extralen) { # CORRECT FILLET |
70 | my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n"); |
71 | $res[1] = "$extra$res[1]"; |
72 | eval { substr($$textref,$remainderpos,0) = $extra; |
73 | substr($$textref,$extrapos,$extralen,"\n")} ; |
74 | #REARRANGE HERE DOC AND FILLET IF POSSIBLE |
75 | pos($$textref) = $remainderpos-$extralen+1; # RESET \G |
76 | } |
77 | else { |
78 | pos($$textref) = $remainderpos; # RESET \G |
79 | } |
80 | return @res; |
81 | } |
82 | else |
83 | { |
84 | my $match = substr($$textref,$_[0],$_[1]); |
85 | substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; |
86 | my $extra = $extralen |
87 | ? substr($$textref, $extrapos, $extralen)."\n" : ""; |
88 | eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE |
89 | pos($$textref) = $_[4]; # RESET \G |
90 | return $match; |
91 | } |
92 | } |
93 | |
94 | # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING |
95 | |
96 | sub gen_delimited_pat($;$) # ($delimiters;$escapes) |
97 | { |
98 | my ($dels, $escs) = @_; |
99 | return "" unless $dels =~ /\S/; |
100 | $escs = '\\' unless $escs; |
101 | $escs .= substr($escs,-1) x (length($dels)-length($escs)); |
102 | my @pat = (); |
103 | my $i; |
104 | for ($i=0; $i<length $dels; $i++) |
105 | { |
106 | my $del = quotemeta substr($dels,$i,1); |
107 | my $esc = quotemeta substr($escs,$i,1); |
108 | if ($del eq $esc) |
109 | { |
110 | push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; |
111 | } |
112 | else |
113 | { |
114 | push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; |
115 | } |
116 | } |
117 | my $pat = join '|', @pat; |
118 | return "(?:$pat)"; |
119 | } |
120 | |
121 | *delimited_pat = \&gen_delimited_pat; |
122 | |
123 | |
124 | # THE EXTRACTION FUNCTIONS |
125 | |
126 | sub extract_delimited (;$$$$) |
127 | { |
128 | my $textref = defined $_[0] ? \$_[0] : \$_; |
129 | my $wantarray = wantarray; |
130 | my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; |
131 | my $pre = defined $_[2] ? $_[2] : '\s*'; |
132 | my $esc = defined $_[3] ? $_[3] : qq{\\}; |
133 | my $pat = gen_delimited_pat($del, $esc); |
134 | my $startpos = pos $$textref || 0; |
135 | return _fail($wantarray, $textref, "Not a delimited pattern", 0) |
136 | unless $$textref =~ m/\G($pre)($pat)/gc; |
137 | my $prelen = length($1); |
138 | my $matchpos = $startpos+$prelen; |
139 | my $endpos = pos $$textref; |
140 | return _succeed $wantarray, $textref, |
141 | $matchpos, $endpos-$matchpos, # MATCH |
142 | $endpos, length($$textref)-$endpos, # REMAINDER |
143 | $startpos, $prelen; # PREFIX |
144 | } |
145 | |
146 | sub extract_bracketed (;$$$) |
147 | { |
148 | my $textref = defined $_[0] ? \$_[0] : \$_; |
149 | my $ldel = defined $_[1] ? $_[1] : '{([<'; |
150 | my $pre = defined $_[2] ? $_[2] : '\s*'; |
151 | my $wantarray = wantarray; |
152 | my $qdel = ""; |
153 | my $quotelike; |
154 | $ldel =~ s/'//g and $qdel .= q{'}; |
155 | $ldel =~ s/"//g and $qdel .= q{"}; |
156 | $ldel =~ s/`//g and $qdel .= q{`}; |
157 | $ldel =~ s/q//g and $quotelike = 1; |
158 | $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; |
159 | my $rdel = $ldel; |
160 | unless ($rdel =~ tr/[({</])}>/) |
161 | { |
162 | return _fail $wantarray, $textref, |
163 | "Did not find a suitable bracket in delimiter: \"$_[1]\"", |
164 | 0; |
165 | } |
166 | my $posbug = pos; |
167 | $ldel = join('|', map { quotemeta $_ } split('', $ldel)); |
168 | $rdel = join('|', map { quotemeta $_ } split('', $rdel)); |
169 | pos = $posbug; |
170 | |
171 | my $startpos = pos $$textref || 0; |
172 | my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); |
173 | |
174 | return _fail ($wantarray, $textref) unless @match; |
175 | |
176 | return _succeed ( $wantarray, $textref, |
177 | $match[2], $match[5]+2, # MATCH |
178 | @match[8,9], # REMAINDER |
179 | @match[0,1], # PREFIX |
180 | ); |
181 | } |
182 | |
183 | sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel |
184 | { |
185 | my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; |
186 | my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); |
187 | unless ($$textref =~ m/\G$pre/gc) |
188 | { |
189 | _failmsg "Did not find prefix: /$pre/", $startpos; |
190 | return; |
191 | } |
192 | |
193 | $ldelpos = pos $$textref; |
194 | |
195 | unless ($$textref =~ m/\G($ldel)/gc) |
196 | { |
197 | _failmsg "Did not find opening bracket after prefix: \"$pre\"", |
198 | pos $$textref; |
199 | pos $$textref = $startpos; |
200 | return; |
201 | } |
202 | |
203 | my @nesting = ( $1 ); |
204 | my $textlen = length $$textref; |
205 | while (pos $$textref < $textlen) |
206 | { |
207 | next if $$textref =~ m/\G\\./gcs; |
208 | |
209 | if ($$textref =~ m/\G($ldel)/gc) |
210 | { |
211 | push @nesting, $1; |
212 | } |
213 | elsif ($$textref =~ m/\G($rdel)/gc) |
214 | { |
215 | my ($found, $brackettype) = ($1, $1); |
216 | if ($#nesting < 0) |
217 | { |
218 | _failmsg "Unmatched closing bracket: \"$found\"", |
219 | pos $$textref; |
220 | pos $$textref = $startpos; |
221 | return; |
222 | } |
223 | my $expected = pop(@nesting); |
224 | $expected =~ tr/({[</)}]>/; |
225 | if ($expected ne $brackettype) |
226 | { |
227 | _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, |
228 | pos $$textref; |
229 | pos $$textref = $startpos; |
230 | return; |
231 | } |
232 | last if $#nesting < 0; |
233 | } |
234 | elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) |
235 | { |
9686a75b |
236 | $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; |
3270c621 |
237 | _failmsg "Unmatched embedded quote ($1)", |
238 | pos $$textref; |
239 | pos $$textref = $startpos; |
240 | return; |
241 | } |
242 | elsif ($quotelike && _match_quotelike($textref,"",1,0)) |
243 | { |
244 | next; |
245 | } |
246 | |
247 | else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } |
248 | } |
249 | if ($#nesting>=0) |
250 | { |
251 | _failmsg "Unmatched opening bracket(s): " |
252 | . join("..",@nesting)."..", |
253 | pos $$textref; |
254 | pos $$textref = $startpos; |
255 | return; |
256 | } |
257 | |
258 | $endpos = pos $$textref; |
259 | |
260 | return ( |
261 | $startpos, $ldelpos-$startpos, # PREFIX |
262 | $ldelpos, 1, # OPENING BRACKET |
263 | $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS |
264 | $endpos-1, 1, # CLOSING BRACKET |
265 | $endpos, length($$textref)-$endpos, # REMAINDER |
266 | ); |
267 | } |
268 | |
269 | sub revbracket($) |
270 | { |
271 | my $brack = reverse $_[0]; |
272 | $brack =~ tr/[({</])}>/; |
273 | return $brack; |
274 | } |
275 | |
276 | my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; |
277 | |
278 | sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) |
279 | { |
280 | my $textref = defined $_[0] ? \$_[0] : \$_; |
281 | my $ldel = $_[1]; |
282 | my $rdel = $_[2]; |
283 | my $pre = defined $_[3] ? $_[3] : '\s*'; |
284 | my %options = defined $_[4] ? %{$_[4]} : (); |
285 | my $omode = defined $options{fail} ? $options{fail} : ''; |
286 | my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) |
287 | : defined($options{reject}) ? $options{reject} |
288 | : '' |
289 | ; |
290 | my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) |
291 | : defined($options{ignore}) ? $options{ignore} |
292 | : '' |
293 | ; |
294 | |
295 | if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } |
296 | $@ = undef; |
297 | |
298 | my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); |
299 | |
300 | return _fail(wantarray, $textref) unless @match; |
301 | return _succeed wantarray, $textref, |
302 | $match[2], $match[3]+$match[5]+$match[7], # MATCH |
303 | @match[8..9,0..1,2..7]; # REM, PRE, BITS |
304 | } |
305 | |
306 | sub _match_tagged # ($$$$$$$) |
307 | { |
308 | my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; |
309 | my $rdelspec; |
310 | |
311 | my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); |
312 | |
313 | unless ($$textref =~ m/\G($pre)/gc) |
314 | { |
315 | _failmsg "Did not find prefix: /$pre/", pos $$textref; |
316 | goto failed; |
317 | } |
318 | |
319 | $opentagpos = pos($$textref); |
320 | |
321 | unless ($$textref =~ m/\G$ldel/gc) |
322 | { |
323 | _failmsg "Did not find opening tag: /$ldel/", pos $$textref; |
324 | goto failed; |
325 | } |
326 | |
327 | $textpos = pos($$textref); |
328 | |
329 | if (!defined $rdel) |
330 | { |
331 | $rdelspec = $&; |
332 | unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes) |
333 | { |
334 | _failmsg "Unable to construct closing tag to match: $rdel", |
335 | pos $$textref; |
336 | goto failed; |
337 | } |
338 | } |
339 | else |
340 | { |
341 | $rdelspec = eval "qq{$rdel}"; |
342 | } |
343 | |
344 | while (pos($$textref) < length($$textref)) |
345 | { |
346 | next if $$textref =~ m/\G\\./gc; |
347 | |
348 | if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) |
349 | { |
350 | $parapos = pos($$textref) - length($1) |
351 | unless defined $parapos; |
352 | } |
353 | elsif ($$textref =~ m/\G($rdelspec)/gc ) |
354 | { |
355 | $closetagpos = pos($$textref)-length($1); |
356 | goto matched; |
357 | } |
358 | elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) |
359 | { |
360 | next; |
361 | } |
362 | elsif ($bad && $$textref =~ m/\G($bad)/gcs) |
363 | { |
364 | pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS |
365 | goto short if ($omode eq 'PARA' || $omode eq 'MAX'); |
366 | _failmsg "Found invalid nested tag: $1", pos $$textref; |
367 | goto failed; |
368 | } |
369 | elsif ($$textref =~ m/\G($ldel)/gc) |
370 | { |
371 | my $tag = $1; |
372 | pos($$textref) -= length($tag); # REWIND TO NESTED TAG |
373 | unless (_match_tagged(@_)) # MATCH NESTED TAG |
374 | { |
375 | goto short if $omode eq 'PARA' || $omode eq 'MAX'; |
376 | _failmsg "Found unbalanced nested tag: $tag", |
377 | pos $$textref; |
378 | goto failed; |
379 | } |
380 | } |
381 | else { $$textref =~ m/./gcs } |
382 | } |
383 | |
384 | short: |
385 | $closetagpos = pos($$textref); |
386 | goto matched if $omode eq 'MAX'; |
387 | goto failed unless $omode eq 'PARA'; |
388 | |
389 | if (defined $parapos) { pos($$textref) = $parapos } |
390 | else { $parapos = pos($$textref) } |
391 | |
392 | return ( |
393 | $startpos, $opentagpos-$startpos, # PREFIX |
394 | $opentagpos, $textpos-$opentagpos, # OPENING TAG |
395 | $textpos, $parapos-$textpos, # TEXT |
396 | $parapos, 0, # NO CLOSING TAG |
397 | $parapos, length($$textref)-$parapos, # REMAINDER |
398 | ); |
399 | |
400 | matched: |
401 | $endpos = pos($$textref); |
402 | return ( |
403 | $startpos, $opentagpos-$startpos, # PREFIX |
404 | $opentagpos, $textpos-$opentagpos, # OPENING TAG |
405 | $textpos, $closetagpos-$textpos, # TEXT |
406 | $closetagpos, $endpos-$closetagpos, # CLOSING TAG |
407 | $endpos, length($$textref)-$endpos, # REMAINDER |
408 | ); |
409 | |
410 | failed: |
411 | _failmsg "Did not find closing tag", pos $$textref unless $@; |
412 | pos($$textref) = $startpos; |
413 | return; |
414 | } |
415 | |
416 | sub extract_variable (;$$) |
417 | { |
418 | my $textref = defined $_[0] ? \$_[0] : \$_; |
419 | return ("","","") unless defined $$textref; |
420 | my $pre = defined $_[1] ? $_[1] : '\s*'; |
421 | |
422 | my @match = _match_variable($textref,$pre); |
423 | |
424 | return _fail wantarray, $textref unless @match; |
425 | |
426 | return _succeed wantarray, $textref, |
427 | @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX |
428 | } |
429 | |
430 | sub _match_variable($$) |
431 | { |
432 | my ($textref, $pre) = @_; |
433 | my $startpos = pos($$textref) = pos($$textref)||0; |
434 | unless ($$textref =~ m/\G($pre)/gc) |
435 | { |
436 | _failmsg "Did not find prefix: /$pre/", pos $$textref; |
437 | return; |
438 | } |
439 | my $varpos = pos($$textref); |
440 | unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc) |
441 | { |
442 | _failmsg "Did not find leading dereferencer", pos $$textref; |
443 | pos $$textref = $startpos; |
444 | return; |
445 | } |
446 | |
447 | unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci |
448 | or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)) |
449 | { |
450 | _failmsg "Bad identifier after dereferencer", pos $$textref; |
451 | pos $$textref = $startpos; |
452 | return; |
453 | } |
454 | |
455 | while (1) |
456 | { |
457 | next if _match_codeblock($textref, |
2f250b7c |
458 | qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, |
3270c621 |
459 | qr/[({[]/, qr/[)}\]]/, |
460 | qr/[({[]/, qr/[)}\]]/, 0); |
461 | next if _match_codeblock($textref, |
462 | qr/\s*/, qr/[{[]/, qr/[}\]]/, |
463 | qr/[{[]/, qr/[}\]]/, 0); |
464 | next if _match_variable($textref,'\s*->\s*'); |
465 | next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; |
466 | last; |
467 | } |
468 | |
469 | my $endpos = pos($$textref); |
470 | return ($startpos, $varpos-$startpos, |
471 | $varpos, $endpos-$varpos, |
472 | $endpos, length($$textref)-$endpos |
473 | ); |
474 | } |
475 | |
476 | sub extract_codeblock (;$$$$$) |
477 | { |
478 | my $textref = defined $_[0] ? \$_[0] : \$_; |
479 | my $wantarray = wantarray; |
480 | my $ldel_inner = defined $_[1] ? $_[1] : '{'; |
481 | my $pre = defined $_[2] ? $_[2] : '\s*'; |
482 | my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; |
483 | my $rd = $_[4]; |
484 | my $rdel_inner = $ldel_inner; |
485 | my $rdel_outer = $ldel_outer; |
486 | my $posbug = pos; |
487 | for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } |
488 | for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } |
489 | for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) |
490 | { |
491 | $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' |
492 | } |
493 | pos = $posbug; |
494 | |
495 | my @match = _match_codeblock($textref, $pre, |
496 | $ldel_outer, $rdel_outer, |
497 | $ldel_inner, $rdel_inner, |
498 | $rd); |
499 | return _fail($wantarray, $textref) unless @match; |
500 | return _succeed($wantarray, $textref, |
501 | @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX |
502 | ); |
503 | |
504 | } |
505 | |
506 | sub _match_codeblock($$$$$$$) |
507 | { |
508 | my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; |
509 | my $startpos = pos($$textref) = pos($$textref) || 0; |
510 | unless ($$textref =~ m/\G($pre)/gc) |
511 | { |
512 | _failmsg qq{Did not match prefix /$pre/ at"} . |
513 | substr($$textref,pos($$textref),20) . |
514 | q{..."}, |
515 | pos $$textref; |
516 | return; |
517 | } |
518 | my $codepos = pos($$textref); |
519 | unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER |
520 | { |
521 | _failmsg qq{Did not find expected opening bracket at "} . |
522 | substr($$textref,pos($$textref),20) . |
523 | q{..."}, |
524 | pos $$textref; |
525 | pos $$textref = $startpos; |
526 | return; |
527 | } |
528 | my $closing = $1; |
529 | $closing =~ tr/([<{/)]>}/; |
530 | my $matched; |
531 | my $patvalid = 1; |
532 | while (pos($$textref) < length($$textref)) |
533 | { |
534 | $matched = ''; |
535 | if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) |
536 | { |
537 | $patvalid = 0; |
538 | next; |
539 | } |
540 | |
541 | if ($$textref =~ m/\G\s*#.*/gc) |
542 | { |
543 | next; |
544 | } |
545 | |
546 | if ($$textref =~ m/\G\s*($rdel_outer)/gc) |
547 | { |
548 | unless ($matched = ($closing && $1 eq $closing) ) |
549 | { |
550 | next if $1 eq '>'; # MIGHT BE A "LESS THAN" |
551 | _failmsg q{Mismatched closing bracket at "} . |
552 | substr($$textref,pos($$textref),20) . |
553 | qq{...". Expected '$closing'}, |
554 | pos $$textref; |
555 | } |
556 | last; |
557 | } |
558 | |
559 | if (_match_variable($textref,'\s*') || |
560 | _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) |
561 | { |
562 | $patvalid = 0; |
563 | next; |
564 | } |
565 | |
566 | |
567 | # NEED TO COVER MANY MORE CASES HERE!!! |
568 | if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=? |
569 | | =(?!>) |
570 | | (\*\*|&&|\|\||<<|>>)=? |
571 | | [!=][~=] |
572 | | split|grep|map|return |
573 | )#gcx) |
574 | { |
575 | $patvalid = 1; |
576 | next; |
577 | } |
578 | |
579 | if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) |
580 | { |
581 | $patvalid = 1; |
582 | next; |
583 | } |
584 | |
585 | if ($$textref =~ m/\G\s*$ldel_outer/gc) |
586 | { |
587 | _failmsg q{Improperly nested codeblock at "} . |
588 | substr($$textref,pos($$textref),20) . |
589 | q{..."}, |
590 | pos $$textref; |
591 | last; |
592 | } |
593 | |
594 | $patvalid = 0; |
595 | $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; |
596 | } |
597 | continue { $@ = undef } |
598 | |
599 | unless ($matched) |
600 | { |
601 | _failmsg 'No match found for opening bracket', pos $$textref |
602 | unless $@; |
603 | return; |
604 | } |
605 | |
606 | my $endpos = pos($$textref); |
607 | return ( $startpos, $codepos-$startpos, |
608 | $codepos, $endpos-$codepos, |
609 | $endpos, length($$textref)-$endpos, |
610 | ); |
611 | } |
612 | |
613 | |
614 | my %mods = ( |
615 | 'none' => '[cgimsox]*', |
616 | 'm' => '[cgimsox]*', |
617 | 's' => '[cegimsox]*', |
618 | 'tr' => '[cds]*', |
619 | 'y' => '[cds]*', |
620 | 'qq' => '', |
621 | 'qx' => '', |
622 | 'qw' => '', |
623 | 'qr' => '[imsx]*', |
624 | 'q' => '', |
625 | ); |
626 | |
627 | sub extract_quotelike (;$$) |
628 | { |
629 | my $textref = $_[0] ? \$_[0] : \$_; |
630 | my $wantarray = wantarray; |
631 | my $pre = defined $_[1] ? $_[1] : '\s*'; |
632 | |
633 | my @match = _match_quotelike($textref,$pre,1,0); |
634 | return _fail($wantarray, $textref) unless @match; |
635 | return _succeed($wantarray, $textref, |
636 | $match[2], $match[18]-$match[2], # MATCH |
637 | @match[18,19], # REMAINDER |
638 | @match[0,1], # PREFIX |
639 | @match[2..17], # THE BITS |
640 | @match[20,21], # ANY FILLET? |
641 | ); |
642 | }; |
643 | |
644 | sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) |
645 | { |
646 | my ($textref, $pre, $rawmatch, $qmark) = @_; |
647 | |
648 | my ($textlen,$startpos, |
649 | $oppos, |
650 | $preld1pos,$ld1pos,$str1pos,$rd1pos, |
651 | $preld2pos,$ld2pos,$str2pos,$rd2pos, |
652 | $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); |
653 | |
654 | unless ($$textref =~ m/\G($pre)/gc) |
655 | { |
656 | _failmsg qq{Did not find prefix /$pre/ at "} . |
657 | substr($$textref, pos($$textref), 20) . |
658 | q{..."}, |
659 | pos $$textref; |
660 | return; |
661 | } |
662 | $oppos = pos($$textref); |
663 | |
664 | my $initial = substr($$textref,$oppos,1); |
665 | |
666 | if ($initial && $initial =~ m|^[\"\'\`]| |
667 | || $rawmatch && $initial =~ m|^/| |
668 | || $qmark && $initial =~ m|^\?|) |
669 | { |
9686a75b |
670 | unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) |
3270c621 |
671 | { |
672 | _failmsg qq{Did not find closing delimiter to match '$initial' at "} . |
673 | substr($$textref, $oppos, 20) . |
674 | q{..."}, |
675 | pos $$textref; |
676 | pos $$textref = $startpos; |
677 | return; |
678 | } |
679 | $modpos= pos($$textref); |
680 | $rd1pos = $modpos-1; |
681 | |
682 | if ($initial eq '/' || $initial eq '?') |
683 | { |
684 | $$textref =~ m/\G$mods{none}/gc |
685 | } |
686 | |
687 | my $endpos = pos($$textref); |
688 | return ( |
689 | $startpos, $oppos-$startpos, # PREFIX |
690 | $oppos, 0, # NO OPERATOR |
691 | $oppos, 1, # LEFT DEL |
692 | $oppos+1, $rd1pos-$oppos-1, # STR/PAT |
693 | $rd1pos, 1, # RIGHT DEL |
694 | $modpos, 0, # NO 2ND LDEL |
695 | $modpos, 0, # NO 2ND STR |
696 | $modpos, 0, # NO 2ND RDEL |
697 | $modpos, $endpos-$modpos, # MODIFIERS |
698 | $endpos, $textlen-$endpos, # REMAINDER |
699 | ); |
700 | } |
701 | |
702 | unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) |
703 | { |
704 | _failmsg q{No quotelike operator found after prefix at "} . |
705 | substr($$textref, pos($$textref), 20) . |
706 | q{..."}, |
707 | pos $$textref; |
708 | pos $$textref = $startpos; |
709 | return; |
710 | } |
711 | |
712 | my $op = $1; |
713 | $preld1pos = pos($$textref); |
714 | if ($op eq '<<') { |
715 | $ld1pos = pos($$textref); |
716 | my $label; |
717 | if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { |
718 | $label = $1; |
719 | } |
720 | elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' |
721 | | \G " ([^"\\]* (?:\\.[^"\\]*)*) " |
722 | | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` |
9686a75b |
723 | }gcsx) { |
3270c621 |
724 | $label = $+; |
725 | } |
726 | else { |
727 | $label = ""; |
728 | } |
729 | my $extrapos = pos($$textref); |
730 | $$textref =~ m{.*\n}gc; |
731 | $str1pos = pos($$textref); |
732 | unless ($$textref =~ m{.*?\n(?=$label\n)}gc) { |
733 | _failmsg qq{Missing here doc terminator ('$label') after "} . |
734 | substr($$textref, $startpos, 20) . |
735 | q{..."}, |
736 | pos $$textref; |
737 | pos $$textref = $startpos; |
738 | return; |
739 | } |
740 | $rd1pos = pos($$textref); |
741 | $$textref =~ m{$label\n}gc; |
742 | $ld2pos = pos($$textref); |
743 | return ( |
744 | $startpos, $oppos-$startpos, # PREFIX |
745 | $oppos, length($op), # OPERATOR |
746 | $ld1pos, $extrapos-$ld1pos, # LEFT DEL |
747 | $str1pos, $rd1pos-$str1pos, # STR/PAT |
748 | $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL |
749 | $ld2pos, 0, # NO 2ND LDEL |
750 | $ld2pos, 0, # NO 2ND STR |
751 | $ld2pos, 0, # NO 2ND RDEL |
752 | $ld2pos, 0, # NO MODIFIERS |
753 | $ld2pos, $textlen-$ld2pos, # REMAINDER |
754 | $extrapos, $str1pos-$extrapos, # FILLETED BIT |
755 | ); |
756 | } |
757 | |
758 | $$textref =~ m/\G\s*/gc; |
759 | $ld1pos = pos($$textref); |
760 | $str1pos = $ld1pos+1; |
761 | |
762 | unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD |
763 | { |
764 | _failmsg "No block delimiter found after quotelike $op", |
765 | pos $$textref; |
766 | pos $$textref = $startpos; |
767 | return; |
768 | } |
769 | pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN |
770 | my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); |
771 | if ($ldel1 =~ /[[(<{]/) |
772 | { |
773 | $rdel1 =~ tr/[({</])}>/; |
774 | _match_bracketed($textref,"",$ldel1,"","",$rdel1) |
775 | || do { pos $$textref = $startpos; return }; |
776 | } |
777 | else |
778 | { |
9686a75b |
779 | $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs |
3270c621 |
780 | || do { pos $$textref = $startpos; return }; |
781 | } |
782 | $ld2pos = $rd1pos = pos($$textref)-1; |
783 | |
784 | my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; |
785 | if ($second_arg) |
786 | { |
787 | my ($ldel2, $rdel2); |
788 | if ($ldel1 =~ /[[(<{]/) |
789 | { |
790 | unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD |
791 | { |
792 | _failmsg "Missing second block for quotelike $op", |
793 | pos $$textref; |
794 | pos $$textref = $startpos; |
795 | return; |
796 | } |
797 | $ldel2 = $rdel2 = "\Q$1"; |
798 | $rdel2 =~ tr/[({</])}>/; |
799 | } |
800 | else |
801 | { |
802 | $ldel2 = $rdel2 = $ldel1; |
803 | } |
804 | $str2pos = $ld2pos+1; |
805 | |
806 | if ($ldel2 =~ /[[(<{]/) |
807 | { |
808 | pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD |
809 | _match_bracketed($textref,"",$ldel2,"","",$rdel2) |
810 | || do { pos $$textref = $startpos; return }; |
811 | } |
812 | else |
813 | { |
9686a75b |
814 | $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs |
3270c621 |
815 | || do { pos $$textref = $startpos; return }; |
816 | } |
817 | $rd2pos = pos($$textref)-1; |
818 | } |
819 | else |
820 | { |
821 | $ld2pos = $str2pos = $rd2pos = $rd1pos; |
822 | } |
823 | |
824 | $modpos = pos $$textref; |
825 | |
826 | $$textref =~ m/\G($mods{$op})/gc; |
827 | my $endpos = pos $$textref; |
828 | |
829 | return ( |
830 | $startpos, $oppos-$startpos, # PREFIX |
831 | $oppos, length($op), # OPERATOR |
832 | $ld1pos, 1, # LEFT DEL |
833 | $str1pos, $rd1pos-$str1pos, # STR/PAT |
834 | $rd1pos, 1, # RIGHT DEL |
835 | $ld2pos, $second_arg, # 2ND LDEL (MAYBE) |
836 | $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) |
837 | $rd2pos, $second_arg, # 2ND RDEL (MAYBE) |
838 | $modpos, $endpos-$modpos, # MODIFIERS |
839 | $endpos, $textlen-$endpos, # REMAINDER |
840 | ); |
841 | } |
842 | |
843 | my $def_func = |
844 | [ |
845 | sub { extract_variable($_[0], '') }, |
846 | sub { extract_quotelike($_[0],'') }, |
847 | sub { extract_codeblock($_[0],'{}','') }, |
848 | ]; |
849 | |
850 | sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) |
851 | { |
852 | my $textref = defined($_[0]) ? \$_[0] : \$_; |
853 | my $posbug = pos; |
854 | my ($lastpos, $firstpos); |
855 | my @fields = (); |
856 | |
857 | for ($$textref) |
858 | { |
859 | my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; |
860 | my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; |
861 | my $igunk = $_[3]; |
862 | |
863 | pos ||= 0; |
864 | |
865 | unless (wantarray) |
866 | { |
867 | use Carp; |
868 | carp "extract_multiple reset maximal count to 1 in scalar context" |
869 | if $^W && defined($_[2]) && $max > 1; |
870 | $max = 1 |
871 | } |
872 | |
873 | my $unkpos; |
874 | my $func; |
875 | my $class; |
876 | |
877 | my @class; |
878 | foreach $func ( @func ) |
879 | { |
880 | if (ref($func) eq 'HASH') |
881 | { |
882 | push @class, (keys %$func)[0]; |
883 | $func = (values %$func)[0]; |
884 | } |
885 | else |
886 | { |
887 | push @class, undef; |
888 | } |
889 | } |
890 | |
891 | FIELD: while (pos() < length()) |
892 | { |
893 | my $field; |
894 | foreach my $i ( 0..$#func ) |
895 | { |
896 | $func = $func[$i]; |
897 | $class = $class[$i]; |
898 | $lastpos = pos; |
899 | if (ref($func) eq 'CODE') |
900 | { ($field) = $func->($_) } |
901 | elsif (ref($func) eq 'Text::Balanced::Extractor') |
902 | { $field = $func->extract($_) } |
903 | elsif( m/\G$func/gc ) |
904 | { $field = defined($1) ? $1 : $& } |
905 | |
906 | if (defined($field) && length($field)) |
907 | { |
908 | if (defined($unkpos) && !$igunk) |
909 | { |
910 | push @fields, substr($_, $unkpos, $lastpos-$unkpos); |
911 | $firstpos = $unkpos unless defined $firstpos; |
912 | undef $unkpos; |
913 | last FIELD if @fields == $max; |
914 | } |
915 | push @fields, $class |
916 | ? bless(\$field, $class) |
917 | : $field; |
918 | $firstpos = $lastpos unless defined $firstpos; |
919 | $lastpos = pos; |
920 | last FIELD if @fields == $max; |
921 | next FIELD; |
922 | } |
923 | } |
924 | if (/\G(.)/gcs) |
925 | { |
926 | $unkpos = pos()-1 |
927 | unless $igunk || defined $unkpos; |
928 | } |
929 | } |
930 | |
931 | if (defined $unkpos) |
932 | { |
933 | push @fields, substr($_, $unkpos); |
934 | $firstpos = $unkpos unless defined $firstpos; |
935 | $lastpos = length; |
936 | } |
937 | last; |
938 | } |
939 | |
940 | pos $$textref = $lastpos; |
941 | return @fields if wantarray; |
942 | |
943 | $firstpos ||= 0; |
944 | eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; |
945 | pos $$textref = $firstpos }; |
946 | return $fields[0]; |
947 | } |
948 | |
949 | |
950 | sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) |
951 | { |
952 | my $ldel = $_[0]; |
953 | my $rdel = $_[1]; |
954 | my $pre = defined $_[2] ? $_[2] : '\s*'; |
955 | my %options = defined $_[3] ? %{$_[3]} : (); |
956 | my $omode = defined $options{fail} ? $options{fail} : ''; |
957 | my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) |
958 | : defined($options{reject}) ? $options{reject} |
959 | : '' |
960 | ; |
961 | my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) |
962 | : defined($options{ignore}) ? $options{ignore} |
963 | : '' |
964 | ; |
965 | |
966 | if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } |
967 | |
968 | my $posbug = pos; |
969 | for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } |
970 | pos = $posbug; |
971 | |
972 | my $closure = sub |
973 | { |
974 | my $textref = defined $_[0] ? \$_[0] : \$_; |
975 | my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); |
976 | |
977 | return _fail(wantarray, $textref) unless @match; |
978 | return _succeed wantarray, $textref, |
979 | $match[2], $match[3]+$match[5]+$match[7], # MATCH |
980 | @match[8..9,0..1,2..7]; # REM, PRE, BITS |
981 | }; |
982 | |
983 | bless $closure, 'Text::Balanced::Extractor'; |
984 | } |
985 | |
986 | package Text::Balanced::Extractor; |
987 | |
988 | sub extract($$) # ($self, $text) |
989 | { |
990 | &{$_[0]}($_[1]); |
991 | } |
992 | |
993 | package Text::Balanced::ErrorMsg; |
994 | |
995 | use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }; |
996 | |
997 | 1; |