Re: ext/Socket/Socket.xs
[p5sagit/p5-mst-13.2.git] / lib / Text / Balanced.pm
CommitLineData
3270c621 1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4use 5.005;
5use strict;
6
7package Text::Balanced;
8
9use Exporter;
10use SelfLoader;
11use vars qw { $VERSION @ISA %EXPORT_TAGS };
12
9686a75b 13$VERSION = '1.84';
3270c621 14@ISA = qw ( Exporter );
15
16%EXPORT_TAGS = ( ALL => [ qw(
17 &extract_delimited
18 &extract_bracketed
19 &extract_quotelike
20 &extract_codeblock
21 &extract_variable
22 &extract_tagged
23 &extract_multiple
24
25 &gen_delimited_pat
26 &gen_extract_tagged
27
28 &delimited_pat
29 ) ] );
30
31Exporter::export_ok_tags('ALL');
32
33# PROTOTYPES
34
35sub _match_bracketed($$$$$$);
36sub _match_variable($$);
37sub _match_codeblock($$$$$$$);
38sub _match_quotelike($$$$);
39
40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
42sub _failmsg {
43 my ($message, $pos) = @_;
44 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
45}
46
47sub _fail
48{
49 my ($wantarray, $textref, $message, $pos) = @_;
50 _failmsg $message, $pos if $message;
51 return ("",$$textref,"") if $wantarray;
52 return undef;
53}
54
55sub _succeed
56{
57 $@ = undef;
58 my ($wantarray,$textref) = splice @_, 0, 2;
59 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
60 my ($startlen) = $_[5];
61 my $remainderpos = $_[2];
62 if ($wantarray)
63 {
64 my @res;
65 while (my ($from, $len) = splice @_, 0, 2)
66 {
67 push @res, substr($$textref,$from,$len);
68 }
69 if ($extralen) { # CORRECT FILLET
70 my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
71 $res[1] = "$extra$res[1]";
72 eval { substr($$textref,$remainderpos,0) = $extra;
73 substr($$textref,$extrapos,$extralen,"\n")} ;
74 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
75 pos($$textref) = $remainderpos-$extralen+1; # RESET \G
76 }
77 else {
78 pos($$textref) = $remainderpos; # RESET \G
79 }
80 return @res;
81 }
82 else
83 {
84 my $match = substr($$textref,$_[0],$_[1]);
85 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
86 my $extra = $extralen
87 ? substr($$textref, $extrapos, $extralen)."\n" : "";
88 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
89 pos($$textref) = $_[4]; # RESET \G
90 return $match;
91 }
92}
93
94# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95
96sub gen_delimited_pat($;$) # ($delimiters;$escapes)
97{
98 my ($dels, $escs) = @_;
99 return "" unless $dels =~ /\S/;
100 $escs = '\\' unless $escs;
101 $escs .= substr($escs,-1) x (length($dels)-length($escs));
102 my @pat = ();
103 my $i;
104 for ($i=0; $i<length $dels; $i++)
105 {
106 my $del = quotemeta substr($dels,$i,1);
107 my $esc = quotemeta substr($escs,$i,1);
108 if ($del eq $esc)
109 {
110 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
111 }
112 else
113 {
114 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
115 }
116 }
117 my $pat = join '|', @pat;
118 return "(?:$pat)";
119}
120
121*delimited_pat = \&gen_delimited_pat;
122
123
124# THE EXTRACTION FUNCTIONS
125
126sub extract_delimited (;$$$$)
127{
128 my $textref = defined $_[0] ? \$_[0] : \$_;
129 my $wantarray = wantarray;
130 my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
131 my $pre = defined $_[2] ? $_[2] : '\s*';
132 my $esc = defined $_[3] ? $_[3] : qq{\\};
133 my $pat = gen_delimited_pat($del, $esc);
134 my $startpos = pos $$textref || 0;
135 return _fail($wantarray, $textref, "Not a delimited pattern", 0)
136 unless $$textref =~ m/\G($pre)($pat)/gc;
137 my $prelen = length($1);
138 my $matchpos = $startpos+$prelen;
139 my $endpos = pos $$textref;
140 return _succeed $wantarray, $textref,
141 $matchpos, $endpos-$matchpos, # MATCH
142 $endpos, length($$textref)-$endpos, # REMAINDER
143 $startpos, $prelen; # PREFIX
144}
145
146sub extract_bracketed (;$$$)
147{
148 my $textref = defined $_[0] ? \$_[0] : \$_;
149 my $ldel = defined $_[1] ? $_[1] : '{([<';
150 my $pre = defined $_[2] ? $_[2] : '\s*';
151 my $wantarray = wantarray;
152 my $qdel = "";
153 my $quotelike;
154 $ldel =~ s/'//g and $qdel .= q{'};
155 $ldel =~ s/"//g and $qdel .= q{"};
156 $ldel =~ s/`//g and $qdel .= q{`};
157 $ldel =~ s/q//g and $quotelike = 1;
158 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
159 my $rdel = $ldel;
160 unless ($rdel =~ tr/[({</])}>/)
161 {
162 return _fail $wantarray, $textref,
163 "Did not find a suitable bracket in delimiter: \"$_[1]\"",
164 0;
165 }
166 my $posbug = pos;
167 $ldel = join('|', map { quotemeta $_ } split('', $ldel));
168 $rdel = join('|', map { quotemeta $_ } split('', $rdel));
169 pos = $posbug;
170
171 my $startpos = pos $$textref || 0;
172 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
173
174 return _fail ($wantarray, $textref) unless @match;
175
176 return _succeed ( $wantarray, $textref,
177 $match[2], $match[5]+2, # MATCH
178 @match[8,9], # REMAINDER
179 @match[0,1], # PREFIX
180 );
181}
182
183sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
184{
185 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
186 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
187 unless ($$textref =~ m/\G$pre/gc)
188 {
189 _failmsg "Did not find prefix: /$pre/", $startpos;
190 return;
191 }
192
193 $ldelpos = pos $$textref;
194
195 unless ($$textref =~ m/\G($ldel)/gc)
196 {
197 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
198 pos $$textref;
199 pos $$textref = $startpos;
200 return;
201 }
202
203 my @nesting = ( $1 );
204 my $textlen = length $$textref;
205 while (pos $$textref < $textlen)
206 {
207 next if $$textref =~ m/\G\\./gcs;
208
209 if ($$textref =~ m/\G($ldel)/gc)
210 {
211 push @nesting, $1;
212 }
213 elsif ($$textref =~ m/\G($rdel)/gc)
214 {
215 my ($found, $brackettype) = ($1, $1);
216 if ($#nesting < 0)
217 {
218 _failmsg "Unmatched closing bracket: \"$found\"",
219 pos $$textref;
220 pos $$textref = $startpos;
221 return;
222 }
223 my $expected = pop(@nesting);
224 $expected =~ tr/({[</)}]>/;
225 if ($expected ne $brackettype)
226 {
227 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
228 pos $$textref;
229 pos $$textref = $startpos;
230 return;
231 }
232 last if $#nesting < 0;
233 }
234 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
235 {
9686a75b 236 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
3270c621 237 _failmsg "Unmatched embedded quote ($1)",
238 pos $$textref;
239 pos $$textref = $startpos;
240 return;
241 }
242 elsif ($quotelike && _match_quotelike($textref,"",1,0))
243 {
244 next;
245 }
246
247 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
248 }
249 if ($#nesting>=0)
250 {
251 _failmsg "Unmatched opening bracket(s): "
252 . join("..",@nesting)."..",
253 pos $$textref;
254 pos $$textref = $startpos;
255 return;
256 }
257
258 $endpos = pos $$textref;
259
260 return (
261 $startpos, $ldelpos-$startpos, # PREFIX
262 $ldelpos, 1, # OPENING BRACKET
263 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
264 $endpos-1, 1, # CLOSING BRACKET
265 $endpos, length($$textref)-$endpos, # REMAINDER
266 );
267}
268
269sub revbracket($)
270{
271 my $brack = reverse $_[0];
272 $brack =~ tr/[({</])}>/;
273 return $brack;
274}
275
276my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277
278sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
279{
280 my $textref = defined $_[0] ? \$_[0] : \$_;
281 my $ldel = $_[1];
282 my $rdel = $_[2];
283 my $pre = defined $_[3] ? $_[3] : '\s*';
284 my %options = defined $_[4] ? %{$_[4]} : ();
285 my $omode = defined $options{fail} ? $options{fail} : '';
286 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
287 : defined($options{reject}) ? $options{reject}
288 : ''
289 ;
290 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
291 : defined($options{ignore}) ? $options{ignore}
292 : ''
293 ;
294
295 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
296 $@ = undef;
297
298 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
299
300 return _fail(wantarray, $textref) unless @match;
301 return _succeed wantarray, $textref,
302 $match[2], $match[3]+$match[5]+$match[7], # MATCH
303 @match[8..9,0..1,2..7]; # REM, PRE, BITS
304}
305
306sub _match_tagged # ($$$$$$$)
307{
308 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
309 my $rdelspec;
310
311 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
312
313 unless ($$textref =~ m/\G($pre)/gc)
314 {
315 _failmsg "Did not find prefix: /$pre/", pos $$textref;
316 goto failed;
317 }
318
319 $opentagpos = pos($$textref);
320
321 unless ($$textref =~ m/\G$ldel/gc)
322 {
323 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
324 goto failed;
325 }
326
327 $textpos = pos($$textref);
328
329 if (!defined $rdel)
330 {
331 $rdelspec = $&;
332 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
333 {
334 _failmsg "Unable to construct closing tag to match: $rdel",
335 pos $$textref;
336 goto failed;
337 }
338 }
339 else
340 {
341 $rdelspec = eval "qq{$rdel}";
342 }
343
344 while (pos($$textref) < length($$textref))
345 {
346 next if $$textref =~ m/\G\\./gc;
347
348 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
349 {
350 $parapos = pos($$textref) - length($1)
351 unless defined $parapos;
352 }
353 elsif ($$textref =~ m/\G($rdelspec)/gc )
354 {
355 $closetagpos = pos($$textref)-length($1);
356 goto matched;
357 }
358 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
359 {
360 next;
361 }
362 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
363 {
364 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
365 goto short if ($omode eq 'PARA' || $omode eq 'MAX');
366 _failmsg "Found invalid nested tag: $1", pos $$textref;
367 goto failed;
368 }
369 elsif ($$textref =~ m/\G($ldel)/gc)
370 {
371 my $tag = $1;
372 pos($$textref) -= length($tag); # REWIND TO NESTED TAG
373 unless (_match_tagged(@_)) # MATCH NESTED TAG
374 {
375 goto short if $omode eq 'PARA' || $omode eq 'MAX';
376 _failmsg "Found unbalanced nested tag: $tag",
377 pos $$textref;
378 goto failed;
379 }
380 }
381 else { $$textref =~ m/./gcs }
382 }
383
384short:
385 $closetagpos = pos($$textref);
386 goto matched if $omode eq 'MAX';
387 goto failed unless $omode eq 'PARA';
388
389 if (defined $parapos) { pos($$textref) = $parapos }
390 else { $parapos = pos($$textref) }
391
392 return (
393 $startpos, $opentagpos-$startpos, # PREFIX
394 $opentagpos, $textpos-$opentagpos, # OPENING TAG
395 $textpos, $parapos-$textpos, # TEXT
396 $parapos, 0, # NO CLOSING TAG
397 $parapos, length($$textref)-$parapos, # REMAINDER
398 );
399
400matched:
401 $endpos = pos($$textref);
402 return (
403 $startpos, $opentagpos-$startpos, # PREFIX
404 $opentagpos, $textpos-$opentagpos, # OPENING TAG
405 $textpos, $closetagpos-$textpos, # TEXT
406 $closetagpos, $endpos-$closetagpos, # CLOSING TAG
407 $endpos, length($$textref)-$endpos, # REMAINDER
408 );
409
410failed:
411 _failmsg "Did not find closing tag", pos $$textref unless $@;
412 pos($$textref) = $startpos;
413 return;
414}
415
416sub extract_variable (;$$)
417{
418 my $textref = defined $_[0] ? \$_[0] : \$_;
419 return ("","","") unless defined $$textref;
420 my $pre = defined $_[1] ? $_[1] : '\s*';
421
422 my @match = _match_variable($textref,$pre);
423
424 return _fail wantarray, $textref unless @match;
425
426 return _succeed wantarray, $textref,
427 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
428}
429
430sub _match_variable($$)
431{
432 my ($textref, $pre) = @_;
433 my $startpos = pos($$textref) = pos($$textref)||0;
434 unless ($$textref =~ m/\G($pre)/gc)
435 {
436 _failmsg "Did not find prefix: /$pre/", pos $$textref;
437 return;
438 }
439 my $varpos = pos($$textref);
440 unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
441 {
442 _failmsg "Did not find leading dereferencer", pos $$textref;
443 pos $$textref = $startpos;
444 return;
445 }
446
447 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
448 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0))
449 {
450 _failmsg "Bad identifier after dereferencer", pos $$textref;
451 pos $$textref = $startpos;
452 return;
453 }
454
455 while (1)
456 {
457 next if _match_codeblock($textref,
458 qr/\s*->\s*(?:[a-zA-Z]\w+\s*)?/,
459 qr/[({[]/, qr/[)}\]]/,
460 qr/[({[]/, qr/[)}\]]/, 0);
461 next if _match_codeblock($textref,
462 qr/\s*/, qr/[{[]/, qr/[}\]]/,
463 qr/[{[]/, qr/[}\]]/, 0);
464 next if _match_variable($textref,'\s*->\s*');
465 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
466 last;
467 }
468
469 my $endpos = pos($$textref);
470 return ($startpos, $varpos-$startpos,
471 $varpos, $endpos-$varpos,
472 $endpos, length($$textref)-$endpos
473 );
474}
475
476sub extract_codeblock (;$$$$$)
477{
478 my $textref = defined $_[0] ? \$_[0] : \$_;
479 my $wantarray = wantarray;
480 my $ldel_inner = defined $_[1] ? $_[1] : '{';
481 my $pre = defined $_[2] ? $_[2] : '\s*';
482 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
483 my $rd = $_[4];
484 my $rdel_inner = $ldel_inner;
485 my $rdel_outer = $ldel_outer;
486 my $posbug = pos;
487 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
488 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
489 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
490 {
491 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
492 }
493 pos = $posbug;
494
495 my @match = _match_codeblock($textref, $pre,
496 $ldel_outer, $rdel_outer,
497 $ldel_inner, $rdel_inner,
498 $rd);
499 return _fail($wantarray, $textref) unless @match;
500 return _succeed($wantarray, $textref,
501 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
502 );
503
504}
505
506sub _match_codeblock($$$$$$$)
507{
508 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
509 my $startpos = pos($$textref) = pos($$textref) || 0;
510 unless ($$textref =~ m/\G($pre)/gc)
511 {
512 _failmsg qq{Did not match prefix /$pre/ at"} .
513 substr($$textref,pos($$textref),20) .
514 q{..."},
515 pos $$textref;
516 return;
517 }
518 my $codepos = pos($$textref);
519 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
520 {
521 _failmsg qq{Did not find expected opening bracket at "} .
522 substr($$textref,pos($$textref),20) .
523 q{..."},
524 pos $$textref;
525 pos $$textref = $startpos;
526 return;
527 }
528 my $closing = $1;
529 $closing =~ tr/([<{/)]>}/;
530 my $matched;
531 my $patvalid = 1;
532 while (pos($$textref) < length($$textref))
533 {
534 $matched = '';
535 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
536 {
537 $patvalid = 0;
538 next;
539 }
540
541 if ($$textref =~ m/\G\s*#.*/gc)
542 {
543 next;
544 }
545
546 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
547 {
548 unless ($matched = ($closing && $1 eq $closing) )
549 {
550 next if $1 eq '>'; # MIGHT BE A "LESS THAN"
551 _failmsg q{Mismatched closing bracket at "} .
552 substr($$textref,pos($$textref),20) .
553 qq{...". Expected '$closing'},
554 pos $$textref;
555 }
556 last;
557 }
558
559 if (_match_variable($textref,'\s*') ||
560 _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
561 {
562 $patvalid = 0;
563 next;
564 }
565
566
567 # NEED TO COVER MANY MORE CASES HERE!!!
568 if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
569 | =(?!>)
570 | (\*\*|&&|\|\||<<|>>)=?
571 | [!=][~=]
572 | split|grep|map|return
573 )#gcx)
574 {
575 $patvalid = 1;
576 next;
577 }
578
579 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
580 {
581 $patvalid = 1;
582 next;
583 }
584
585 if ($$textref =~ m/\G\s*$ldel_outer/gc)
586 {
587 _failmsg q{Improperly nested codeblock at "} .
588 substr($$textref,pos($$textref),20) .
589 q{..."},
590 pos $$textref;
591 last;
592 }
593
594 $patvalid = 0;
595 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
596 }
597 continue { $@ = undef }
598
599 unless ($matched)
600 {
601 _failmsg 'No match found for opening bracket', pos $$textref
602 unless $@;
603 return;
604 }
605
606 my $endpos = pos($$textref);
607 return ( $startpos, $codepos-$startpos,
608 $codepos, $endpos-$codepos,
609 $endpos, length($$textref)-$endpos,
610 );
611}
612
613
614my %mods = (
615 'none' => '[cgimsox]*',
616 'm' => '[cgimsox]*',
617 's' => '[cegimsox]*',
618 'tr' => '[cds]*',
619 'y' => '[cds]*',
620 'qq' => '',
621 'qx' => '',
622 'qw' => '',
623 'qr' => '[imsx]*',
624 'q' => '',
625 );
626
627sub extract_quotelike (;$$)
628{
629 my $textref = $_[0] ? \$_[0] : \$_;
630 my $wantarray = wantarray;
631 my $pre = defined $_[1] ? $_[1] : '\s*';
632
633 my @match = _match_quotelike($textref,$pre,1,0);
634 return _fail($wantarray, $textref) unless @match;
635 return _succeed($wantarray, $textref,
636 $match[2], $match[18]-$match[2], # MATCH
637 @match[18,19], # REMAINDER
638 @match[0,1], # PREFIX
639 @match[2..17], # THE BITS
640 @match[20,21], # ANY FILLET?
641 );
642};
643
644sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
645{
646 my ($textref, $pre, $rawmatch, $qmark) = @_;
647
648 my ($textlen,$startpos,
649 $oppos,
650 $preld1pos,$ld1pos,$str1pos,$rd1pos,
651 $preld2pos,$ld2pos,$str2pos,$rd2pos,
652 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
653
654 unless ($$textref =~ m/\G($pre)/gc)
655 {
656 _failmsg qq{Did not find prefix /$pre/ at "} .
657 substr($$textref, pos($$textref), 20) .
658 q{..."},
659 pos $$textref;
660 return;
661 }
662 $oppos = pos($$textref);
663
664 my $initial = substr($$textref,$oppos,1);
665
666 if ($initial && $initial =~ m|^[\"\'\`]|
667 || $rawmatch && $initial =~ m|^/|
668 || $qmark && $initial =~ m|^\?|)
669 {
9686a75b 670 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
3270c621 671 {
672 _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
673 substr($$textref, $oppos, 20) .
674 q{..."},
675 pos $$textref;
676 pos $$textref = $startpos;
677 return;
678 }
679 $modpos= pos($$textref);
680 $rd1pos = $modpos-1;
681
682 if ($initial eq '/' || $initial eq '?')
683 {
684 $$textref =~ m/\G$mods{none}/gc
685 }
686
687 my $endpos = pos($$textref);
688 return (
689 $startpos, $oppos-$startpos, # PREFIX
690 $oppos, 0, # NO OPERATOR
691 $oppos, 1, # LEFT DEL
692 $oppos+1, $rd1pos-$oppos-1, # STR/PAT
693 $rd1pos, 1, # RIGHT DEL
694 $modpos, 0, # NO 2ND LDEL
695 $modpos, 0, # NO 2ND STR
696 $modpos, 0, # NO 2ND RDEL
697 $modpos, $endpos-$modpos, # MODIFIERS
698 $endpos, $textlen-$endpos, # REMAINDER
699 );
700 }
701
702 unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
703 {
704 _failmsg q{No quotelike operator found after prefix at "} .
705 substr($$textref, pos($$textref), 20) .
706 q{..."},
707 pos $$textref;
708 pos $$textref = $startpos;
709 return;
710 }
711
712 my $op = $1;
713 $preld1pos = pos($$textref);
714 if ($op eq '<<') {
715 $ld1pos = pos($$textref);
716 my $label;
717 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
718 $label = $1;
719 }
720 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
721 | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
722 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
9686a75b 723 }gcsx) {
3270c621 724 $label = $+;
725 }
726 else {
727 $label = "";
728 }
729 my $extrapos = pos($$textref);
730 $$textref =~ m{.*\n}gc;
731 $str1pos = pos($$textref);
732 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
733 _failmsg qq{Missing here doc terminator ('$label') after "} .
734 substr($$textref, $startpos, 20) .
735 q{..."},
736 pos $$textref;
737 pos $$textref = $startpos;
738 return;
739 }
740 $rd1pos = pos($$textref);
741 $$textref =~ m{$label\n}gc;
742 $ld2pos = pos($$textref);
743 return (
744 $startpos, $oppos-$startpos, # PREFIX
745 $oppos, length($op), # OPERATOR
746 $ld1pos, $extrapos-$ld1pos, # LEFT DEL
747 $str1pos, $rd1pos-$str1pos, # STR/PAT
748 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
749 $ld2pos, 0, # NO 2ND LDEL
750 $ld2pos, 0, # NO 2ND STR
751 $ld2pos, 0, # NO 2ND RDEL
752 $ld2pos, 0, # NO MODIFIERS
753 $ld2pos, $textlen-$ld2pos, # REMAINDER
754 $extrapos, $str1pos-$extrapos, # FILLETED BIT
755 );
756 }
757
758 $$textref =~ m/\G\s*/gc;
759 $ld1pos = pos($$textref);
760 $str1pos = $ld1pos+1;
761
762 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
763 {
764 _failmsg "No block delimiter found after quotelike $op",
765 pos $$textref;
766 pos $$textref = $startpos;
767 return;
768 }
769 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
770 my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
771 if ($ldel1 =~ /[[(<{]/)
772 {
773 $rdel1 =~ tr/[({</])}>/;
774 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
775 || do { pos $$textref = $startpos; return };
776 }
777 else
778 {
9686a75b 779 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
3270c621 780 || do { pos $$textref = $startpos; return };
781 }
782 $ld2pos = $rd1pos = pos($$textref)-1;
783
784 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
785 if ($second_arg)
786 {
787 my ($ldel2, $rdel2);
788 if ($ldel1 =~ /[[(<{]/)
789 {
790 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
791 {
792 _failmsg "Missing second block for quotelike $op",
793 pos $$textref;
794 pos $$textref = $startpos;
795 return;
796 }
797 $ldel2 = $rdel2 = "\Q$1";
798 $rdel2 =~ tr/[({</])}>/;
799 }
800 else
801 {
802 $ldel2 = $rdel2 = $ldel1;
803 }
804 $str2pos = $ld2pos+1;
805
806 if ($ldel2 =~ /[[(<{]/)
807 {
808 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
809 _match_bracketed($textref,"",$ldel2,"","",$rdel2)
810 || do { pos $$textref = $startpos; return };
811 }
812 else
813 {
9686a75b 814 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
3270c621 815 || do { pos $$textref = $startpos; return };
816 }
817 $rd2pos = pos($$textref)-1;
818 }
819 else
820 {
821 $ld2pos = $str2pos = $rd2pos = $rd1pos;
822 }
823
824 $modpos = pos $$textref;
825
826 $$textref =~ m/\G($mods{$op})/gc;
827 my $endpos = pos $$textref;
828
829 return (
830 $startpos, $oppos-$startpos, # PREFIX
831 $oppos, length($op), # OPERATOR
832 $ld1pos, 1, # LEFT DEL
833 $str1pos, $rd1pos-$str1pos, # STR/PAT
834 $rd1pos, 1, # RIGHT DEL
835 $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
836 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
837 $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
838 $modpos, $endpos-$modpos, # MODIFIERS
839 $endpos, $textlen-$endpos, # REMAINDER
840 );
841}
842
843my $def_func =
844[
845 sub { extract_variable($_[0], '') },
846 sub { extract_quotelike($_[0],'') },
847 sub { extract_codeblock($_[0],'{}','') },
848];
849
850sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
851{
852 my $textref = defined($_[0]) ? \$_[0] : \$_;
853 my $posbug = pos;
854 my ($lastpos, $firstpos);
855 my @fields = ();
856
857 for ($$textref)
858 {
859 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
860 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
861 my $igunk = $_[3];
862
863 pos ||= 0;
864
865 unless (wantarray)
866 {
867 use Carp;
868 carp "extract_multiple reset maximal count to 1 in scalar context"
869 if $^W && defined($_[2]) && $max > 1;
870 $max = 1
871 }
872
873 my $unkpos;
874 my $func;
875 my $class;
876
877 my @class;
878 foreach $func ( @func )
879 {
880 if (ref($func) eq 'HASH')
881 {
882 push @class, (keys %$func)[0];
883 $func = (values %$func)[0];
884 }
885 else
886 {
887 push @class, undef;
888 }
889 }
890
891 FIELD: while (pos() < length())
892 {
893 my $field;
894 foreach my $i ( 0..$#func )
895 {
896 $func = $func[$i];
897 $class = $class[$i];
898 $lastpos = pos;
899 if (ref($func) eq 'CODE')
900 { ($field) = $func->($_) }
901 elsif (ref($func) eq 'Text::Balanced::Extractor')
902 { $field = $func->extract($_) }
903 elsif( m/\G$func/gc )
904 { $field = defined($1) ? $1 : $& }
905
906 if (defined($field) && length($field))
907 {
908 if (defined($unkpos) && !$igunk)
909 {
910 push @fields, substr($_, $unkpos, $lastpos-$unkpos);
911 $firstpos = $unkpos unless defined $firstpos;
912 undef $unkpos;
913 last FIELD if @fields == $max;
914 }
915 push @fields, $class
916 ? bless(\$field, $class)
917 : $field;
918 $firstpos = $lastpos unless defined $firstpos;
919 $lastpos = pos;
920 last FIELD if @fields == $max;
921 next FIELD;
922 }
923 }
924 if (/\G(.)/gcs)
925 {
926 $unkpos = pos()-1
927 unless $igunk || defined $unkpos;
928 }
929 }
930
931 if (defined $unkpos)
932 {
933 push @fields, substr($_, $unkpos);
934 $firstpos = $unkpos unless defined $firstpos;
935 $lastpos = length;
936 }
937 last;
938 }
939
940 pos $$textref = $lastpos;
941 return @fields if wantarray;
942
943 $firstpos ||= 0;
944 eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
945 pos $$textref = $firstpos };
946 return $fields[0];
947}
948
949
950sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
951{
952 my $ldel = $_[0];
953 my $rdel = $_[1];
954 my $pre = defined $_[2] ? $_[2] : '\s*';
955 my %options = defined $_[3] ? %{$_[3]} : ();
956 my $omode = defined $options{fail} ? $options{fail} : '';
957 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
958 : defined($options{reject}) ? $options{reject}
959 : ''
960 ;
961 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
962 : defined($options{ignore}) ? $options{ignore}
963 : ''
964 ;
965
966 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
967
968 my $posbug = pos;
969 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
970 pos = $posbug;
971
972 my $closure = sub
973 {
974 my $textref = defined $_[0] ? \$_[0] : \$_;
975 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
976
977 return _fail(wantarray, $textref) unless @match;
978 return _succeed wantarray, $textref,
979 $match[2], $match[3]+$match[5]+$match[7], # MATCH
980 @match[8..9,0..1,2..7]; # REM, PRE, BITS
981 };
982
983 bless $closure, 'Text::Balanced::Extractor';
984}
985
986package Text::Balanced::Extractor;
987
988sub extract($$) # ($self, $text)
989{
990 &{$_[0]}($_[1]);
991}
992
993package Text::Balanced::ErrorMsg;
994
995use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
996
9971;