Re: [PATCH] Make the 'sort' pragma lexically scoped
[p5sagit/p5-mst-13.2.git] / lib / Text / Balanced.pm
CommitLineData
3270c621 1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4use 5.005;
5use strict;
6
7package Text::Balanced;
8
9use Exporter;
3270c621 10use vars qw { $VERSION @ISA %EXPORT_TAGS };
11
ce940696 12$VERSION = '1.95_01';
3270c621 13@ISA = qw ( Exporter );
14
15%EXPORT_TAGS = ( ALL => [ qw(
16 &extract_delimited
17 &extract_bracketed
18 &extract_quotelike
19 &extract_codeblock
20 &extract_variable
21 &extract_tagged
22 &extract_multiple
23
24 &gen_delimited_pat
25 &gen_extract_tagged
26
27 &delimited_pat
28 ) ] );
29
30Exporter::export_ok_tags('ALL');
31
32# PROTOTYPES
33
34sub _match_bracketed($$$$$$);
35sub _match_variable($$);
36sub _match_codeblock($$$$$$$);
37sub _match_quotelike($$$$);
38
75c4c974 39sub carp {
40 require Carp; goto &Carp::carp;
41}
42
43sub croak {
44 require Carp; goto &Carp::croak;
45}
46
3270c621 47# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
48
49sub _failmsg {
50 my ($message, $pos) = @_;
51 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
52}
53
54sub _fail
55{
56 my ($wantarray, $textref, $message, $pos) = @_;
57 _failmsg $message, $pos if $message;
58 return ("",$$textref,"") if $wantarray;
59 return undef;
60}
61
62sub _succeed
63{
64 $@ = undef;
65 my ($wantarray,$textref) = splice @_, 0, 2;
66 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
67 my ($startlen) = $_[5];
ce3ac4b6 68 my $oppos = $_[6];
3270c621 69 my $remainderpos = $_[2];
70 if ($wantarray)
71 {
72 my @res;
73 while (my ($from, $len) = splice @_, 0, 2)
74 {
75 push @res, substr($$textref,$from,$len);
76 }
77 if ($extralen) { # CORRECT FILLET
ce3ac4b6 78 my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
3270c621 79 $res[1] = "$extra$res[1]";
80 eval { substr($$textref,$remainderpos,0) = $extra;
81 substr($$textref,$extrapos,$extralen,"\n")} ;
82 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
83 pos($$textref) = $remainderpos-$extralen+1; # RESET \G
84 }
85 else {
86 pos($$textref) = $remainderpos; # RESET \G
87 }
88 return @res;
89 }
90 else
91 {
92 my $match = substr($$textref,$_[0],$_[1]);
93 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
94 my $extra = $extralen
95 ? substr($$textref, $extrapos, $extralen)."\n" : "";
96 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
97 pos($$textref) = $_[4]; # RESET \G
98 return $match;
99 }
100}
101
102# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
103
104sub gen_delimited_pat($;$) # ($delimiters;$escapes)
105{
106 my ($dels, $escs) = @_;
107 return "" unless $dels =~ /\S/;
108 $escs = '\\' unless $escs;
109 $escs .= substr($escs,-1) x (length($dels)-length($escs));
110 my @pat = ();
111 my $i;
112 for ($i=0; $i<length $dels; $i++)
113 {
114 my $del = quotemeta substr($dels,$i,1);
115 my $esc = quotemeta substr($escs,$i,1);
116 if ($del eq $esc)
117 {
118 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
119 }
120 else
121 {
122 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
123 }
124 }
125 my $pat = join '|', @pat;
126 return "(?:$pat)";
127}
128
129*delimited_pat = \&gen_delimited_pat;
130
131
132# THE EXTRACTION FUNCTIONS
133
134sub extract_delimited (;$$$$)
135{
136 my $textref = defined $_[0] ? \$_[0] : \$_;
137 my $wantarray = wantarray;
138 my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
139 my $pre = defined $_[2] ? $_[2] : '\s*';
140 my $esc = defined $_[3] ? $_[3] : qq{\\};
141 my $pat = gen_delimited_pat($del, $esc);
142 my $startpos = pos $$textref || 0;
143 return _fail($wantarray, $textref, "Not a delimited pattern", 0)
144 unless $$textref =~ m/\G($pre)($pat)/gc;
145 my $prelen = length($1);
146 my $matchpos = $startpos+$prelen;
147 my $endpos = pos $$textref;
148 return _succeed $wantarray, $textref,
149 $matchpos, $endpos-$matchpos, # MATCH
150 $endpos, length($$textref)-$endpos, # REMAINDER
151 $startpos, $prelen; # PREFIX
152}
153
154sub extract_bracketed (;$$$)
155{
156 my $textref = defined $_[0] ? \$_[0] : \$_;
157 my $ldel = defined $_[1] ? $_[1] : '{([<';
158 my $pre = defined $_[2] ? $_[2] : '\s*';
159 my $wantarray = wantarray;
160 my $qdel = "";
161 my $quotelike;
162 $ldel =~ s/'//g and $qdel .= q{'};
163 $ldel =~ s/"//g and $qdel .= q{"};
164 $ldel =~ s/`//g and $qdel .= q{`};
165 $ldel =~ s/q//g and $quotelike = 1;
166 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
167 my $rdel = $ldel;
168 unless ($rdel =~ tr/[({</])}>/)
169 {
170 return _fail $wantarray, $textref,
171 "Did not find a suitable bracket in delimiter: \"$_[1]\"",
172 0;
173 }
174 my $posbug = pos;
175 $ldel = join('|', map { quotemeta $_ } split('', $ldel));
176 $rdel = join('|', map { quotemeta $_ } split('', $rdel));
177 pos = $posbug;
178
179 my $startpos = pos $$textref || 0;
180 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
181
182 return _fail ($wantarray, $textref) unless @match;
183
184 return _succeed ( $wantarray, $textref,
185 $match[2], $match[5]+2, # MATCH
186 @match[8,9], # REMAINDER
187 @match[0,1], # PREFIX
188 );
189}
190
191sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
192{
193 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
194 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
195 unless ($$textref =~ m/\G$pre/gc)
196 {
197 _failmsg "Did not find prefix: /$pre/", $startpos;
198 return;
199 }
200
201 $ldelpos = pos $$textref;
202
203 unless ($$textref =~ m/\G($ldel)/gc)
204 {
205 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
206 pos $$textref;
207 pos $$textref = $startpos;
208 return;
209 }
210
211 my @nesting = ( $1 );
212 my $textlen = length $$textref;
213 while (pos $$textref < $textlen)
214 {
215 next if $$textref =~ m/\G\\./gcs;
216
217 if ($$textref =~ m/\G($ldel)/gc)
218 {
219 push @nesting, $1;
220 }
221 elsif ($$textref =~ m/\G($rdel)/gc)
222 {
223 my ($found, $brackettype) = ($1, $1);
224 if ($#nesting < 0)
225 {
226 _failmsg "Unmatched closing bracket: \"$found\"",
227 pos $$textref;
228 pos $$textref = $startpos;
229 return;
230 }
231 my $expected = pop(@nesting);
232 $expected =~ tr/({[</)}]>/;
233 if ($expected ne $brackettype)
234 {
235 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
236 pos $$textref;
237 pos $$textref = $startpos;
238 return;
239 }
240 last if $#nesting < 0;
241 }
242 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
243 {
9686a75b 244 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
3270c621 245 _failmsg "Unmatched embedded quote ($1)",
246 pos $$textref;
247 pos $$textref = $startpos;
248 return;
249 }
250 elsif ($quotelike && _match_quotelike($textref,"",1,0))
251 {
252 next;
253 }
254
255 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
256 }
257 if ($#nesting>=0)
258 {
259 _failmsg "Unmatched opening bracket(s): "
260 . join("..",@nesting)."..",
261 pos $$textref;
262 pos $$textref = $startpos;
263 return;
264 }
265
266 $endpos = pos $$textref;
267
268 return (
269 $startpos, $ldelpos-$startpos, # PREFIX
270 $ldelpos, 1, # OPENING BRACKET
271 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
272 $endpos-1, 1, # CLOSING BRACKET
273 $endpos, length($$textref)-$endpos, # REMAINDER
274 );
275}
276
277sub revbracket($)
278{
279 my $brack = reverse $_[0];
280 $brack =~ tr/[({</])}>/;
281 return $brack;
282}
283
284my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
285
286sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
287{
288 my $textref = defined $_[0] ? \$_[0] : \$_;
289 my $ldel = $_[1];
290 my $rdel = $_[2];
291 my $pre = defined $_[3] ? $_[3] : '\s*';
292 my %options = defined $_[4] ? %{$_[4]} : ();
293 my $omode = defined $options{fail} ? $options{fail} : '';
294 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
295 : defined($options{reject}) ? $options{reject}
296 : ''
297 ;
298 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
299 : defined($options{ignore}) ? $options{ignore}
300 : ''
301 ;
302
303 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
304 $@ = undef;
305
306 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
307
308 return _fail(wantarray, $textref) unless @match;
309 return _succeed wantarray, $textref,
310 $match[2], $match[3]+$match[5]+$match[7], # MATCH
311 @match[8..9,0..1,2..7]; # REM, PRE, BITS
312}
313
314sub _match_tagged # ($$$$$$$)
315{
316 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
317 my $rdelspec;
318
319 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
320
321 unless ($$textref =~ m/\G($pre)/gc)
322 {
323 _failmsg "Did not find prefix: /$pre/", pos $$textref;
324 goto failed;
325 }
326
327 $opentagpos = pos($$textref);
328
329 unless ($$textref =~ m/\G$ldel/gc)
330 {
331 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
332 goto failed;
333 }
334
335 $textpos = pos($$textref);
336
337 if (!defined $rdel)
338 {
48f821bf 339 $rdelspec = $&;
3270c621 340 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
341 {
342 _failmsg "Unable to construct closing tag to match: $rdel",
343 pos $$textref;
344 goto failed;
345 }
346 }
347 else
348 {
48f821bf 349 $rdelspec = eval "qq{$rdel}" || do {
350 my $del;
351 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
352 { next if $rdel =~ /\Q$_/; $del = $_; last }
353 unless ($del) {
75c4c974 354 croak ("Can't interpolate right delimiter $rdel")
48f821bf 355 }
356 eval "qq$del$rdel$del";
357 };
3270c621 358 }
359
360 while (pos($$textref) < length($$textref))
361 {
362 next if $$textref =~ m/\G\\./gc;
363
364 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
365 {
366 $parapos = pos($$textref) - length($1)
367 unless defined $parapos;
368 }
369 elsif ($$textref =~ m/\G($rdelspec)/gc )
370 {
371 $closetagpos = pos($$textref)-length($1);
372 goto matched;
373 }
374 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
375 {
376 next;
377 }
378 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
379 {
380 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
381 goto short if ($omode eq 'PARA' || $omode eq 'MAX');
382 _failmsg "Found invalid nested tag: $1", pos $$textref;
383 goto failed;
384 }
385 elsif ($$textref =~ m/\G($ldel)/gc)
386 {
387 my $tag = $1;
388 pos($$textref) -= length($tag); # REWIND TO NESTED TAG
389 unless (_match_tagged(@_)) # MATCH NESTED TAG
390 {
391 goto short if $omode eq 'PARA' || $omode eq 'MAX';
392 _failmsg "Found unbalanced nested tag: $tag",
393 pos $$textref;
394 goto failed;
395 }
396 }
397 else { $$textref =~ m/./gcs }
398 }
399
400short:
401 $closetagpos = pos($$textref);
402 goto matched if $omode eq 'MAX';
403 goto failed unless $omode eq 'PARA';
404
405 if (defined $parapos) { pos($$textref) = $parapos }
406 else { $parapos = pos($$textref) }
407
408 return (
409 $startpos, $opentagpos-$startpos, # PREFIX
410 $opentagpos, $textpos-$opentagpos, # OPENING TAG
411 $textpos, $parapos-$textpos, # TEXT
412 $parapos, 0, # NO CLOSING TAG
413 $parapos, length($$textref)-$parapos, # REMAINDER
414 );
415
416matched:
417 $endpos = pos($$textref);
418 return (
419 $startpos, $opentagpos-$startpos, # PREFIX
420 $opentagpos, $textpos-$opentagpos, # OPENING TAG
421 $textpos, $closetagpos-$textpos, # TEXT
422 $closetagpos, $endpos-$closetagpos, # CLOSING TAG
423 $endpos, length($$textref)-$endpos, # REMAINDER
424 );
425
426failed:
427 _failmsg "Did not find closing tag", pos $$textref unless $@;
428 pos($$textref) = $startpos;
429 return;
430}
431
432sub extract_variable (;$$)
433{
434 my $textref = defined $_[0] ? \$_[0] : \$_;
435 return ("","","") unless defined $$textref;
436 my $pre = defined $_[1] ? $_[1] : '\s*';
437
438 my @match = _match_variable($textref,$pre);
439
440 return _fail wantarray, $textref unless @match;
441
442 return _succeed wantarray, $textref,
443 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
444}
445
446sub _match_variable($$)
447{
a7602084 448# $#
449# $^
450# $$
3270c621 451 my ($textref, $pre) = @_;
452 my $startpos = pos($$textref) = pos($$textref)||0;
453 unless ($$textref =~ m/\G($pre)/gc)
454 {
455 _failmsg "Did not find prefix: /$pre/", pos $$textref;
456 return;
457 }
458 my $varpos = pos($$textref);
48f821bf 459 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
3270c621 460 {
a7602084 461 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
462 {
3270c621 463 _failmsg "Did not find leading dereferencer", pos $$textref;
464 pos $$textref = $startpos;
465 return;
a7602084 466 }
467 my $deref = $1;
3270c621 468
a7602084 469 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
470 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
471 or $deref eq '$#' or $deref eq '$$' )
472 {
3270c621 473 _failmsg "Bad identifier after dereferencer", pos $$textref;
474 pos $$textref = $startpos;
475 return;
a7602084 476 }
3270c621 477 }
478
479 while (1)
480 {
48f821bf 481 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
3270c621 482 next if _match_codeblock($textref,
2f250b7c 483 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
3270c621 484 qr/[({[]/, qr/[)}\]]/,
485 qr/[({[]/, qr/[)}\]]/, 0);
486 next if _match_codeblock($textref,
487 qr/\s*/, qr/[{[]/, qr/[}\]]/,
488 qr/[{[]/, qr/[}\]]/, 0);
489 next if _match_variable($textref,'\s*->\s*');
490 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
491 last;
492 }
493
494 my $endpos = pos($$textref);
495 return ($startpos, $varpos-$startpos,
496 $varpos, $endpos-$varpos,
497 $endpos, length($$textref)-$endpos
498 );
499}
500
501sub extract_codeblock (;$$$$$)
502{
503 my $textref = defined $_[0] ? \$_[0] : \$_;
504 my $wantarray = wantarray;
505 my $ldel_inner = defined $_[1] ? $_[1] : '{';
506 my $pre = defined $_[2] ? $_[2] : '\s*';
507 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
508 my $rd = $_[4];
509 my $rdel_inner = $ldel_inner;
510 my $rdel_outer = $ldel_outer;
511 my $posbug = pos;
512 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
513 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
514 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
515 {
516 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
517 }
518 pos = $posbug;
519
520 my @match = _match_codeblock($textref, $pre,
521 $ldel_outer, $rdel_outer,
522 $ldel_inner, $rdel_inner,
523 $rd);
524 return _fail($wantarray, $textref) unless @match;
525 return _succeed($wantarray, $textref,
526 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
527 );
528
529}
530
531sub _match_codeblock($$$$$$$)
532{
533 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
534 my $startpos = pos($$textref) = pos($$textref) || 0;
535 unless ($$textref =~ m/\G($pre)/gc)
536 {
537 _failmsg qq{Did not match prefix /$pre/ at"} .
538 substr($$textref,pos($$textref),20) .
539 q{..."},
540 pos $$textref;
541 return;
542 }
543 my $codepos = pos($$textref);
544 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
545 {
546 _failmsg qq{Did not find expected opening bracket at "} .
547 substr($$textref,pos($$textref),20) .
548 q{..."},
549 pos $$textref;
550 pos $$textref = $startpos;
551 return;
552 }
553 my $closing = $1;
554 $closing =~ tr/([<{/)]>}/;
555 my $matched;
556 my $patvalid = 1;
557 while (pos($$textref) < length($$textref))
558 {
559 $matched = '';
560 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
561 {
562 $patvalid = 0;
563 next;
564 }
565
566 if ($$textref =~ m/\G\s*#.*/gc)
567 {
568 next;
569 }
570
571 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
572 {
573 unless ($matched = ($closing && $1 eq $closing) )
574 {
575 next if $1 eq '>'; # MIGHT BE A "LESS THAN"
576 _failmsg q{Mismatched closing bracket at "} .
577 substr($$textref,pos($$textref),20) .
578 qq{...". Expected '$closing'},
579 pos $$textref;
580 }
581 last;
582 }
583
584 if (_match_variable($textref,'\s*') ||
585 _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
586 {
587 $patvalid = 0;
588 next;
589 }
590
591
592 # NEED TO COVER MANY MORE CASES HERE!!!
ce940696 593 # NB 'case' is included here, because in Switch.pm,
594 # it's followed by a term, not an op
595
48f821bf 596 if ($$textref =~ m#\G\s*(?!$ldel_inner)
597 ( [-+*x/%^&|.]=?
55a1c97c 598 | [!=]~
3270c621 599 | =(?!>)
600 | (\*\*|&&|\|\||<<|>>)=?
ce940696 601 | case|split|grep|map|return
48f821bf 602 | [([]
3270c621 603 )#gcx)
604 {
605 $patvalid = 1;
606 next;
607 }
608
609 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
610 {
611 $patvalid = 1;
612 next;
613 }
614
615 if ($$textref =~ m/\G\s*$ldel_outer/gc)
616 {
617 _failmsg q{Improperly nested codeblock at "} .
618 substr($$textref,pos($$textref),20) .
619 q{..."},
620 pos $$textref;
621 last;
622 }
623
624 $patvalid = 0;
625 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
626 }
627 continue { $@ = undef }
628
629 unless ($matched)
630 {
631 _failmsg 'No match found for opening bracket', pos $$textref
632 unless $@;
633 return;
634 }
635
636 my $endpos = pos($$textref);
637 return ( $startpos, $codepos-$startpos,
638 $codepos, $endpos-$codepos,
639 $endpos, length($$textref)-$endpos,
640 );
641}
642
643
644my %mods = (
645 'none' => '[cgimsox]*',
646 'm' => '[cgimsox]*',
647 's' => '[cegimsox]*',
648 'tr' => '[cds]*',
649 'y' => '[cds]*',
650 'qq' => '',
651 'qx' => '',
652 'qw' => '',
653 'qr' => '[imsx]*',
654 'q' => '',
655 );
656
657sub extract_quotelike (;$$)
658{
659 my $textref = $_[0] ? \$_[0] : \$_;
660 my $wantarray = wantarray;
661 my $pre = defined $_[1] ? $_[1] : '\s*';
662
663 my @match = _match_quotelike($textref,$pre,1,0);
664 return _fail($wantarray, $textref) unless @match;
665 return _succeed($wantarray, $textref,
666 $match[2], $match[18]-$match[2], # MATCH
667 @match[18,19], # REMAINDER
668 @match[0,1], # PREFIX
669 @match[2..17], # THE BITS
670 @match[20,21], # ANY FILLET?
671 );
672};
673
674sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
675{
676 my ($textref, $pre, $rawmatch, $qmark) = @_;
677
678 my ($textlen,$startpos,
679 $oppos,
680 $preld1pos,$ld1pos,$str1pos,$rd1pos,
681 $preld2pos,$ld2pos,$str2pos,$rd2pos,
682 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
683
684 unless ($$textref =~ m/\G($pre)/gc)
685 {
686 _failmsg qq{Did not find prefix /$pre/ at "} .
687 substr($$textref, pos($$textref), 20) .
688 q{..."},
689 pos $$textref;
690 return;
691 }
692 $oppos = pos($$textref);
693
694 my $initial = substr($$textref,$oppos,1);
695
696 if ($initial && $initial =~ m|^[\"\'\`]|
697 || $rawmatch && $initial =~ m|^/|
698 || $qmark && $initial =~ m|^\?|)
699 {
9686a75b 700 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
3270c621 701 {
702 _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
703 substr($$textref, $oppos, 20) .
704 q{..."},
705 pos $$textref;
706 pos $$textref = $startpos;
707 return;
708 }
709 $modpos= pos($$textref);
710 $rd1pos = $modpos-1;
711
712 if ($initial eq '/' || $initial eq '?')
713 {
714 $$textref =~ m/\G$mods{none}/gc
715 }
716
717 my $endpos = pos($$textref);
718 return (
719 $startpos, $oppos-$startpos, # PREFIX
720 $oppos, 0, # NO OPERATOR
721 $oppos, 1, # LEFT DEL
722 $oppos+1, $rd1pos-$oppos-1, # STR/PAT
723 $rd1pos, 1, # RIGHT DEL
724 $modpos, 0, # NO 2ND LDEL
725 $modpos, 0, # NO 2ND STR
726 $modpos, 0, # NO 2ND RDEL
727 $modpos, $endpos-$modpos, # MODIFIERS
728 $endpos, $textlen-$endpos, # REMAINDER
729 );
730 }
731
48f821bf 732 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
3270c621 733 {
734 _failmsg q{No quotelike operator found after prefix at "} .
735 substr($$textref, pos($$textref), 20) .
736 q{..."},
737 pos $$textref;
738 pos $$textref = $startpos;
739 return;
740 }
741
742 my $op = $1;
743 $preld1pos = pos($$textref);
744 if ($op eq '<<') {
745 $ld1pos = pos($$textref);
746 my $label;
747 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
748 $label = $1;
749 }
750 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
751 | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
752 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
9686a75b 753 }gcsx) {
3270c621 754 $label = $+;
755 }
756 else {
757 $label = "";
758 }
759 my $extrapos = pos($$textref);
760 $$textref =~ m{.*\n}gc;
ce3ac4b6 761 $str1pos = pos($$textref)--;
762 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
3270c621 763 _failmsg qq{Missing here doc terminator ('$label') after "} .
764 substr($$textref, $startpos, 20) .
765 q{..."},
766 pos $$textref;
767 pos $$textref = $startpos;
768 return;
769 }
770 $rd1pos = pos($$textref);
ce3ac4b6 771 $$textref =~ m{\Q$label\E\n}gc;
3270c621 772 $ld2pos = pos($$textref);
773 return (
774 $startpos, $oppos-$startpos, # PREFIX
775 $oppos, length($op), # OPERATOR
776 $ld1pos, $extrapos-$ld1pos, # LEFT DEL
777 $str1pos, $rd1pos-$str1pos, # STR/PAT
778 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
779 $ld2pos, 0, # NO 2ND LDEL
780 $ld2pos, 0, # NO 2ND STR
781 $ld2pos, 0, # NO 2ND RDEL
782 $ld2pos, 0, # NO MODIFIERS
783 $ld2pos, $textlen-$ld2pos, # REMAINDER
784 $extrapos, $str1pos-$extrapos, # FILLETED BIT
785 );
786 }
787
788 $$textref =~ m/\G\s*/gc;
789 $ld1pos = pos($$textref);
790 $str1pos = $ld1pos+1;
791
792 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
793 {
794 _failmsg "No block delimiter found after quotelike $op",
795 pos $$textref;
796 pos $$textref = $startpos;
797 return;
798 }
799 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
800 my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
801 if ($ldel1 =~ /[[(<{]/)
802 {
803 $rdel1 =~ tr/[({</])}>/;
ce3ac4b6 804 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
3270c621 805 || do { pos $$textref = $startpos; return };
806 }
807 else
808 {
9686a75b 809 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
3270c621 810 || do { pos $$textref = $startpos; return };
811 }
812 $ld2pos = $rd1pos = pos($$textref)-1;
813
814 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
815 if ($second_arg)
816 {
817 my ($ldel2, $rdel2);
818 if ($ldel1 =~ /[[(<{]/)
819 {
820 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
821 {
822 _failmsg "Missing second block for quotelike $op",
823 pos $$textref;
824 pos $$textref = $startpos;
825 return;
826 }
827 $ldel2 = $rdel2 = "\Q$1";
828 $rdel2 =~ tr/[({</])}>/;
829 }
830 else
831 {
832 $ldel2 = $rdel2 = $ldel1;
833 }
834 $str2pos = $ld2pos+1;
835
836 if ($ldel2 =~ /[[(<{]/)
837 {
838 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
ce3ac4b6 839 defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
3270c621 840 || do { pos $$textref = $startpos; return };
841 }
842 else
843 {
9686a75b 844 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
3270c621 845 || do { pos $$textref = $startpos; return };
846 }
847 $rd2pos = pos($$textref)-1;
848 }
849 else
850 {
851 $ld2pos = $str2pos = $rd2pos = $rd1pos;
852 }
853
854 $modpos = pos $$textref;
855
856 $$textref =~ m/\G($mods{$op})/gc;
857 my $endpos = pos $$textref;
858
859 return (
860 $startpos, $oppos-$startpos, # PREFIX
861 $oppos, length($op), # OPERATOR
862 $ld1pos, 1, # LEFT DEL
863 $str1pos, $rd1pos-$str1pos, # STR/PAT
864 $rd1pos, 1, # RIGHT DEL
865 $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
866 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
867 $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
868 $modpos, $endpos-$modpos, # MODIFIERS
869 $endpos, $textlen-$endpos, # REMAINDER
870 );
871}
872
873my $def_func =
874[
875 sub { extract_variable($_[0], '') },
876 sub { extract_quotelike($_[0],'') },
877 sub { extract_codeblock($_[0],'{}','') },
878];
879
880sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
881{
882 my $textref = defined($_[0]) ? \$_[0] : \$_;
883 my $posbug = pos;
884 my ($lastpos, $firstpos);
885 my @fields = ();
886
a7602084 887 #for ($$textref)
3270c621 888 {
889 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
890 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
891 my $igunk = $_[3];
892
a7602084 893 pos $$textref ||= 0;
3270c621 894
895 unless (wantarray)
896 {
75c4c974 897 carp ("extract_multiple reset maximal count to 1 in scalar context")
3270c621 898 if $^W && defined($_[2]) && $max > 1;
899 $max = 1
900 }
901
902 my $unkpos;
903 my $func;
904 my $class;
905
906 my @class;
907 foreach $func ( @func )
908 {
909 if (ref($func) eq 'HASH')
910 {
911 push @class, (keys %$func)[0];
912 $func = (values %$func)[0];
913 }
914 else
915 {
916 push @class, undef;
917 }
918 }
919
a7602084 920 FIELD: while (pos($$textref) < length($$textref))
3270c621 921 {
48f821bf 922 my ($field, $rem);
a7602084 923 my @bits;
3270c621 924 foreach my $i ( 0..$#func )
925 {
a7602084 926 my $pref;
3270c621 927 $func = $func[$i];
928 $class = $class[$i];
a7602084 929 $lastpos = pos $$textref;
3270c621 930 if (ref($func) eq 'CODE')
48f821bf 931 { ($field,$rem,$pref) = @bits = $func->($$textref);
932 # print "[$field|$rem]" if $field;
933 }
3270c621 934 elsif (ref($func) eq 'Text::Balanced::Extractor')
a7602084 935 { @bits = $field = $func->extract($$textref) }
936 elsif( $$textref =~ m/\G$func/gc )
48f821bf 937 { @bits = $field = defined($1) ? $1 : $& }
a7602084 938 $pref ||= "";
3270c621 939 if (defined($field) && length($field))
940 {
a7602084 941 if (!$igunk) {
ce3ac4b6 942 $unkpos = $lastpos
a7602084 943 if length($pref) && !defined($unkpos);
944 if (defined $unkpos)
945 {
946 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
947 $firstpos = $unkpos unless defined $firstpos;
948 undef $unkpos;
949 last FIELD if @fields == $max;
950 }
3270c621 951 }
a7602084 952 push @fields, $class
953 ? bless (\$field, $class)
3270c621 954 : $field;
955 $firstpos = $lastpos unless defined $firstpos;
a7602084 956 $lastpos = pos $$textref;
3270c621 957 last FIELD if @fields == $max;
958 next FIELD;
959 }
960 }
a7602084 961 if ($$textref =~ /\G(.)/gcs)
3270c621 962 {
a7602084 963 $unkpos = pos($$textref)-1
3270c621 964 unless $igunk || defined $unkpos;
965 }
966 }
967
968 if (defined $unkpos)
969 {
a7602084 970 push @fields, substr($$textref, $unkpos);
3270c621 971 $firstpos = $unkpos unless defined $firstpos;
a7602084 972 $lastpos = length $$textref;
3270c621 973 }
974 last;
975 }
976
977 pos $$textref = $lastpos;
978 return @fields if wantarray;
979
980 $firstpos ||= 0;
981 eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
982 pos $$textref = $firstpos };
983 return $fields[0];
984}
985
986
987sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
988{
989 my $ldel = $_[0];
990 my $rdel = $_[1];
991 my $pre = defined $_[2] ? $_[2] : '\s*';
992 my %options = defined $_[3] ? %{$_[3]} : ();
993 my $omode = defined $options{fail} ? $options{fail} : '';
994 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
995 : defined($options{reject}) ? $options{reject}
996 : ''
997 ;
998 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
999 : defined($options{ignore}) ? $options{ignore}
1000 : ''
1001 ;
1002
1003 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
1004
1005 my $posbug = pos;
1006 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1007 pos = $posbug;
1008
1009 my $closure = sub
1010 {
1011 my $textref = defined $_[0] ? \$_[0] : \$_;
1012 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1013
1014 return _fail(wantarray, $textref) unless @match;
1015 return _succeed wantarray, $textref,
1016 $match[2], $match[3]+$match[5]+$match[7], # MATCH
1017 @match[8..9,0..1,2..7]; # REM, PRE, BITS
1018 };
1019
1020 bless $closure, 'Text::Balanced::Extractor';
1021}
1022
1023package Text::Balanced::Extractor;
1024
1025sub extract($$) # ($self, $text)
1026{
1027 &{$_[0]}($_[1]);
1028}
1029
1030package Text::Balanced::ErrorMsg;
1031
1032use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1033
10341;
55a1c97c 1035
1036__END__
1037
1038=head1 NAME
1039
1040Text::Balanced - Extract delimited text sequences from strings.
1041
1042
1043=head1 SYNOPSIS
1044
1045 use Text::Balanced qw (
1046 extract_delimited
1047 extract_bracketed
1048 extract_quotelike
1049 extract_codeblock
1050 extract_variable
1051 extract_tagged
1052 extract_multiple
1053
1054 gen_delimited_pat
1055 gen_extract_tagged
1056 );
1057
1058 # Extract the initial substring of $text that is delimited by
1059 # two (unescaped) instances of the first character in $delim.
1060
1061 ($extracted, $remainder) = extract_delimited($text,$delim);
1062
1063
1064 # Extract the initial substring of $text that is bracketed
1065 # with a delimiter(s) specified by $delim (where the string
1066 # in $delim contains one or more of '(){}[]<>').
1067
1068 ($extracted, $remainder) = extract_bracketed($text,$delim);
1069
1070
1071 # Extract the initial substring of $text that is bounded by
48f821bf 1072 # an XML tag.
55a1c97c 1073
1074 ($extracted, $remainder) = extract_tagged($text);
1075
1076
1077 # Extract the initial substring of $text that is bounded by
1078 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1079
1080 ($extracted, $remainder) =
1081 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1082
1083
1084 # Extract the initial substring of $text that represents a
1085 # Perl "quote or quote-like operation"
1086
1087 ($extracted, $remainder) = extract_quotelike($text);
1088
1089
1090 # Extract the initial substring of $text that represents a block
1091 # of Perl code, bracketed by any of character(s) specified by $delim
1092 # (where the string $delim contains one or more of '(){}[]<>').
1093
1094 ($extracted, $remainder) = extract_codeblock($text,$delim);
1095
1096
1097 # Extract the initial substrings of $text that would be extracted by
1098 # one or more sequential applications of the specified functions
1099 # or regular expressions
1100
1101 @extracted = extract_multiple($text,
1102 [ \&extract_bracketed,
1103 \&extract_quotelike,
1104 \&some_other_extractor_sub,
1105 qr/[xyz]*/,
1106 'literal',
1107 ]);
1108
1109# Create a string representing an optimized pattern (a la Friedl)
1110# that matches a substring delimited by any of the specified characters
1111# (in this case: any type of quote or a slash)
1112
1113 $patstring = gen_delimited_pat(q{'"`/});
1114
1115
1116# Generate a reference to an anonymous sub that is just like extract_tagged
1117# but pre-compiled and optimized for a specific pair of tags, and consequently
1118# much faster (i.e. 3 times faster). It uses qr// for better performance on
1119# repeated calls, so it only works under Perl 5.005 or later.
1120
1121 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1122
1123 ($extracted, $remainder) = $extract_head->($text);
1124
1125
1126=head1 DESCRIPTION
1127
48f821bf 1128The various C<extract_...> subroutines may be used to
1129extract a delimited substring, possibly after skipping a
1130specified prefix string. By default, that prefix is
1131optional whitespace (C</\s*/>), but you can change it to whatever
1132you wish (see below).
1133
1134The substring to be extracted must appear at the
1135current C<pos> location of the string's variable
1136(or at index zero, if no C<pos> position is defined).
1137In other words, the C<extract_...> subroutines I<don't>
3c4b39be 1138extract the first occurrence of a substring anywhere
48f821bf 1139in a string (like an unanchored regex would). Rather,
3c4b39be 1140they extract an occurrence of the substring appearing
48f821bf 1141immediately at the current matching position in the
1142string (like a C<\G>-anchored regex would).
1143
1144
55a1c97c 1145
1146=head2 General behaviour in list contexts
1147
1148In a list context, all the subroutines return a list, the first three
1149elements of which are always:
1150
1151=over 4
1152
1153=item [0]
1154
1155The extracted string, including the specified delimiters.
1156If the extraction fails an empty string is returned.
1157
1158=item [1]
1159
1160The remainder of the input string (i.e. the characters after the
1161extracted string). On failure, the entire string is returned.
1162
1163=item [2]
1164
1165The skipped prefix (i.e. the characters before the extracted string).
1166On failure, the empty string is returned.
1167
1168=back
1169
1170Note that in a list context, the contents of the original input text (the first
1171argument) are not modified in any way.
1172
1173However, if the input text was passed in a variable, that variable's
1174C<pos> value is updated to point at the first character after the
1175extracted text. That means that in a list context the various
1176subroutines can be used much like regular expressions. For example:
1177
1178 while ( $next = (extract_quotelike($text))[0] )
1179 {
1180 # process next quote-like (in $next)
1181 }
1182
1183
1184=head2 General behaviour in scalar and void contexts
1185
1186In a scalar context, the extracted string is returned, having first been
1187removed from the input text. Thus, the following code also processes
1188each quote-like operation, but actually removes them from $text:
1189
1190 while ( $next = extract_quotelike($text) )
1191 {
1192 # process next quote-like (in $next)
1193 }
1194
1195Note that if the input text is a read-only string (i.e. a literal),
1196no attempt is made to remove the extracted text.
1197
1198In a void context the behaviour of the extraction subroutines is
1199exactly the same as in a scalar context, except (of course) that the
1200extracted substring is not returned.
1201
1202=head2 A note about prefixes
1203
1204Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1205This can bite you if you're expecting a prefix specification like
1206'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1207pattern will only succeed if the <H1> tag is on the current line, since
1208. normally doesn't match newlines.
1209
1210To overcome this limitation, you need to turn on /s matching within
1211the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1212
1213
1214=head2 C<extract_delimited>
1215
1216The C<extract_delimited> function formalizes the common idiom
1217of extracting a single-character-delimited substring from the start of
1218a string. For example, to extract a single-quote delimited string, the
1219following code is typically used:
1220
1221 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1222 $extracted = $1;
1223
1224but with C<extract_delimited> it can be simplified to:
1225
1226 ($extracted,$remainder) = extract_delimited($text, "'");
1227
1228C<extract_delimited> takes up to four scalars (the input text, the
1229delimiters, a prefix pattern to be skipped, and any escape characters)
1230and extracts the initial substring of the text that
1231is appropriately delimited. If the delimiter string has multiple
1232characters, the first one encountered in the text is taken to delimit
1233the substring.
1234The third argument specifies a prefix pattern that is to be skipped
1235(but must be present!) before the substring is extracted.
1236The final argument specifies the escape character to be used for each
1237delimiter.
1238
1239All arguments are optional. If the escape characters are not specified,
1240every delimiter is escaped with a backslash (C<\>).
1241If the prefix is not specified, the
1242pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1243is also not specified, the set C</["'`]/> is used. If the text to be processed
1244is not specified either, C<$_> is used.
1245
48f821bf 1246In list context, C<extract_delimited> returns a array of three
55a1c97c 1247elements, the extracted substring (I<including the surrounding
1248delimiters>), the remainder of the text, and the skipped prefix (if
1249any). If a suitable delimited substring is not found, the first
1250element of the array is the empty string, the second is the complete
1251original text, and the prefix returned in the third element is an
1252empty string.
1253
1254In a scalar context, just the extracted substring is returned. In
1255a void context, the extracted substring (and any prefix) are simply
1256removed from the beginning of the first argument.
1257
1258Examples:
1259
1260 # Remove a single-quoted substring from the very beginning of $text:
1261
1262 $substring = extract_delimited($text, "'", '');
1263
1264 # Remove a single-quoted Pascalish substring (i.e. one in which
1265 # doubling the quote character escapes it) from the very
1266 # beginning of $text:
1267
1268 $substring = extract_delimited($text, "'", '', "'");
1269
1270 # Extract a single- or double- quoted substring from the
1271 # beginning of $text, optionally after some whitespace
1272 # (note the list context to protect $text from modification):
1273
1274 ($substring) = extract_delimited $text, q{"'};
1275
1276
1277 # Delete the substring delimited by the first '/' in $text:
1278
1279 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1280
1281Note that this last example is I<not> the same as deleting the first
1282quote-like pattern. For instance, if C<$text> contained the string:
1283
1284 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1285
1286then after the deletion it would contain:
1287
1288 "if ('.$UNIXCMD/s) { $cmd = $1; }"
1289
1290not:
1291
1292 "if ('./cmd' =~ ms) { $cmd = $1; }"
1293
1294
1295See L<"extract_quotelike"> for a (partial) solution to this problem.
1296
1297
1298=head2 C<extract_bracketed>
1299
1300Like C<"extract_delimited">, the C<extract_bracketed> function takes
1301up to three optional scalar arguments: a string to extract from, a delimiter
1302specifier, and a prefix pattern. As before, a missing prefix defaults to
1303optional whitespace and a missing text defaults to C<$_>. However, a missing
1304delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1305
1306C<extract_bracketed> extracts a balanced-bracket-delimited
1307substring (using any one (or more) of the user-specified delimiter
1308brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1309respect quoted unbalanced brackets (see below).
1310
1311A "delimiter bracket" is a bracket in list of delimiters passed as
1312C<extract_bracketed>'s second argument. Delimiter brackets are
1313specified by giving either the left or right (or both!) versions
1314of the required bracket(s). Note that the order in which
1315two or more delimiter brackets are specified is not significant.
1316
1317A "balanced-bracket-delimited substring" is a substring bounded by
1318matched brackets, such that any other (left or right) delimiter
1319bracket I<within> the substring is also matched by an opposite
1320(right or left) delimiter bracket I<at the same level of nesting>. Any
1321type of bracket not in the delimiter list is treated as an ordinary
1322character.
1323
1324In other words, each type of bracket specified as a delimiter must be
1325balanced and correctly nested within the substring, and any other kind of
1326("non-delimiter") bracket in the substring is ignored.
1327
1328For example, given the string:
1329
1330 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1331
1332then a call to C<extract_bracketed> in a list context:
1333
1334 @result = extract_bracketed( $text, '{}' );
1335
1336would return:
1337
1338 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1339
1340since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1341(In a scalar context just the first element of the array would be returned. In
1342a void context, C<$text> would be replaced by an empty string.)
1343
1344Likewise the call in:
1345
1346 @result = extract_bracketed( $text, '{[' );
1347
1348would return the same result, since all sets of both types of specified
1349delimiter brackets are correctly nested and balanced.
1350
1351However, the call in:
1352
1353 @result = extract_bracketed( $text, '{([<' );
1354
1355would fail, returning:
1356
1357 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1358
1359because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1360the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1361return an empty string. In a void context, C<$text> would be unchanged.)
1362
1363Note that the embedded single-quotes in the string don't help in this
1364case, since they have not been specified as acceptable delimiters and are
1365therefore treated as non-delimiter characters (and ignored).
1366
1367However, if a particular species of quote character is included in the
1368delimiter specification, then that type of quote will be correctly handled.
1369for example, if C<$text> is:
1370
1371 $text = '<A HREF=">>>>">link</A>';
1372
1373then
1374
1375 @result = extract_bracketed( $text, '<">' );
1376
1377returns:
1378
1379 ( '<A HREF=">>>>">', 'link</A>', "" )
1380
1381as expected. Without the specification of C<"> as an embedded quoter:
1382
1383 @result = extract_bracketed( $text, '<>' );
1384
1385the result would be:
1386
1387 ( '<A HREF=">', '>>>">link</A>', "" )
1388
1389In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1390quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1391letter 'q' as a delimiter. Hence:
1392
1393 @result = extract_bracketed( $text, '<q>' );
1394
1395would correctly match something like this:
1396
1397 $text = '<leftop: conj /and/ conj>';
1398
1399See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1400
1401
48f821bf 1402=head2 C<extract_variable>
1403
1404C<extract_variable> extracts any valid Perl variable or
1405variable-involved expression, including scalars, arrays, hashes, array
3c4b39be 1406accesses, hash look-ups, method calls through objects, subroutine calls
48f821bf 1407through subroutine references, etc.
1408
1409The subroutine takes up to two optional arguments:
1410
1411=over 4
1412
1413=item 1.
1414
1415A string to be processed (C<$_> if the string is omitted or C<undef>)
1416
1417=item 2.
1418
1419A string specifying a pattern to be matched as a prefix (which is to be
1420skipped). If omitted, optional whitespace is skipped.
1421
1422=back
1423
1424On success in a list context, an array of 3 elements is returned. The
1425elements are:
1426
1427=over 4
1428
1429=item [0]
1430
1431the extracted variable, or variablish expression
1432
1433=item [1]
1434
1435the remainder of the input text,
1436
1437=item [2]
1438
1439the prefix substring (if any),
1440
1441=back
1442
1443On failure, all of these values (except the remaining text) are C<undef>.
1444
1445In a scalar context, C<extract_variable> returns just the complete
1446substring that matched a variablish expression. C<undef> is returned on
1447failure. In addition, the original input text has the returned substring
1448(and any prefix) removed from it.
1449
1450In a void context, the input text just has the matched substring (and
1451any specified prefix) removed.
1452
1453
55a1c97c 1454=head2 C<extract_tagged>
1455
1456C<extract_tagged> extracts and segments text between (balanced)
1457specified tags.
1458
1459The subroutine takes up to five optional arguments:
1460
1461=over 4
1462
1463=item 1.
1464
1465A string to be processed (C<$_> if the string is omitted or C<undef>)
1466
1467=item 2.
1468
1469A string specifying a pattern to be matched as the opening tag.
1470If the pattern string is omitted (or C<undef>) then a pattern
48f821bf 1471that matches any standard XML tag is used.
55a1c97c 1472
1473=item 3.
1474
1475A string specifying a pattern to be matched at the closing tag.
1476If the pattern string is omitted (or C<undef>) then the closing
1477tag is constructed by inserting a C</> after any leading bracket
1478characters in the actual opening tag that was matched (I<not> the pattern
1479that matched the tag). For example, if the opening tag pattern
1480is specified as C<'{{\w+}}'> and actually matched the opening tag
1481C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1482
1483=item 4.
1484
1485A string specifying a pattern to be matched as a prefix (which is to be
1486skipped). If omitted, optional whitespace is skipped.
1487
1488=item 5.
1489
1490A hash reference containing various parsing options (see below)
1491
1492=back
1493
1494The various options that can be specified are:
1495
1496=over 4
1497
1498=item C<reject =E<gt> $listref>
1499
1500The list reference contains one or more strings specifying patterns
1501that must I<not> appear within the tagged text.
1502
1503For example, to extract
1504an HTML link (which should not contain nested links) use:
1505
48f821bf 1506 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
55a1c97c 1507
1508=item C<ignore =E<gt> $listref>
1509
1510The list reference contains one or more strings specifying patterns
1511that are I<not> be be treated as nested tags within the tagged text
1512(even if they would match the start tag pattern).
1513
1514For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1515
48f821bf 1516 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
55a1c97c 1517
1518(also see L<"gen_delimited_pat"> below).
1519
1520
1521=item C<fail =E<gt> $str>
1522
1523The C<fail> option indicates the action to be taken if a matching end
1524tag is not encountered (i.e. before the end of the string or some
1525C<reject> pattern matches). By default, a failure to match a closing
1526tag causes C<extract_tagged> to immediately fail.
1527
1528However, if the string value associated with <reject> is "MAX", then
1529C<extract_tagged> returns the complete text up to the point of failure.
1530If the string is "PARA", C<extract_tagged> returns only the first paragraph
1531after the tag (up to the first line that is either empty or contains
1532only whitespace characters).
48f821bf 1533If the string is "", the the default behaviour (i.e. failure) is reinstated.
55a1c97c 1534
1535For example, suppose the start tag "/para" introduces a paragraph, which then
1536continues until the next "/endpara" tag or until another "/para" tag is
1537encountered:
1538
48f821bf 1539 $text = "/para line 1\n\nline 3\n/para line 4";
55a1c97c 1540
48f821bf 1541 extract_tagged($text, '/para', '/endpara', undef,
1542 {reject => '/para', fail => MAX );
55a1c97c 1543
48f821bf 1544 # EXTRACTED: "/para line 1\n\nline 3\n"
55a1c97c 1545
1546Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1547tag refers only to the immediately following paragraph:
1548
48f821bf 1549 $text = "/para line 1\n\nline 3\n/para line 4";
55a1c97c 1550
48f821bf 1551 extract_tagged($text, '/para', '/endpara', undef,
1552 {reject => '/para', fail => MAX );
55a1c97c 1553
48f821bf 1554 # EXTRACTED: "/para line 1\n"
55a1c97c 1555
1556Note that the specified C<fail> behaviour applies to nested tags as well.
1557
1558=back
1559
1560On success in a list context, an array of 6 elements is returned. The elements are:
1561
1562=over 4
1563
1564=item [0]
1565
1566the extracted tagged substring (including the outermost tags),
1567
1568=item [1]
1569
1570the remainder of the input text,
1571
1572=item [2]
1573
1574the prefix substring (if any),
1575
1576=item [3]
1577
1578the opening tag
1579
1580=item [4]
1581
1582the text between the opening and closing tags
1583
1584=item [5]
1585
1586the closing tag (or "" if no closing tag was found)
1587
1588=back
1589
1590On failure, all of these values (except the remaining text) are C<undef>.
1591
1592In a scalar context, C<extract_tagged> returns just the complete
1593substring that matched a tagged text (including the start and end
1594tags). C<undef> is returned on failure. In addition, the original input
1595text has the returned substring (and any prefix) removed from it.
1596
1597In a void context, the input text just has the matched substring (and
1598any specified prefix) removed.
1599
1600
1601=head2 C<gen_extract_tagged>
1602
1603(Note: This subroutine is only available under Perl5.005)
1604
1605C<gen_extract_tagged> generates a new anonymous subroutine which
1606extracts text between (balanced) specified tags. In other words,
1607it generates a function identical in function to C<extract_tagged>.
1608
1609The difference between C<extract_tagged> and the anonymous
1610subroutines generated by
1611C<gen_extract_tagged>, is that those generated subroutines:
1612
1613=over 4
1614
1615=item *
1616
1617do not have to reparse tag specification or parsing options every time
1618they are called (whereas C<extract_tagged> has to effectively rebuild
1619its tag parser on every call);
1620
1621=item *
1622
1623make use of the new qr// construct to pre-compile the regexes they use
1624(whereas C<extract_tagged> uses standard string variable interpolation
1625to create tag-matching patterns).
1626
1627=back
1628
1629The subroutine takes up to four optional arguments (the same set as
1630C<extract_tagged> except for the string to be processed). It returns
1631a reference to a subroutine which in turn takes a single argument (the text to
1632be extracted from).
1633
1634In other words, the implementation of C<extract_tagged> is exactly
1635equivalent to:
1636
48f821bf 1637 sub extract_tagged
1638 {
1639 my $text = shift;
1640 $extractor = gen_extract_tagged(@_);
1641 return $extractor->($text);
1642 }
55a1c97c 1643
1644(although C<extract_tagged> is not currently implemented that way, in order
1645to preserve pre-5.005 compatibility).
1646
1647Using C<gen_extract_tagged> to create extraction functions for specific tags
1648is a good idea if those functions are going to be called more than once, since
1649their performance is typically twice as good as the more general-purpose
1650C<extract_tagged>.
1651
1652
1653=head2 C<extract_quotelike>
1654
1655C<extract_quotelike> attempts to recognize, extract, and segment any
1656one of the various Perl quotes and quotelike operators (see
1657L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1658delimiters (for the quotelike operators), and trailing modifiers are
1659all caught. For example, in:
1660
48f821bf 1661 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1662
1663 extract_quotelike ' "You said, \"Use sed\"." '
55a1c97c 1664
48f821bf 1665 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
55a1c97c 1666
48f821bf 1667 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
55a1c97c 1668
1669the full Perl quotelike operations are all extracted correctly.
1670
1671Note too that, when using the /x modifier on a regex, any comment
1672containing the current pattern delimiter will cause the regex to be
1673immediately terminated. In other words:
1674
48f821bf 1675 'm /
1676 (?i) # CASE INSENSITIVE
1677 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
1678 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1679 /x'
55a1c97c 1680
1681will be extracted as if it were:
1682
48f821bf 1683 'm /
1684 (?i) # CASE INSENSITIVE
1685 [a-z_] # LEADING ALPHABETIC/'
55a1c97c 1686
1687This behaviour is identical to that of the actual compiler.
1688
1689C<extract_quotelike> takes two arguments: the text to be processed and
1690a prefix to be matched at the very beginning of the text. If no prefix
1691is specified, optional whitespace is the default. If no text is given,
1692C<$_> is used.
1693
1694In a list context, an array of 11 elements is returned. The elements are:
1695
1696=over 4
1697
1698=item [0]
1699
1700the extracted quotelike substring (including trailing modifiers),
1701
1702=item [1]
1703
1704the remainder of the input text,
1705
1706=item [2]
1707
1708the prefix substring (if any),
1709
1710=item [3]
1711
1712the name of the quotelike operator (if any),
1713
1714=item [4]
1715
1716the left delimiter of the first block of the operation,
1717
1718=item [5]
1719
1720the text of the first block of the operation
1721(that is, the contents of
1722a quote, the regex of a match or substitution or the target list of a
1723translation),
1724
1725=item [6]
1726
1727the right delimiter of the first block of the operation,
1728
1729=item [7]
1730
1731the left delimiter of the second block of the operation
48f821bf 1732(that is, if it is a C<s>, C<tr>, or C<y>),
55a1c97c 1733
1734=item [8]
1735
1736the text of the second block of the operation
1737(that is, the replacement of a substitution or the translation list
1738of a translation),
1739
1740=item [9]
1741
1742the right delimiter of the second block of the operation (if any),
1743
1744=item [10]
1745
1746the trailing modifiers on the operation (if any).
1747
1748=back
1749
1750For each of the fields marked "(if any)" the default value on success is
1751an empty string.
1752On failure, all of these values (except the remaining text) are C<undef>.
1753
1754
1755In a scalar context, C<extract_quotelike> returns just the complete substring
1756that matched a quotelike operation (or C<undef> on failure). In a scalar or
1757void context, the input text has the same substring (and any specified
1758prefix) removed.
1759
1760Examples:
1761
48f821bf 1762 # Remove the first quotelike literal that appears in text
55a1c97c 1763
48f821bf 1764 $quotelike = extract_quotelike($text,'.*?');
55a1c97c 1765
48f821bf 1766 # Replace one or more leading whitespace-separated quotelike
1767 # literals in $_ with "<QLL>"
55a1c97c 1768
48f821bf 1769 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
55a1c97c 1770
1771
48f821bf 1772 # Isolate the search pattern in a quotelike operation from $text
55a1c97c 1773
48f821bf 1774 ($op,$pat) = (extract_quotelike $text)[3,5];
1775 if ($op =~ /[ms]/)
1776 {
1777 print "search pattern: $pat\n";
1778 }
1779 else
1780 {
1781 print "$op is not a pattern matching operation\n";
1782 }
55a1c97c 1783
1784
1785=head2 C<extract_quotelike> and "here documents"
1786
1787C<extract_quotelike> can successfully extract "here documents" from an input
1788string, but with an important caveat in list contexts.
1789
1790Unlike other types of quote-like literals, a here document is rarely
1791a contiguous substring. For example, a typical piece of code using
1792here document might look like this:
1793
1794 <<'EOMSG' || die;
1795 This is the message.
1796 EOMSG
48f821bf 1797 exit;
55a1c97c 1798
1799Given this as an input string in a scalar context, C<extract_quotelike>
1800would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1801leaving the string " || die;\nexit;" in the original variable. In other words,
1802the two separate pieces of the here document are successfully extracted and
1803concatenated.
1804
1805In a list context, C<extract_quotelike> would return the list
1806
1807=over 4
1808
1809=item [0]
1810
1811"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1812including fore and aft delimiters),
1813
1814=item [1]
1815
1816" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1817
1818=item [2]
1819
1820"" (i.e. the prefix substring -- trivial in this case),
1821
1822=item [3]
1823
1824"<<" (i.e. the "name" of the quotelike operator)
1825
1826=item [4]
1827
1828"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1829
1830=item [5]
1831
1832"This is the message.\n" (i.e. the text of the here document),
1833
1834=item [6]
1835
1836"EOMSG" (i.e. the right delimiter of the here document),
1837
1838=item [7..10]
1839
1840"" (a here document has no second left delimiter, second text, second right
1841delimiter, or trailing modifiers).
1842
1843=back
1844
1845However, the matching position of the input variable would be set to
1846"exit;" (i.e. I<after> the closing delimiter of the here document),
1847which would cause the earlier " || die;\nexit;" to be skipped in any
1848sequence of code fragment extractions.
1849
48f821bf 1850To avoid this problem, when it encounters a here document whilst
55a1c97c 1851extracting from a modifiable string, C<extract_quotelike> silently
1852rearranges the string to an equivalent piece of Perl:
1853
1854 <<'EOMSG'
1855 This is the message.
1856 EOMSG
1857 || die;
48f821bf 1858 exit;
55a1c97c 1859
1860in which the here document I<is> contiguous. It still leaves the
1861matching position after the here document, but now the rest of the line
1862on which the here document starts is not skipped.
1863
1864To prevent <extract_quotelike> from mucking about with the input in this way
1865(this is the only case where a list-context C<extract_quotelike> does so),
1866you can pass the input variable as an interpolated literal:
1867
1868 $quotelike = extract_quotelike("$var");
1869
1870
1871=head2 C<extract_codeblock>
1872
1873C<extract_codeblock> attempts to recognize and extract a balanced
1874bracket delimited substring that may contain unbalanced brackets
1875inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1876is like a combination of C<"extract_bracketed"> and
1877C<"extract_quotelike">.
1878
1879C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1880a text to process, a set of delimiter brackets to look for, and a prefix to
1881match first. It also takes an optional fourth parameter, which allows the
1882outermost delimiter brackets to be specified separately (see below).
1883
1884Omitting the first argument (input text) means process C<$_> instead.
1885Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1886Omitting the third argument (prefix argument) implies optional whitespace at the start.
1887Omitting the fourth argument (outermost delimiter brackets) indicates that the
1888value of the second argument is to be used for the outermost delimiters.
1889
48f821bf 1890Once the prefix an dthe outermost opening delimiter bracket have been
55a1c97c 1891recognized, code blocks are extracted by stepping through the input text and
1892trying the following alternatives in sequence:
1893
1894=over 4
1895
1896=item 1.
1897
1898Try and match a closing delimiter bracket. If the bracket was the same
1899species as the last opening bracket, return the substring to that
1900point. If the bracket was mismatched, return an error.
1901
1902=item 2.
1903
1904Try to match a quote or quotelike operator. If found, call
1905C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1906the error it returned. Otherwise go back to step 1.
1907
1908=item 3.
1909
1910Try to match an opening delimiter bracket. If found, call
1911C<extract_codeblock> recursively to eat the embedded block. If the
1912recursive call fails, return an error. Otherwise, go back to step 1.
1913
1914=item 4.
1915
1916Unconditionally match a bareword or any other single character, and
1917then go back to step 1.
1918
1919=back
1920
1921
1922Examples:
1923
1924 # Find a while loop in the text
1925
1926 if ($text =~ s/.*?while\s*\{/{/)
1927 {
1928 $loop = "while " . extract_codeblock($text);
1929 }
1930
1931 # Remove the first round-bracketed list (which may include
1932 # round- or curly-bracketed code blocks or quotelike operators)
1933
1934 extract_codeblock $text, "(){}", '[^(]*';
1935
1936
1937The ability to specify a different outermost delimiter bracket is useful
1938in some circumstances. For example, in the Parse::RecDescent module,
1939parser actions which are to be performed only on a successful parse
1940are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1941
1942 sentence: subject verb object
1943 <defer: {$::theVerb = $item{verb}} >
1944
1945Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1946within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1947
1948A deferred action like this:
1949
1950 <defer: {if ($count>10) {$count--}} >
1951
1952will be incorrectly parsed as:
1953
1954 <defer: {if ($count>
1955
1956because the "less than" operator is interpreted as a closing delimiter.
1957
1958But, by extracting the directive using
1959S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1960the '>' character is only treated as a delimited at the outermost
1961level of the code block, so the directive is parsed correctly.
1962
1963=head2 C<extract_multiple>
1964
1965The C<extract_multiple> subroutine takes a string to be processed and a
1966list of extractors (subroutines or regular expressions) to apply to that string.
1967
1968In an array context C<extract_multiple> returns an array of substrings
1969of the original string, as extracted by the specified extractors.
1970In a scalar context, C<extract_multiple> returns the first
1971substring successfully extracted from the original string. In both
1972scalar and void contexts the original string has the first successfully
1973extracted substring removed from it. In all contexts
1974C<extract_multiple> starts at the current C<pos> of the string, and
1975sets that C<pos> appropriately after it matches.
1976
48f821bf 1977Hence, the aim of of a call to C<extract_multiple> in a list context
55a1c97c 1978is to split the processed string into as many non-overlapping fields as
1979possible, by repeatedly applying each of the specified extractors
1980to the remainder of the string. Thus C<extract_multiple> is
1981a generalized form of Perl's C<split> subroutine.
1982
1983The subroutine takes up to four optional arguments:
1984
1985=over 4
1986
1987=item 1.
1988
1989A string to be processed (C<$_> if the string is omitted or C<undef>)
1990
1991=item 2.
1992
1993A reference to a list of subroutine references and/or qr// objects and/or
1994literal strings and/or hash references, specifying the extractors
1995to be used to split the string. If this argument is omitted (or
1996C<undef>) the list:
1997
1998 [
1999 sub { extract_variable($_[0], '') },
2000 sub { extract_quotelike($_[0],'') },
2001 sub { extract_codeblock($_[0],'{}','') },
2002 ]
2003
2004is used.
2005
2006
2007=item 3.
2008
48f821bf 2009An number specifying the maximum number of fields to return. If this
55a1c97c 2010argument is omitted (or C<undef>), split continues as long as possible.
2011
2012If the third argument is I<N>, then extraction continues until I<N> fields
2013have been successfully extracted, or until the string has been completely
2014processed.
2015
2016Note that in scalar and void contexts the value of this argument is
2017automatically reset to 1 (under C<-w>, a warning is issued if the argument
2018has to be reset).
2019
2020=item 4.
2021
2022A value indicating whether unmatched substrings (see below) within the
2023text should be skipped or returned as fields. If the value is true,
2024such substrings are skipped. Otherwise, they are returned.
2025
2026=back
2027
2028The extraction process works by applying each extractor in
a7602084 2029sequence to the text string.
2030
2031If the extractor is a subroutine it is called in a list context and is
2032expected to return a list of a single element, namely the extracted
2033text. It may optionally also return two further arguments: a string
2034representing the text left after extraction (like $' for a pattern
2035match), and a string representing any prefix skipped before the
2036extraction (like $` in a pattern match). Note that this is designed
2037to facilitate the use of other Text::Balanced subroutines with
2038C<extract_multiple>. Note too that the value returned by an extractor
2039subroutine need not bear any relationship to the corresponding substring
2040of the original text (see examples below).
55a1c97c 2041
2042If the extractor is a precompiled regular expression or a string,
2043it is matched against the text in a scalar context with a leading
2044'\G' and the gc modifiers enabled. The extracted value is either
2045$1 if that variable is defined after the match, or else the
2046complete match (i.e. $&).
2047
2048If the extractor is a hash reference, it must contain exactly one element.
2049The value of that element is one of the
2050above extractor types (subroutine reference, regular expression, or string).
2051The key of that element is the name of a class into which the successful
2052return value of the extractor will be blessed.
2053
2054If an extractor returns a defined value, that value is immediately
2055treated as the next extracted field and pushed onto the list of fields.
2056If the extractor was specified in a hash reference, the field is also
2057blessed into the appropriate class,
2058
2059If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2060assumed to have failed to extract.
2061If none of the extractor subroutines succeeds, then one
2062character is extracted from the start of the text and the extraction
2063subroutines reapplied. Characters which are thus removed are accumulated and
2064eventually become the next field (unless the fourth argument is true, in which
3c4b39be 2065case they are discarded).
55a1c97c 2066
2067For example, the following extracts substrings that are valid Perl variables:
2068
2069 @fields = extract_multiple($text,
2070 [ sub { extract_variable($_[0]) } ],
2071 undef, 1);
2072
2073This example separates a text into fields which are quote delimited,
2074curly bracketed, and anything else. The delimited and bracketed
2075parts are also blessed to identify them (the "anything else" is unblessed):
2076
2077 @fields = extract_multiple($text,
2078 [
2079 { Delim => sub { extract_delimited($_[0],q{'"}) } },
2080 { Brack => sub { extract_bracketed($_[0],'{}') } },
2081 ]);
2082
2083This call extracts the next single substring that is a valid Perl quotelike
2084operator (and removes it from $text):
2085
2086 $quotelike = extract_multiple($text,
2087 [
2088 sub { extract_quotelike($_[0]) },
2089 ], undef, 1);
2090
2091Finally, here is yet another way to do comma-separated value parsing:
2092
2093 @fields = extract_multiple($csv_text,
2094 [
2095 sub { extract_delimited($_[0],q{'"}) },
2096 qr/([^,]+)(.*)/,
2097 ],
2098 undef,1);
2099
2100The list in the second argument means:
2101I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2102The undef third argument means:
2103I<"...as many times as possible...">,
2104and the true value in the fourth argument means
2105I<"...discarding anything else that appears (i.e. the commas)">.
2106
2107If you wanted the commas preserved as separate fields (i.e. like split
2108does if your split pattern has capturing parentheses), you would
2109just make the last parameter undefined (or remove it).
2110
2111
2112=head2 C<gen_delimited_pat>
2113
2114The C<gen_delimited_pat> subroutine takes a single (string) argument and
2115 > builds a Friedl-style optimized regex that matches a string delimited
2116by any one of the characters in the single argument. For example:
2117
2118 gen_delimited_pat(q{'"})
2119
2120returns the regex:
2121
2122 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2123
2124Note that the specified delimiters are automatically quotemeta'd.
2125
2126A typical use of C<gen_delimited_pat> would be to build special purpose tags
2127for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2128(which might contain quoted strings):
2129
2130 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2131
2132 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2133
2134
2135C<gen_delimited_pat> may also be called with an optional second argument,
2136which specifies the "escape" character(s) to be used for each delimiter.
2137For example to match a Pascal-style string (where ' is the delimiter
2138and '' is a literal ' within the string):
2139
2140 gen_delimited_pat(q{'},q{'});
2141
2142Different escape characters can be specified for different delimiters.
2143For example, to specify that '/' is the escape for single quotes
2144and '%' is the escape for double quotes:
2145
2146 gen_delimited_pat(q{'"},q{/%});
2147
2148If more delimiters than escape chars are specified, the last escape char
2149is used for the remaining delimiters.
2150If no escape char is specified for a given specified delimiter, '\' is used.
2151
2152Note that
2153C<gen_delimited_pat> was previously called
2154C<delimited_pat>. That name may still be used, but is now deprecated.
2155
2156
2157=head1 DIAGNOSTICS
2158
2159In a list context, all the functions return C<(undef,$original_text)>
2160on failure. In a scalar context, failure is indicated by returning C<undef>
2161(in this case the input text is not modified in any way).
2162
2163In addition, on failure in I<any> context, the C<$@> variable is set.
2164Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2165below.
2166Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2167which the error was detected (although not necessarily where it occurred!)
2168Printing C<$@> directly produces the error message, with the offset appended.
2169On success, the C<$@> variable is guaranteed to be C<undef>.
2170
2171The available diagnostics are:
2172
2173=over 4
2174
2175=item C<Did not find a suitable bracket: "%s">
2176
2177The delimiter provided to C<extract_bracketed> was not one of
2178C<'()[]E<lt>E<gt>{}'>.
2179
2180=item C<Did not find prefix: /%s/>
2181
2182A non-optional prefix was specified but wasn't found at the start of the text.
2183
2184=item C<Did not find opening bracket after prefix: "%s">
2185
2186C<extract_bracketed> or C<extract_codeblock> was expecting a
2187particular kind of bracket at the start of the text, and didn't find it.
2188
2189=item C<No quotelike operator found after prefix: "%s">
2190
2191C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2192C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2193it was extracting.
2194
2195=item C<Unmatched closing bracket: "%c">
2196
2197C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2198a closing bracket where none was expected.
2199
2200=item C<Unmatched opening bracket(s): "%s">
2201
2202C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2203out of characters in the text before closing one or more levels of nested
2204brackets.
2205
2206=item C<Unmatched embedded quote (%s)>
2207
2208C<extract_bracketed> attempted to match an embedded quoted substring, but
2209failed to find a closing quote to match it.
2210
2211=item C<Did not find closing delimiter to match '%s'>
2212
2213C<extract_quotelike> was unable to find a closing delimiter to match the
2214one that opened the quote-like operation.
2215
2216=item C<Mismatched closing bracket: expected "%c" but found "%s">
2217
2218C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2219a valid bracket delimiter, but it was the wrong species. This usually
2220indicates a nesting error, but may indicate incorrect quoting or escaping.
2221
2222=item C<No block delimiter found after quotelike "%s">
2223
2224C<extract_quotelike> or C<extract_codeblock> found one of the
2225quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2226without a suitable block after it.
2227
2228=item C<Did not find leading dereferencer>
2229
2230C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2231a variable, but didn't find any of them.
2232
2233=item C<Bad identifier after dereferencer>
2234
2235C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2236character was not followed by a legal Perl identifier.
2237
2238=item C<Did not find expected opening bracket at %s>
2239
2240C<extract_codeblock> failed to find any of the outermost opening brackets
2241that were specified.
2242
2243=item C<Improperly nested codeblock at %s>
2244
2245A nested code block was found that started with a delimiter that was specified
2246as being only to be used as an outermost bracket.
2247
2248=item C<Missing second block for quotelike "%s">
2249
2250C<extract_codeblock> or C<extract_quotelike> found one of the
2251quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2252
2253=item C<No match found for opening bracket>
2254
2255C<extract_codeblock> failed to find a closing bracket to match the outermost
2256opening bracket.
2257
2258=item C<Did not find opening tag: /%s/>
2259
2260C<extract_tagged> did not find a suitable opening tag (after any specified
2261prefix was removed).
2262
2263=item C<Unable to construct closing tag to match: /%s/>
2264
2265C<extract_tagged> matched the specified opening tag and tried to
2266modify the matched text to produce a matching closing tag (because
2267none was specified). It failed to generate the closing tag, almost
2268certainly because the opening tag did not start with a
2269bracket of some kind.
2270
2271=item C<Found invalid nested tag: %s>
2272
2273C<extract_tagged> found a nested tag that appeared in the "reject" list
2274(and the failure mode was not "MAX" or "PARA").
2275
2276=item C<Found unbalanced nested tag: %s>
2277
2278C<extract_tagged> found a nested opening tag that was not matched by a
2279corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2280
2281=item C<Did not find closing tag>
2282
2283C<extract_tagged> reached the end of the text without finding a closing tag
2284to match the original opening tag (and the failure mode was not
2285"MAX" or "PARA").
2286
2287
2288
2289
2290=back
2291
2292
2293=head1 AUTHOR
2294
2295Damian Conway (damian@conway.org)
2296
2297
2298=head1 BUGS AND IRRITATIONS
2299
2300There are undoubtedly serious bugs lurking somewhere in this code, if
2301only because parts of it give the impression of understanding a great deal
2302more about Perl than they really do.
2303
2304Bug reports and other feedback are most welcome.
2305
2306
2307=head1 COPYRIGHT
2308
2309 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2310 This module is free software. It may be used, redistributed
2311 and/or modified under the same terms as Perl itself.