Sync Text::Balanced with the CPAN version (1.99.1)
[p5sagit/p5-mst-13.2.git] / lib / Text / Balanced.pm
CommitLineData
3270c621 1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4use 5.005;
5use strict;
6
7package Text::Balanced;
8
9use Exporter;
dd6316a9 10use SelfLoader;
3270c621 11use vars qw { $VERSION @ISA %EXPORT_TAGS };
12
dd6316a9 13use version; $VERSION = qv('1.99.1');
3270c621 14@ISA = qw ( Exporter );
15
16%EXPORT_TAGS = ( ALL => [ qw(
17 &extract_delimited
18 &extract_bracketed
19 &extract_quotelike
20 &extract_codeblock
21 &extract_variable
22 &extract_tagged
23 &extract_multiple
24
25 &gen_delimited_pat
26 &gen_extract_tagged
27
28 &delimited_pat
29 ) ] );
30
31Exporter::export_ok_tags('ALL');
32
33# PROTOTYPES
34
35sub _match_bracketed($$$$$$);
36sub _match_variable($$);
37sub _match_codeblock($$$$$$$);
38sub _match_quotelike($$$$);
39
40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
42sub _failmsg {
43 my ($message, $pos) = @_;
44 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
45}
46
47sub _fail
48{
49 my ($wantarray, $textref, $message, $pos) = @_;
50 _failmsg $message, $pos if $message;
49c03c89 51 return (undef,$$textref,undef) if $wantarray;
3270c621 52 return undef;
53}
54
55sub _succeed
56{
57 $@ = undef;
58 my ($wantarray,$textref) = splice @_, 0, 2;
59 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
dd6316a9 60 my ($startlen, $oppos) = @_[5,6];
3270c621 61 my $remainderpos = $_[2];
62 if ($wantarray)
63 {
64 my @res;
65 while (my ($from, $len) = splice @_, 0, 2)
66 {
67 push @res, substr($$textref,$from,$len);
68 }
69 if ($extralen) { # CORRECT FILLET
ce3ac4b6 70 my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
3270c621 71 $res[1] = "$extra$res[1]";
72 eval { substr($$textref,$remainderpos,0) = $extra;
73 substr($$textref,$extrapos,$extralen,"\n")} ;
74 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
75 pos($$textref) = $remainderpos-$extralen+1; # RESET \G
76 }
77 else {
78 pos($$textref) = $remainderpos; # RESET \G
79 }
80 return @res;
81 }
82 else
83 {
84 my $match = substr($$textref,$_[0],$_[1]);
85 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
86 my $extra = $extralen
87 ? substr($$textref, $extrapos, $extralen)."\n" : "";
88 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
89 pos($$textref) = $_[4]; # RESET \G
90 return $match;
91 }
92}
93
94# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95
96sub gen_delimited_pat($;$) # ($delimiters;$escapes)
97{
98 my ($dels, $escs) = @_;
99 return "" unless $dels =~ /\S/;
100 $escs = '\\' unless $escs;
101 $escs .= substr($escs,-1) x (length($dels)-length($escs));
102 my @pat = ();
103 my $i;
104 for ($i=0; $i<length $dels; $i++)
105 {
106 my $del = quotemeta substr($dels,$i,1);
107 my $esc = quotemeta substr($escs,$i,1);
108 if ($del eq $esc)
109 {
110 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
111 }
112 else
113 {
114 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
115 }
116 }
117 my $pat = join '|', @pat;
118 return "(?:$pat)";
119}
120
121*delimited_pat = \&gen_delimited_pat;
122
123
124# THE EXTRACTION FUNCTIONS
125
126sub extract_delimited (;$$$$)
127{
128 my $textref = defined $_[0] ? \$_[0] : \$_;
129 my $wantarray = wantarray;
130 my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
131 my $pre = defined $_[2] ? $_[2] : '\s*';
132 my $esc = defined $_[3] ? $_[3] : qq{\\};
133 my $pat = gen_delimited_pat($del, $esc);
134 my $startpos = pos $$textref || 0;
135 return _fail($wantarray, $textref, "Not a delimited pattern", 0)
136 unless $$textref =~ m/\G($pre)($pat)/gc;
137 my $prelen = length($1);
138 my $matchpos = $startpos+$prelen;
139 my $endpos = pos $$textref;
140 return _succeed $wantarray, $textref,
141 $matchpos, $endpos-$matchpos, # MATCH
142 $endpos, length($$textref)-$endpos, # REMAINDER
143 $startpos, $prelen; # PREFIX
144}
145
146sub extract_bracketed (;$$$)
147{
148 my $textref = defined $_[0] ? \$_[0] : \$_;
149 my $ldel = defined $_[1] ? $_[1] : '{([<';
150 my $pre = defined $_[2] ? $_[2] : '\s*';
151 my $wantarray = wantarray;
152 my $qdel = "";
153 my $quotelike;
154 $ldel =~ s/'//g and $qdel .= q{'};
155 $ldel =~ s/"//g and $qdel .= q{"};
156 $ldel =~ s/`//g and $qdel .= q{`};
157 $ldel =~ s/q//g and $quotelike = 1;
158 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
159 my $rdel = $ldel;
160 unless ($rdel =~ tr/[({</])}>/)
161 {
162 return _fail $wantarray, $textref,
163 "Did not find a suitable bracket in delimiter: \"$_[1]\"",
164 0;
165 }
166 my $posbug = pos;
167 $ldel = join('|', map { quotemeta $_ } split('', $ldel));
168 $rdel = join('|', map { quotemeta $_ } split('', $rdel));
169 pos = $posbug;
170
171 my $startpos = pos $$textref || 0;
172 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
173
174 return _fail ($wantarray, $textref) unless @match;
175
176 return _succeed ( $wantarray, $textref,
177 $match[2], $match[5]+2, # MATCH
178 @match[8,9], # REMAINDER
179 @match[0,1], # PREFIX
180 );
181}
182
183sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
184{
185 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
186 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
187 unless ($$textref =~ m/\G$pre/gc)
188 {
189 _failmsg "Did not find prefix: /$pre/", $startpos;
190 return;
191 }
192
193 $ldelpos = pos $$textref;
194
195 unless ($$textref =~ m/\G($ldel)/gc)
196 {
197 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
198 pos $$textref;
199 pos $$textref = $startpos;
200 return;
201 }
202
203 my @nesting = ( $1 );
204 my $textlen = length $$textref;
205 while (pos $$textref < $textlen)
206 {
207 next if $$textref =~ m/\G\\./gcs;
208
209 if ($$textref =~ m/\G($ldel)/gc)
210 {
211 push @nesting, $1;
212 }
213 elsif ($$textref =~ m/\G($rdel)/gc)
214 {
215 my ($found, $brackettype) = ($1, $1);
216 if ($#nesting < 0)
217 {
218 _failmsg "Unmatched closing bracket: \"$found\"",
219 pos $$textref;
220 pos $$textref = $startpos;
221 return;
222 }
223 my $expected = pop(@nesting);
224 $expected =~ tr/({[</)}]>/;
225 if ($expected ne $brackettype)
226 {
227 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
228 pos $$textref;
229 pos $$textref = $startpos;
230 return;
231 }
232 last if $#nesting < 0;
233 }
234 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
235 {
9686a75b 236 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
3270c621 237 _failmsg "Unmatched embedded quote ($1)",
238 pos $$textref;
239 pos $$textref = $startpos;
240 return;
241 }
242 elsif ($quotelike && _match_quotelike($textref,"",1,0))
243 {
244 next;
245 }
246
247 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
248 }
249 if ($#nesting>=0)
250 {
251 _failmsg "Unmatched opening bracket(s): "
252 . join("..",@nesting)."..",
253 pos $$textref;
254 pos $$textref = $startpos;
255 return;
256 }
257
258 $endpos = pos $$textref;
259
260 return (
261 $startpos, $ldelpos-$startpos, # PREFIX
262 $ldelpos, 1, # OPENING BRACKET
263 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
264 $endpos-1, 1, # CLOSING BRACKET
265 $endpos, length($$textref)-$endpos, # REMAINDER
266 );
267}
268
49c03c89 269sub _revbracket($)
3270c621 270{
271 my $brack = reverse $_[0];
272 $brack =~ tr/[({</])}>/;
273 return $brack;
274}
275
276my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277
278sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
279{
280 my $textref = defined $_[0] ? \$_[0] : \$_;
281 my $ldel = $_[1];
282 my $rdel = $_[2];
283 my $pre = defined $_[3] ? $_[3] : '\s*';
284 my %options = defined $_[4] ? %{$_[4]} : ();
285 my $omode = defined $options{fail} ? $options{fail} : '';
286 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
287 : defined($options{reject}) ? $options{reject}
288 : ''
289 ;
290 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
291 : defined($options{ignore}) ? $options{ignore}
292 : ''
293 ;
294
295 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
296 $@ = undef;
297
298 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
299
300 return _fail(wantarray, $textref) unless @match;
301 return _succeed wantarray, $textref,
302 $match[2], $match[3]+$match[5]+$match[7], # MATCH
303 @match[8..9,0..1,2..7]; # REM, PRE, BITS
304}
305
306sub _match_tagged # ($$$$$$$)
307{
308 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
309 my $rdelspec;
310
311 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
312
313 unless ($$textref =~ m/\G($pre)/gc)
314 {
315 _failmsg "Did not find prefix: /$pre/", pos $$textref;
316 goto failed;
317 }
318
319 $opentagpos = pos($$textref);
320
321 unless ($$textref =~ m/\G$ldel/gc)
322 {
323 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
324 goto failed;
325 }
326
327 $textpos = pos($$textref);
328
329 if (!defined $rdel)
330 {
dd6316a9 331 $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
49c03c89 332 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
3270c621 333 {
334 _failmsg "Unable to construct closing tag to match: $rdel",
335 pos $$textref;
336 goto failed;
337 }
338 }
339 else
340 {
48f821bf 341 $rdelspec = eval "qq{$rdel}" || do {
342 my $del;
343 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
344 { next if $rdel =~ /\Q$_/; $del = $_; last }
345 unless ($del) {
dd6316a9 346 use Carp;
347 croak "Can't interpolate right delimiter $rdel"
48f821bf 348 }
349 eval "qq$del$rdel$del";
350 };
3270c621 351 }
352
353 while (pos($$textref) < length($$textref))
354 {
355 next if $$textref =~ m/\G\\./gc;
356
357 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
358 {
359 $parapos = pos($$textref) - length($1)
360 unless defined $parapos;
361 }
362 elsif ($$textref =~ m/\G($rdelspec)/gc )
363 {
364 $closetagpos = pos($$textref)-length($1);
365 goto matched;
366 }
367 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
368 {
369 next;
370 }
371 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
372 {
373 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
374 goto short if ($omode eq 'PARA' || $omode eq 'MAX');
375 _failmsg "Found invalid nested tag: $1", pos $$textref;
376 goto failed;
377 }
378 elsif ($$textref =~ m/\G($ldel)/gc)
379 {
380 my $tag = $1;
381 pos($$textref) -= length($tag); # REWIND TO NESTED TAG
382 unless (_match_tagged(@_)) # MATCH NESTED TAG
383 {
384 goto short if $omode eq 'PARA' || $omode eq 'MAX';
385 _failmsg "Found unbalanced nested tag: $tag",
386 pos $$textref;
387 goto failed;
388 }
389 }
390 else { $$textref =~ m/./gcs }
391 }
392
393short:
394 $closetagpos = pos($$textref);
395 goto matched if $omode eq 'MAX';
396 goto failed unless $omode eq 'PARA';
397
398 if (defined $parapos) { pos($$textref) = $parapos }
399 else { $parapos = pos($$textref) }
400
401 return (
402 $startpos, $opentagpos-$startpos, # PREFIX
403 $opentagpos, $textpos-$opentagpos, # OPENING TAG
404 $textpos, $parapos-$textpos, # TEXT
405 $parapos, 0, # NO CLOSING TAG
406 $parapos, length($$textref)-$parapos, # REMAINDER
407 );
408
409matched:
410 $endpos = pos($$textref);
411 return (
412 $startpos, $opentagpos-$startpos, # PREFIX
413 $opentagpos, $textpos-$opentagpos, # OPENING TAG
414 $textpos, $closetagpos-$textpos, # TEXT
415 $closetagpos, $endpos-$closetagpos, # CLOSING TAG
416 $endpos, length($$textref)-$endpos, # REMAINDER
417 );
418
419failed:
420 _failmsg "Did not find closing tag", pos $$textref unless $@;
421 pos($$textref) = $startpos;
422 return;
423}
424
425sub extract_variable (;$$)
426{
427 my $textref = defined $_[0] ? \$_[0] : \$_;
428 return ("","","") unless defined $$textref;
429 my $pre = defined $_[1] ? $_[1] : '\s*';
430
431 my @match = _match_variable($textref,$pre);
432
433 return _fail wantarray, $textref unless @match;
434
435 return _succeed wantarray, $textref,
436 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
437}
438
439sub _match_variable($$)
440{
a7602084 441# $#
442# $^
443# $$
3270c621 444 my ($textref, $pre) = @_;
445 my $startpos = pos($$textref) = pos($$textref)||0;
446 unless ($$textref =~ m/\G($pre)/gc)
447 {
448 _failmsg "Did not find prefix: /$pre/", pos $$textref;
449 return;
450 }
451 my $varpos = pos($$textref);
48f821bf 452 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
3270c621 453 {
a7602084 454 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
455 {
3270c621 456 _failmsg "Did not find leading dereferencer", pos $$textref;
457 pos $$textref = $startpos;
458 return;
a7602084 459 }
460 my $deref = $1;
3270c621 461
a7602084 462 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
463 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
464 or $deref eq '$#' or $deref eq '$$' )
465 {
3270c621 466 _failmsg "Bad identifier after dereferencer", pos $$textref;
467 pos $$textref = $startpos;
468 return;
a7602084 469 }
3270c621 470 }
471
472 while (1)
473 {
48f821bf 474 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
3270c621 475 next if _match_codeblock($textref,
2f250b7c 476 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
3270c621 477 qr/[({[]/, qr/[)}\]]/,
478 qr/[({[]/, qr/[)}\]]/, 0);
479 next if _match_codeblock($textref,
480 qr/\s*/, qr/[{[]/, qr/[}\]]/,
481 qr/[{[]/, qr/[}\]]/, 0);
482 next if _match_variable($textref,'\s*->\s*');
483 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
484 last;
485 }
486
487 my $endpos = pos($$textref);
488 return ($startpos, $varpos-$startpos,
489 $varpos, $endpos-$varpos,
490 $endpos, length($$textref)-$endpos
491 );
492}
493
494sub extract_codeblock (;$$$$$)
495{
496 my $textref = defined $_[0] ? \$_[0] : \$_;
497 my $wantarray = wantarray;
498 my $ldel_inner = defined $_[1] ? $_[1] : '{';
499 my $pre = defined $_[2] ? $_[2] : '\s*';
500 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
501 my $rd = $_[4];
502 my $rdel_inner = $ldel_inner;
503 my $rdel_outer = $ldel_outer;
504 my $posbug = pos;
505 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
506 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
507 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
508 {
509 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
510 }
511 pos = $posbug;
512
513 my @match = _match_codeblock($textref, $pre,
514 $ldel_outer, $rdel_outer,
515 $ldel_inner, $rdel_inner,
516 $rd);
517 return _fail($wantarray, $textref) unless @match;
518 return _succeed($wantarray, $textref,
519 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
520 );
521
522}
523
524sub _match_codeblock($$$$$$$)
525{
526 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
527 my $startpos = pos($$textref) = pos($$textref) || 0;
528 unless ($$textref =~ m/\G($pre)/gc)
529 {
530 _failmsg qq{Did not match prefix /$pre/ at"} .
531 substr($$textref,pos($$textref),20) .
532 q{..."},
533 pos $$textref;
534 return;
535 }
536 my $codepos = pos($$textref);
537 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
538 {
539 _failmsg qq{Did not find expected opening bracket at "} .
540 substr($$textref,pos($$textref),20) .
541 q{..."},
542 pos $$textref;
543 pos $$textref = $startpos;
544 return;
545 }
546 my $closing = $1;
547 $closing =~ tr/([<{/)]>}/;
548 my $matched;
549 my $patvalid = 1;
550 while (pos($$textref) < length($$textref))
551 {
552 $matched = '';
553 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
554 {
555 $patvalid = 0;
556 next;
557 }
558
559 if ($$textref =~ m/\G\s*#.*/gc)
560 {
561 next;
562 }
563
564 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
565 {
566 unless ($matched = ($closing && $1 eq $closing) )
567 {
568 next if $1 eq '>'; # MIGHT BE A "LESS THAN"
569 _failmsg q{Mismatched closing bracket at "} .
570 substr($$textref,pos($$textref),20) .
571 qq{...". Expected '$closing'},
572 pos $$textref;
573 }
574 last;
575 }
576
577 if (_match_variable($textref,'\s*') ||
578 _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
579 {
580 $patvalid = 0;
581 next;
582 }
583
584
585 # NEED TO COVER MANY MORE CASES HERE!!!
48f821bf 586 if ($$textref =~ m#\G\s*(?!$ldel_inner)
587 ( [-+*x/%^&|.]=?
55a1c97c 588 | [!=]~
3270c621 589 | =(?!>)
590 | (\*\*|&&|\|\||<<|>>)=?
dd6316a9 591 | split|grep|map|return
48f821bf 592 | [([]
3270c621 593 )#gcx)
594 {
595 $patvalid = 1;
596 next;
597 }
598
599 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
600 {
601 $patvalid = 1;
602 next;
603 }
604
605 if ($$textref =~ m/\G\s*$ldel_outer/gc)
606 {
607 _failmsg q{Improperly nested codeblock at "} .
608 substr($$textref,pos($$textref),20) .
609 q{..."},
610 pos $$textref;
611 last;
612 }
613
614 $patvalid = 0;
615 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
616 }
617 continue { $@ = undef }
618
619 unless ($matched)
620 {
621 _failmsg 'No match found for opening bracket', pos $$textref
622 unless $@;
623 return;
624 }
625
626 my $endpos = pos($$textref);
627 return ( $startpos, $codepos-$startpos,
628 $codepos, $endpos-$codepos,
629 $endpos, length($$textref)-$endpos,
630 );
631}
632
633
634my %mods = (
635 'none' => '[cgimsox]*',
636 'm' => '[cgimsox]*',
637 's' => '[cegimsox]*',
638 'tr' => '[cds]*',
639 'y' => '[cds]*',
640 'qq' => '',
641 'qx' => '',
642 'qw' => '',
643 'qr' => '[imsx]*',
644 'q' => '',
645 );
646
647sub extract_quotelike (;$$)
648{
649 my $textref = $_[0] ? \$_[0] : \$_;
650 my $wantarray = wantarray;
651 my $pre = defined $_[1] ? $_[1] : '\s*';
652
653 my @match = _match_quotelike($textref,$pre,1,0);
654 return _fail($wantarray, $textref) unless @match;
655 return _succeed($wantarray, $textref,
656 $match[2], $match[18]-$match[2], # MATCH
657 @match[18,19], # REMAINDER
658 @match[0,1], # PREFIX
659 @match[2..17], # THE BITS
660 @match[20,21], # ANY FILLET?
661 );
662};
663
664sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
665{
666 my ($textref, $pre, $rawmatch, $qmark) = @_;
667
668 my ($textlen,$startpos,
669 $oppos,
670 $preld1pos,$ld1pos,$str1pos,$rd1pos,
671 $preld2pos,$ld2pos,$str2pos,$rd2pos,
672 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
673
674 unless ($$textref =~ m/\G($pre)/gc)
675 {
676 _failmsg qq{Did not find prefix /$pre/ at "} .
677 substr($$textref, pos($$textref), 20) .
678 q{..."},
679 pos $$textref;
680 return;
681 }
682 $oppos = pos($$textref);
683
684 my $initial = substr($$textref,$oppos,1);
685
686 if ($initial && $initial =~ m|^[\"\'\`]|
687 || $rawmatch && $initial =~ m|^/|
688 || $qmark && $initial =~ m|^\?|)
689 {
9686a75b 690 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
3270c621 691 {
692 _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
693 substr($$textref, $oppos, 20) .
694 q{..."},
695 pos $$textref;
696 pos $$textref = $startpos;
697 return;
698 }
699 $modpos= pos($$textref);
700 $rd1pos = $modpos-1;
701
702 if ($initial eq '/' || $initial eq '?')
703 {
704 $$textref =~ m/\G$mods{none}/gc
705 }
706
707 my $endpos = pos($$textref);
708 return (
709 $startpos, $oppos-$startpos, # PREFIX
710 $oppos, 0, # NO OPERATOR
711 $oppos, 1, # LEFT DEL
712 $oppos+1, $rd1pos-$oppos-1, # STR/PAT
713 $rd1pos, 1, # RIGHT DEL
714 $modpos, 0, # NO 2ND LDEL
715 $modpos, 0, # NO 2ND STR
716 $modpos, 0, # NO 2ND RDEL
717 $modpos, $endpos-$modpos, # MODIFIERS
718 $endpos, $textlen-$endpos, # REMAINDER
719 );
720 }
721
dd6316a9 722 unless ($$textref =~
723 m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<(?=\s*["'A-Za-z_]))}gc)
3270c621 724 {
725 _failmsg q{No quotelike operator found after prefix at "} .
726 substr($$textref, pos($$textref), 20) .
727 q{..."},
728 pos $$textref;
729 pos $$textref = $startpos;
730 return;
731 }
732
733 my $op = $1;
734 $preld1pos = pos($$textref);
735 if ($op eq '<<') {
736 $ld1pos = pos($$textref);
737 my $label;
738 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
739 $label = $1;
740 }
741 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
742 | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
743 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
9686a75b 744 }gcsx) {
3270c621 745 $label = $+;
746 }
747 else {
748 $label = "";
749 }
750 my $extrapos = pos($$textref);
751 $$textref =~ m{.*\n}gc;
ce3ac4b6 752 $str1pos = pos($$textref)--;
753 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
3270c621 754 _failmsg qq{Missing here doc terminator ('$label') after "} .
755 substr($$textref, $startpos, 20) .
756 q{..."},
757 pos $$textref;
758 pos $$textref = $startpos;
759 return;
760 }
761 $rd1pos = pos($$textref);
dd6316a9 762 $$textref =~ m{\Q$label\E\n}gc;
3270c621 763 $ld2pos = pos($$textref);
764 return (
765 $startpos, $oppos-$startpos, # PREFIX
766 $oppos, length($op), # OPERATOR
767 $ld1pos, $extrapos-$ld1pos, # LEFT DEL
768 $str1pos, $rd1pos-$str1pos, # STR/PAT
769 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
770 $ld2pos, 0, # NO 2ND LDEL
771 $ld2pos, 0, # NO 2ND STR
772 $ld2pos, 0, # NO 2ND RDEL
773 $ld2pos, 0, # NO MODIFIERS
774 $ld2pos, $textlen-$ld2pos, # REMAINDER
775 $extrapos, $str1pos-$extrapos, # FILLETED BIT
776 );
777 }
778
779 $$textref =~ m/\G\s*/gc;
780 $ld1pos = pos($$textref);
781 $str1pos = $ld1pos+1;
782
783 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
784 {
785 _failmsg "No block delimiter found after quotelike $op",
786 pos $$textref;
787 pos $$textref = $startpos;
788 return;
789 }
790 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
791 my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
792 if ($ldel1 =~ /[[(<{]/)
793 {
794 $rdel1 =~ tr/[({</])}>/;
ce3ac4b6 795 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
3270c621 796 || do { pos $$textref = $startpos; return };
797 }
798 else
799 {
9686a75b 800 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
3270c621 801 || do { pos $$textref = $startpos; return };
802 }
803 $ld2pos = $rd1pos = pos($$textref)-1;
804
805 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
806 if ($second_arg)
807 {
808 my ($ldel2, $rdel2);
809 if ($ldel1 =~ /[[(<{]/)
810 {
811 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
812 {
813 _failmsg "Missing second block for quotelike $op",
814 pos $$textref;
815 pos $$textref = $startpos;
816 return;
817 }
818 $ldel2 = $rdel2 = "\Q$1";
819 $rdel2 =~ tr/[({</])}>/;
820 }
821 else
822 {
823 $ldel2 = $rdel2 = $ldel1;
824 }
825 $str2pos = $ld2pos+1;
826
827 if ($ldel2 =~ /[[(<{]/)
828 {
829 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
ce3ac4b6 830 defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
3270c621 831 || do { pos $$textref = $startpos; return };
832 }
833 else
834 {
9686a75b 835 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
3270c621 836 || do { pos $$textref = $startpos; return };
837 }
838 $rd2pos = pos($$textref)-1;
839 }
840 else
841 {
842 $ld2pos = $str2pos = $rd2pos = $rd1pos;
843 }
844
845 $modpos = pos $$textref;
846
847 $$textref =~ m/\G($mods{$op})/gc;
848 my $endpos = pos $$textref;
849
850 return (
851 $startpos, $oppos-$startpos, # PREFIX
852 $oppos, length($op), # OPERATOR
853 $ld1pos, 1, # LEFT DEL
854 $str1pos, $rd1pos-$str1pos, # STR/PAT
855 $rd1pos, 1, # RIGHT DEL
856 $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
857 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
858 $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
859 $modpos, $endpos-$modpos, # MODIFIERS
860 $endpos, $textlen-$endpos, # REMAINDER
861 );
862}
863
864my $def_func =
865[
866 sub { extract_variable($_[0], '') },
867 sub { extract_quotelike($_[0],'') },
868 sub { extract_codeblock($_[0],'{}','') },
869];
870
871sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
872{
873 my $textref = defined($_[0]) ? \$_[0] : \$_;
874 my $posbug = pos;
875 my ($lastpos, $firstpos);
876 my @fields = ();
877
a7602084 878 #for ($$textref)
3270c621 879 {
880 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
881 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
882 my $igunk = $_[3];
883
a7602084 884 pos $$textref ||= 0;
3270c621 885
886 unless (wantarray)
887 {
dd6316a9 888 use Carp;
889 carp "extract_multiple reset maximal count to 1 in scalar context"
3270c621 890 if $^W && defined($_[2]) && $max > 1;
891 $max = 1
892 }
893
894 my $unkpos;
895 my $func;
896 my $class;
897
898 my @class;
899 foreach $func ( @func )
900 {
901 if (ref($func) eq 'HASH')
902 {
903 push @class, (keys %$func)[0];
904 $func = (values %$func)[0];
905 }
906 else
907 {
908 push @class, undef;
909 }
910 }
911
a7602084 912 FIELD: while (pos($$textref) < length($$textref))
3270c621 913 {
48f821bf 914 my ($field, $rem);
a7602084 915 my @bits;
3270c621 916 foreach my $i ( 0..$#func )
917 {
a7602084 918 my $pref;
3270c621 919 $func = $func[$i];
920 $class = $class[$i];
a7602084 921 $lastpos = pos $$textref;
3270c621 922 if (ref($func) eq 'CODE')
49c03c89 923 { ($field,$rem,$pref) = @bits = $func->($$textref) }
3270c621 924 elsif (ref($func) eq 'Text::Balanced::Extractor')
a7602084 925 { @bits = $field = $func->extract($$textref) }
926 elsif( $$textref =~ m/\G$func/gc )
dd6316a9 927 { @bits = $field = defined($1)
928 ? $1
929 : substr($$textref, $-[0], $+[0] - $-[0])
930 }
a7602084 931 $pref ||= "";
3270c621 932 if (defined($field) && length($field))
933 {
a7602084 934 if (!$igunk) {
ce3ac4b6 935 $unkpos = $lastpos
a7602084 936 if length($pref) && !defined($unkpos);
937 if (defined $unkpos)
938 {
939 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
940 $firstpos = $unkpos unless defined $firstpos;
941 undef $unkpos;
942 last FIELD if @fields == $max;
943 }
3270c621 944 }
a7602084 945 push @fields, $class
946 ? bless (\$field, $class)
3270c621 947 : $field;
948 $firstpos = $lastpos unless defined $firstpos;
a7602084 949 $lastpos = pos $$textref;
3270c621 950 last FIELD if @fields == $max;
951 next FIELD;
952 }
953 }
a7602084 954 if ($$textref =~ /\G(.)/gcs)
3270c621 955 {
a7602084 956 $unkpos = pos($$textref)-1
3270c621 957 unless $igunk || defined $unkpos;
958 }
959 }
960
961 if (defined $unkpos)
962 {
a7602084 963 push @fields, substr($$textref, $unkpos);
3270c621 964 $firstpos = $unkpos unless defined $firstpos;
a7602084 965 $lastpos = length $$textref;
3270c621 966 }
967 last;
968 }
969
970 pos $$textref = $lastpos;
971 return @fields if wantarray;
972
973 $firstpos ||= 0;
974 eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
975 pos $$textref = $firstpos };
976 return $fields[0];
977}
978
979
980sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
981{
982 my $ldel = $_[0];
983 my $rdel = $_[1];
984 my $pre = defined $_[2] ? $_[2] : '\s*';
985 my %options = defined $_[3] ? %{$_[3]} : ();
986 my $omode = defined $options{fail} ? $options{fail} : '';
987 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
988 : defined($options{reject}) ? $options{reject}
989 : ''
990 ;
991 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
992 : defined($options{ignore}) ? $options{ignore}
993 : ''
994 ;
995
996 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
997
998 my $posbug = pos;
999 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1000 pos = $posbug;
1001
1002 my $closure = sub
1003 {
1004 my $textref = defined $_[0] ? \$_[0] : \$_;
1005 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1006
1007 return _fail(wantarray, $textref) unless @match;
1008 return _succeed wantarray, $textref,
1009 $match[2], $match[3]+$match[5]+$match[7], # MATCH
1010 @match[8..9,0..1,2..7]; # REM, PRE, BITS
1011 };
1012
1013 bless $closure, 'Text::Balanced::Extractor';
1014}
1015
1016package Text::Balanced::Extractor;
1017
1018sub extract($$) # ($self, $text)
1019{
1020 &{$_[0]}($_[1]);
1021}
1022
1023package Text::Balanced::ErrorMsg;
1024
1025use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1026
10271;
55a1c97c 1028
1029__END__
1030
1031=head1 NAME
1032
1033Text::Balanced - Extract delimited text sequences from strings.
1034
1035
1036=head1 SYNOPSIS
1037
1038 use Text::Balanced qw (
1039 extract_delimited
1040 extract_bracketed
1041 extract_quotelike
1042 extract_codeblock
1043 extract_variable
1044 extract_tagged
1045 extract_multiple
1046
1047 gen_delimited_pat
1048 gen_extract_tagged
1049 );
1050
1051 # Extract the initial substring of $text that is delimited by
1052 # two (unescaped) instances of the first character in $delim.
1053
1054 ($extracted, $remainder) = extract_delimited($text,$delim);
1055
1056
1057 # Extract the initial substring of $text that is bracketed
1058 # with a delimiter(s) specified by $delim (where the string
1059 # in $delim contains one or more of '(){}[]<>').
1060
1061 ($extracted, $remainder) = extract_bracketed($text,$delim);
1062
1063
1064 # Extract the initial substring of $text that is bounded by
48f821bf 1065 # an XML tag.
55a1c97c 1066
1067 ($extracted, $remainder) = extract_tagged($text);
1068
1069
1070 # Extract the initial substring of $text that is bounded by
1071 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1072
1073 ($extracted, $remainder) =
1074 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1075
1076
1077 # Extract the initial substring of $text that represents a
1078 # Perl "quote or quote-like operation"
1079
1080 ($extracted, $remainder) = extract_quotelike($text);
1081
1082
1083 # Extract the initial substring of $text that represents a block
1084 # of Perl code, bracketed by any of character(s) specified by $delim
1085 # (where the string $delim contains one or more of '(){}[]<>').
1086
1087 ($extracted, $remainder) = extract_codeblock($text,$delim);
1088
1089
1090 # Extract the initial substrings of $text that would be extracted by
1091 # one or more sequential applications of the specified functions
1092 # or regular expressions
1093
1094 @extracted = extract_multiple($text,
1095 [ \&extract_bracketed,
1096 \&extract_quotelike,
1097 \&some_other_extractor_sub,
1098 qr/[xyz]*/,
1099 'literal',
1100 ]);
1101
1102# Create a string representing an optimized pattern (a la Friedl)
1103# that matches a substring delimited by any of the specified characters
1104# (in this case: any type of quote or a slash)
1105
1106 $patstring = gen_delimited_pat(q{'"`/});
1107
1108
1109# Generate a reference to an anonymous sub that is just like extract_tagged
1110# but pre-compiled and optimized for a specific pair of tags, and consequently
1111# much faster (i.e. 3 times faster). It uses qr// for better performance on
1112# repeated calls, so it only works under Perl 5.005 or later.
1113
1114 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1115
1116 ($extracted, $remainder) = $extract_head->($text);
1117
1118
1119=head1 DESCRIPTION
1120
48f821bf 1121The various C<extract_...> subroutines may be used to
1122extract a delimited substring, possibly after skipping a
1123specified prefix string. By default, that prefix is
1124optional whitespace (C</\s*/>), but you can change it to whatever
1125you wish (see below).
1126
1127The substring to be extracted must appear at the
1128current C<pos> location of the string's variable
1129(or at index zero, if no C<pos> position is defined).
1130In other words, the C<extract_...> subroutines I<don't>
dd6316a9 1131extract the first occurance of a substring anywhere
48f821bf 1132in a string (like an unanchored regex would). Rather,
dd6316a9 1133they extract an occurance of the substring appearing
48f821bf 1134immediately at the current matching position in the
1135string (like a C<\G>-anchored regex would).
1136
1137
55a1c97c 1138
1139=head2 General behaviour in list contexts
1140
1141In a list context, all the subroutines return a list, the first three
1142elements of which are always:
1143
1144=over 4
1145
1146=item [0]
1147
1148The extracted string, including the specified delimiters.
49c03c89 1149If the extraction fails C<undef> is returned.
55a1c97c 1150
1151=item [1]
1152
1153The remainder of the input string (i.e. the characters after the
1154extracted string). On failure, the entire string is returned.
1155
1156=item [2]
1157
1158The skipped prefix (i.e. the characters before the extracted string).
49c03c89 1159On failure, C<undef> is returned.
55a1c97c 1160
1161=back
1162
1163Note that in a list context, the contents of the original input text (the first
1164argument) are not modified in any way.
1165
1166However, if the input text was passed in a variable, that variable's
1167C<pos> value is updated to point at the first character after the
1168extracted text. That means that in a list context the various
1169subroutines can be used much like regular expressions. For example:
1170
1171 while ( $next = (extract_quotelike($text))[0] )
1172 {
1173 # process next quote-like (in $next)
1174 }
1175
1176
1177=head2 General behaviour in scalar and void contexts
1178
1179In a scalar context, the extracted string is returned, having first been
1180removed from the input text. Thus, the following code also processes
1181each quote-like operation, but actually removes them from $text:
1182
1183 while ( $next = extract_quotelike($text) )
1184 {
1185 # process next quote-like (in $next)
1186 }
1187
1188Note that if the input text is a read-only string (i.e. a literal),
1189no attempt is made to remove the extracted text.
1190
1191In a void context the behaviour of the extraction subroutines is
1192exactly the same as in a scalar context, except (of course) that the
1193extracted substring is not returned.
1194
1195=head2 A note about prefixes
1196
1197Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1198This can bite you if you're expecting a prefix specification like
1199'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1200pattern will only succeed if the <H1> tag is on the current line, since
1201. normally doesn't match newlines.
1202
1203To overcome this limitation, you need to turn on /s matching within
1204the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1205
1206
1207=head2 C<extract_delimited>
1208
1209The C<extract_delimited> function formalizes the common idiom
1210of extracting a single-character-delimited substring from the start of
1211a string. For example, to extract a single-quote delimited string, the
1212following code is typically used:
1213
1214 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1215 $extracted = $1;
1216
1217but with C<extract_delimited> it can be simplified to:
1218
1219 ($extracted,$remainder) = extract_delimited($text, "'");
1220
1221C<extract_delimited> takes up to four scalars (the input text, the
1222delimiters, a prefix pattern to be skipped, and any escape characters)
1223and extracts the initial substring of the text that
1224is appropriately delimited. If the delimiter string has multiple
1225characters, the first one encountered in the text is taken to delimit
1226the substring.
1227The third argument specifies a prefix pattern that is to be skipped
1228(but must be present!) before the substring is extracted.
1229The final argument specifies the escape character to be used for each
1230delimiter.
1231
1232All arguments are optional. If the escape characters are not specified,
1233every delimiter is escaped with a backslash (C<\>).
1234If the prefix is not specified, the
1235pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1236is also not specified, the set C</["'`]/> is used. If the text to be processed
1237is not specified either, C<$_> is used.
1238
48f821bf 1239In list context, C<extract_delimited> returns a array of three
55a1c97c 1240elements, the extracted substring (I<including the surrounding
1241delimiters>), the remainder of the text, and the skipped prefix (if
1242any). If a suitable delimited substring is not found, the first
1243element of the array is the empty string, the second is the complete
1244original text, and the prefix returned in the third element is an
1245empty string.
1246
1247In a scalar context, just the extracted substring is returned. In
1248a void context, the extracted substring (and any prefix) are simply
1249removed from the beginning of the first argument.
1250
1251Examples:
1252
1253 # Remove a single-quoted substring from the very beginning of $text:
1254
1255 $substring = extract_delimited($text, "'", '');
1256
1257 # Remove a single-quoted Pascalish substring (i.e. one in which
1258 # doubling the quote character escapes it) from the very
1259 # beginning of $text:
1260
1261 $substring = extract_delimited($text, "'", '', "'");
1262
1263 # Extract a single- or double- quoted substring from the
1264 # beginning of $text, optionally after some whitespace
1265 # (note the list context to protect $text from modification):
1266
1267 ($substring) = extract_delimited $text, q{"'};
1268
1269
1270 # Delete the substring delimited by the first '/' in $text:
1271
1272 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1273
1274Note that this last example is I<not> the same as deleting the first
1275quote-like pattern. For instance, if C<$text> contained the string:
1276
1277 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1278
1279then after the deletion it would contain:
1280
1281 "if ('.$UNIXCMD/s) { $cmd = $1; }"
1282
1283not:
1284
1285 "if ('./cmd' =~ ms) { $cmd = $1; }"
1286
1287
1288See L<"extract_quotelike"> for a (partial) solution to this problem.
1289
1290
1291=head2 C<extract_bracketed>
1292
1293Like C<"extract_delimited">, the C<extract_bracketed> function takes
1294up to three optional scalar arguments: a string to extract from, a delimiter
1295specifier, and a prefix pattern. As before, a missing prefix defaults to
1296optional whitespace and a missing text defaults to C<$_>. However, a missing
1297delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1298
1299C<extract_bracketed> extracts a balanced-bracket-delimited
1300substring (using any one (or more) of the user-specified delimiter
1301brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1302respect quoted unbalanced brackets (see below).
1303
1304A "delimiter bracket" is a bracket in list of delimiters passed as
1305C<extract_bracketed>'s second argument. Delimiter brackets are
1306specified by giving either the left or right (or both!) versions
1307of the required bracket(s). Note that the order in which
1308two or more delimiter brackets are specified is not significant.
1309
1310A "balanced-bracket-delimited substring" is a substring bounded by
1311matched brackets, such that any other (left or right) delimiter
1312bracket I<within> the substring is also matched by an opposite
1313(right or left) delimiter bracket I<at the same level of nesting>. Any
1314type of bracket not in the delimiter list is treated as an ordinary
1315character.
1316
1317In other words, each type of bracket specified as a delimiter must be
1318balanced and correctly nested within the substring, and any other kind of
1319("non-delimiter") bracket in the substring is ignored.
1320
1321For example, given the string:
1322
1323 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1324
1325then a call to C<extract_bracketed> in a list context:
1326
1327 @result = extract_bracketed( $text, '{}' );
1328
1329would return:
1330
1331 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1332
1333since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1334(In a scalar context just the first element of the array would be returned. In
1335a void context, C<$text> would be replaced by an empty string.)
1336
1337Likewise the call in:
1338
1339 @result = extract_bracketed( $text, '{[' );
1340
1341would return the same result, since all sets of both types of specified
1342delimiter brackets are correctly nested and balanced.
1343
1344However, the call in:
1345
1346 @result = extract_bracketed( $text, '{([<' );
1347
1348would fail, returning:
1349
1350 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1351
1352because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1353the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1354return an empty string. In a void context, C<$text> would be unchanged.)
1355
1356Note that the embedded single-quotes in the string don't help in this
1357case, since they have not been specified as acceptable delimiters and are
1358therefore treated as non-delimiter characters (and ignored).
1359
1360However, if a particular species of quote character is included in the
1361delimiter specification, then that type of quote will be correctly handled.
1362for example, if C<$text> is:
1363
1364 $text = '<A HREF=">>>>">link</A>';
1365
1366then
1367
1368 @result = extract_bracketed( $text, '<">' );
1369
1370returns:
1371
1372 ( '<A HREF=">>>>">', 'link</A>', "" )
1373
1374as expected. Without the specification of C<"> as an embedded quoter:
1375
1376 @result = extract_bracketed( $text, '<>' );
1377
1378the result would be:
1379
1380 ( '<A HREF=">', '>>>">link</A>', "" )
1381
1382In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1383quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1384letter 'q' as a delimiter. Hence:
1385
1386 @result = extract_bracketed( $text, '<q>' );
1387
1388would correctly match something like this:
1389
1390 $text = '<leftop: conj /and/ conj>';
1391
1392See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1393
1394
48f821bf 1395=head2 C<extract_variable>
1396
1397C<extract_variable> extracts any valid Perl variable or
1398variable-involved expression, including scalars, arrays, hashes, array
dd6316a9 1399accesses, hash look-ups, method calls through objects, subroutine calles
48f821bf 1400through subroutine references, etc.
1401
1402The subroutine takes up to two optional arguments:
1403
1404=over 4
1405
1406=item 1.
1407
1408A string to be processed (C<$_> if the string is omitted or C<undef>)
1409
1410=item 2.
1411
1412A string specifying a pattern to be matched as a prefix (which is to be
1413skipped). If omitted, optional whitespace is skipped.
1414
1415=back
1416
1417On success in a list context, an array of 3 elements is returned. The
1418elements are:
1419
1420=over 4
1421
1422=item [0]
1423
1424the extracted variable, or variablish expression
1425
1426=item [1]
1427
1428the remainder of the input text,
1429
1430=item [2]
1431
1432the prefix substring (if any),
1433
1434=back
1435
1436On failure, all of these values (except the remaining text) are C<undef>.
1437
1438In a scalar context, C<extract_variable> returns just the complete
1439substring that matched a variablish expression. C<undef> is returned on
1440failure. In addition, the original input text has the returned substring
1441(and any prefix) removed from it.
1442
1443In a void context, the input text just has the matched substring (and
1444any specified prefix) removed.
1445
1446
55a1c97c 1447=head2 C<extract_tagged>
1448
1449C<extract_tagged> extracts and segments text between (balanced)
1450specified tags.
1451
1452The subroutine takes up to five optional arguments:
1453
1454=over 4
1455
1456=item 1.
1457
1458A string to be processed (C<$_> if the string is omitted or C<undef>)
1459
1460=item 2.
1461
1462A string specifying a pattern to be matched as the opening tag.
1463If the pattern string is omitted (or C<undef>) then a pattern
48f821bf 1464that matches any standard XML tag is used.
55a1c97c 1465
1466=item 3.
1467
1468A string specifying a pattern to be matched at the closing tag.
1469If the pattern string is omitted (or C<undef>) then the closing
1470tag is constructed by inserting a C</> after any leading bracket
1471characters in the actual opening tag that was matched (I<not> the pattern
1472that matched the tag). For example, if the opening tag pattern
1473is specified as C<'{{\w+}}'> and actually matched the opening tag
1474C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1475
1476=item 4.
1477
1478A string specifying a pattern to be matched as a prefix (which is to be
1479skipped). If omitted, optional whitespace is skipped.
1480
1481=item 5.
1482
1483A hash reference containing various parsing options (see below)
1484
1485=back
1486
1487The various options that can be specified are:
1488
1489=over 4
1490
1491=item C<reject =E<gt> $listref>
1492
1493The list reference contains one or more strings specifying patterns
1494that must I<not> appear within the tagged text.
1495
1496For example, to extract
1497an HTML link (which should not contain nested links) use:
1498
48f821bf 1499 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
55a1c97c 1500
1501=item C<ignore =E<gt> $listref>
1502
1503The list reference contains one or more strings specifying patterns
1504that are I<not> be be treated as nested tags within the tagged text
1505(even if they would match the start tag pattern).
1506
1507For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1508
48f821bf 1509 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
55a1c97c 1510
1511(also see L<"gen_delimited_pat"> below).
1512
1513
1514=item C<fail =E<gt> $str>
1515
1516The C<fail> option indicates the action to be taken if a matching end
1517tag is not encountered (i.e. before the end of the string or some
1518C<reject> pattern matches). By default, a failure to match a closing
1519tag causes C<extract_tagged> to immediately fail.
1520
1521However, if the string value associated with <reject> is "MAX", then
1522C<extract_tagged> returns the complete text up to the point of failure.
1523If the string is "PARA", C<extract_tagged> returns only the first paragraph
1524after the tag (up to the first line that is either empty or contains
1525only whitespace characters).
48f821bf 1526If the string is "", the the default behaviour (i.e. failure) is reinstated.
55a1c97c 1527
1528For example, suppose the start tag "/para" introduces a paragraph, which then
1529continues until the next "/endpara" tag or until another "/para" tag is
1530encountered:
1531
48f821bf 1532 $text = "/para line 1\n\nline 3\n/para line 4";
55a1c97c 1533
48f821bf 1534 extract_tagged($text, '/para', '/endpara', undef,
1535 {reject => '/para', fail => MAX );
55a1c97c 1536
48f821bf 1537 # EXTRACTED: "/para line 1\n\nline 3\n"
55a1c97c 1538
1539Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1540tag refers only to the immediately following paragraph:
1541
48f821bf 1542 $text = "/para line 1\n\nline 3\n/para line 4";
55a1c97c 1543
48f821bf 1544 extract_tagged($text, '/para', '/endpara', undef,
1545 {reject => '/para', fail => MAX );
55a1c97c 1546
48f821bf 1547 # EXTRACTED: "/para line 1\n"
55a1c97c 1548
1549Note that the specified C<fail> behaviour applies to nested tags as well.
1550
1551=back
1552
1553On success in a list context, an array of 6 elements is returned. The elements are:
1554
1555=over 4
1556
1557=item [0]
1558
1559the extracted tagged substring (including the outermost tags),
1560
1561=item [1]
1562
1563the remainder of the input text,
1564
1565=item [2]
1566
1567the prefix substring (if any),
1568
1569=item [3]
1570
1571the opening tag
1572
1573=item [4]
1574
1575the text between the opening and closing tags
1576
1577=item [5]
1578
1579the closing tag (or "" if no closing tag was found)
1580
1581=back
1582
1583On failure, all of these values (except the remaining text) are C<undef>.
1584
1585In a scalar context, C<extract_tagged> returns just the complete
1586substring that matched a tagged text (including the start and end
1587tags). C<undef> is returned on failure. In addition, the original input
1588text has the returned substring (and any prefix) removed from it.
1589
1590In a void context, the input text just has the matched substring (and
1591any specified prefix) removed.
1592
1593
1594=head2 C<gen_extract_tagged>
1595
1596(Note: This subroutine is only available under Perl5.005)
1597
1598C<gen_extract_tagged> generates a new anonymous subroutine which
1599extracts text between (balanced) specified tags. In other words,
1600it generates a function identical in function to C<extract_tagged>.
1601
1602The difference between C<extract_tagged> and the anonymous
1603subroutines generated by
1604C<gen_extract_tagged>, is that those generated subroutines:
1605
1606=over 4
1607
1608=item *
1609
1610do not have to reparse tag specification or parsing options every time
1611they are called (whereas C<extract_tagged> has to effectively rebuild
1612its tag parser on every call);
1613
1614=item *
1615
1616make use of the new qr// construct to pre-compile the regexes they use
1617(whereas C<extract_tagged> uses standard string variable interpolation
1618to create tag-matching patterns).
1619
1620=back
1621
1622The subroutine takes up to four optional arguments (the same set as
1623C<extract_tagged> except for the string to be processed). It returns
1624a reference to a subroutine which in turn takes a single argument (the text to
1625be extracted from).
1626
1627In other words, the implementation of C<extract_tagged> is exactly
1628equivalent to:
1629
48f821bf 1630 sub extract_tagged
1631 {
1632 my $text = shift;
1633 $extractor = gen_extract_tagged(@_);
1634 return $extractor->($text);
1635 }
55a1c97c 1636
1637(although C<extract_tagged> is not currently implemented that way, in order
1638to preserve pre-5.005 compatibility).
1639
1640Using C<gen_extract_tagged> to create extraction functions for specific tags
1641is a good idea if those functions are going to be called more than once, since
1642their performance is typically twice as good as the more general-purpose
1643C<extract_tagged>.
1644
1645
1646=head2 C<extract_quotelike>
1647
1648C<extract_quotelike> attempts to recognize, extract, and segment any
1649one of the various Perl quotes and quotelike operators (see
1650L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1651delimiters (for the quotelike operators), and trailing modifiers are
1652all caught. For example, in:
1653
48f821bf 1654 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1655
1656 extract_quotelike ' "You said, \"Use sed\"." '
55a1c97c 1657
48f821bf 1658 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
55a1c97c 1659
48f821bf 1660 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
55a1c97c 1661
1662the full Perl quotelike operations are all extracted correctly.
1663
1664Note too that, when using the /x modifier on a regex, any comment
1665containing the current pattern delimiter will cause the regex to be
1666immediately terminated. In other words:
1667
48f821bf 1668 'm /
1669 (?i) # CASE INSENSITIVE
1670 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
1671 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1672 /x'
55a1c97c 1673
1674will be extracted as if it were:
1675
48f821bf 1676 'm /
1677 (?i) # CASE INSENSITIVE
1678 [a-z_] # LEADING ALPHABETIC/'
55a1c97c 1679
1680This behaviour is identical to that of the actual compiler.
1681
1682C<extract_quotelike> takes two arguments: the text to be processed and
1683a prefix to be matched at the very beginning of the text. If no prefix
1684is specified, optional whitespace is the default. If no text is given,
1685C<$_> is used.
1686
1687In a list context, an array of 11 elements is returned. The elements are:
1688
1689=over 4
1690
1691=item [0]
1692
1693the extracted quotelike substring (including trailing modifiers),
1694
1695=item [1]
1696
1697the remainder of the input text,
1698
1699=item [2]
1700
1701the prefix substring (if any),
1702
1703=item [3]
1704
1705the name of the quotelike operator (if any),
1706
1707=item [4]
1708
1709the left delimiter of the first block of the operation,
1710
1711=item [5]
1712
1713the text of the first block of the operation
1714(that is, the contents of
1715a quote, the regex of a match or substitution or the target list of a
1716translation),
1717
1718=item [6]
1719
1720the right delimiter of the first block of the operation,
1721
1722=item [7]
1723
1724the left delimiter of the second block of the operation
48f821bf 1725(that is, if it is a C<s>, C<tr>, or C<y>),
55a1c97c 1726
1727=item [8]
1728
1729the text of the second block of the operation
1730(that is, the replacement of a substitution or the translation list
1731of a translation),
1732
1733=item [9]
1734
1735the right delimiter of the second block of the operation (if any),
1736
1737=item [10]
1738
1739the trailing modifiers on the operation (if any).
1740
1741=back
1742
1743For each of the fields marked "(if any)" the default value on success is
1744an empty string.
1745On failure, all of these values (except the remaining text) are C<undef>.
1746
1747
1748In a scalar context, C<extract_quotelike> returns just the complete substring
1749that matched a quotelike operation (or C<undef> on failure). In a scalar or
1750void context, the input text has the same substring (and any specified
1751prefix) removed.
1752
1753Examples:
1754
48f821bf 1755 # Remove the first quotelike literal that appears in text
55a1c97c 1756
48f821bf 1757 $quotelike = extract_quotelike($text,'.*?');
55a1c97c 1758
48f821bf 1759 # Replace one or more leading whitespace-separated quotelike
1760 # literals in $_ with "<QLL>"
55a1c97c 1761
48f821bf 1762 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
55a1c97c 1763
1764
48f821bf 1765 # Isolate the search pattern in a quotelike operation from $text
55a1c97c 1766
48f821bf 1767 ($op,$pat) = (extract_quotelike $text)[3,5];
1768 if ($op =~ /[ms]/)
1769 {
1770 print "search pattern: $pat\n";
1771 }
1772 else
1773 {
1774 print "$op is not a pattern matching operation\n";
1775 }
55a1c97c 1776
1777
1778=head2 C<extract_quotelike> and "here documents"
1779
1780C<extract_quotelike> can successfully extract "here documents" from an input
1781string, but with an important caveat in list contexts.
1782
1783Unlike other types of quote-like literals, a here document is rarely
1784a contiguous substring. For example, a typical piece of code using
1785here document might look like this:
1786
1787 <<'EOMSG' || die;
1788 This is the message.
1789 EOMSG
48f821bf 1790 exit;
55a1c97c 1791
1792Given this as an input string in a scalar context, C<extract_quotelike>
1793would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1794leaving the string " || die;\nexit;" in the original variable. In other words,
1795the two separate pieces of the here document are successfully extracted and
1796concatenated.
1797
1798In a list context, C<extract_quotelike> would return the list
1799
1800=over 4
1801
1802=item [0]
1803
1804"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1805including fore and aft delimiters),
1806
1807=item [1]
1808
1809" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1810
1811=item [2]
1812
1813"" (i.e. the prefix substring -- trivial in this case),
1814
1815=item [3]
1816
1817"<<" (i.e. the "name" of the quotelike operator)
1818
1819=item [4]
1820
1821"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1822
1823=item [5]
1824
1825"This is the message.\n" (i.e. the text of the here document),
1826
1827=item [6]
1828
1829"EOMSG" (i.e. the right delimiter of the here document),
1830
1831=item [7..10]
1832
1833"" (a here document has no second left delimiter, second text, second right
1834delimiter, or trailing modifiers).
1835
1836=back
1837
1838However, the matching position of the input variable would be set to
1839"exit;" (i.e. I<after> the closing delimiter of the here document),
1840which would cause the earlier " || die;\nexit;" to be skipped in any
1841sequence of code fragment extractions.
1842
48f821bf 1843To avoid this problem, when it encounters a here document whilst
55a1c97c 1844extracting from a modifiable string, C<extract_quotelike> silently
1845rearranges the string to an equivalent piece of Perl:
1846
1847 <<'EOMSG'
1848 This is the message.
1849 EOMSG
1850 || die;
48f821bf 1851 exit;
55a1c97c 1852
1853in which the here document I<is> contiguous. It still leaves the
1854matching position after the here document, but now the rest of the line
1855on which the here document starts is not skipped.
1856
1857To prevent <extract_quotelike> from mucking about with the input in this way
1858(this is the only case where a list-context C<extract_quotelike> does so),
1859you can pass the input variable as an interpolated literal:
1860
1861 $quotelike = extract_quotelike("$var");
1862
1863
1864=head2 C<extract_codeblock>
1865
1866C<extract_codeblock> attempts to recognize and extract a balanced
1867bracket delimited substring that may contain unbalanced brackets
1868inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1869is like a combination of C<"extract_bracketed"> and
1870C<"extract_quotelike">.
1871
1872C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1873a text to process, a set of delimiter brackets to look for, and a prefix to
1874match first. It also takes an optional fourth parameter, which allows the
1875outermost delimiter brackets to be specified separately (see below).
1876
1877Omitting the first argument (input text) means process C<$_> instead.
1878Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1879Omitting the third argument (prefix argument) implies optional whitespace at the start.
1880Omitting the fourth argument (outermost delimiter brackets) indicates that the
1881value of the second argument is to be used for the outermost delimiters.
1882
48f821bf 1883Once the prefix an dthe outermost opening delimiter bracket have been
55a1c97c 1884recognized, code blocks are extracted by stepping through the input text and
1885trying the following alternatives in sequence:
1886
1887=over 4
1888
1889=item 1.
1890
1891Try and match a closing delimiter bracket. If the bracket was the same
1892species as the last opening bracket, return the substring to that
1893point. If the bracket was mismatched, return an error.
1894
1895=item 2.
1896
1897Try to match a quote or quotelike operator. If found, call
1898C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1899the error it returned. Otherwise go back to step 1.
1900
1901=item 3.
1902
1903Try to match an opening delimiter bracket. If found, call
1904C<extract_codeblock> recursively to eat the embedded block. If the
1905recursive call fails, return an error. Otherwise, go back to step 1.
1906
1907=item 4.
1908
1909Unconditionally match a bareword or any other single character, and
1910then go back to step 1.
1911
1912=back
1913
1914
1915Examples:
1916
1917 # Find a while loop in the text
1918
1919 if ($text =~ s/.*?while\s*\{/{/)
1920 {
1921 $loop = "while " . extract_codeblock($text);
1922 }
1923
1924 # Remove the first round-bracketed list (which may include
1925 # round- or curly-bracketed code blocks or quotelike operators)
1926
1927 extract_codeblock $text, "(){}", '[^(]*';
1928
1929
1930The ability to specify a different outermost delimiter bracket is useful
1931in some circumstances. For example, in the Parse::RecDescent module,
1932parser actions which are to be performed only on a successful parse
1933are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1934
1935 sentence: subject verb object
1936 <defer: {$::theVerb = $item{verb}} >
1937
1938Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1939within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1940
1941A deferred action like this:
1942
1943 <defer: {if ($count>10) {$count--}} >
1944
1945will be incorrectly parsed as:
1946
1947 <defer: {if ($count>
1948
1949because the "less than" operator is interpreted as a closing delimiter.
1950
1951But, by extracting the directive using
1952S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1953the '>' character is only treated as a delimited at the outermost
1954level of the code block, so the directive is parsed correctly.
1955
1956=head2 C<extract_multiple>
1957
1958The C<extract_multiple> subroutine takes a string to be processed and a
1959list of extractors (subroutines or regular expressions) to apply to that string.
1960
1961In an array context C<extract_multiple> returns an array of substrings
1962of the original string, as extracted by the specified extractors.
1963In a scalar context, C<extract_multiple> returns the first
1964substring successfully extracted from the original string. In both
1965scalar and void contexts the original string has the first successfully
1966extracted substring removed from it. In all contexts
1967C<extract_multiple> starts at the current C<pos> of the string, and
1968sets that C<pos> appropriately after it matches.
1969
48f821bf 1970Hence, the aim of of a call to C<extract_multiple> in a list context
55a1c97c 1971is to split the processed string into as many non-overlapping fields as
1972possible, by repeatedly applying each of the specified extractors
1973to the remainder of the string. Thus C<extract_multiple> is
1974a generalized form of Perl's C<split> subroutine.
1975
1976The subroutine takes up to four optional arguments:
1977
1978=over 4
1979
1980=item 1.
1981
1982A string to be processed (C<$_> if the string is omitted or C<undef>)
1983
1984=item 2.
1985
1986A reference to a list of subroutine references and/or qr// objects and/or
1987literal strings and/or hash references, specifying the extractors
1988to be used to split the string. If this argument is omitted (or
1989C<undef>) the list:
1990
1991 [
1992 sub { extract_variable($_[0], '') },
1993 sub { extract_quotelike($_[0],'') },
1994 sub { extract_codeblock($_[0],'{}','') },
1995 ]
1996
1997is used.
1998
1999
2000=item 3.
2001
48f821bf 2002An number specifying the maximum number of fields to return. If this
55a1c97c 2003argument is omitted (or C<undef>), split continues as long as possible.
2004
2005If the third argument is I<N>, then extraction continues until I<N> fields
2006have been successfully extracted, or until the string has been completely
2007processed.
2008
2009Note that in scalar and void contexts the value of this argument is
2010automatically reset to 1 (under C<-w>, a warning is issued if the argument
2011has to be reset).
2012
2013=item 4.
2014
2015A value indicating whether unmatched substrings (see below) within the
2016text should be skipped or returned as fields. If the value is true,
2017such substrings are skipped. Otherwise, they are returned.
2018
2019=back
2020
2021The extraction process works by applying each extractor in
a7602084 2022sequence to the text string.
2023
2024If the extractor is a subroutine it is called in a list context and is
2025expected to return a list of a single element, namely the extracted
2026text. It may optionally also return two further arguments: a string
2027representing the text left after extraction (like $' for a pattern
2028match), and a string representing any prefix skipped before the
2029extraction (like $` in a pattern match). Note that this is designed
2030to facilitate the use of other Text::Balanced subroutines with
2031C<extract_multiple>. Note too that the value returned by an extractor
2032subroutine need not bear any relationship to the corresponding substring
2033of the original text (see examples below).
55a1c97c 2034
2035If the extractor is a precompiled regular expression or a string,
2036it is matched against the text in a scalar context with a leading
2037'\G' and the gc modifiers enabled. The extracted value is either
2038$1 if that variable is defined after the match, or else the
2039complete match (i.e. $&).
2040
2041If the extractor is a hash reference, it must contain exactly one element.
2042The value of that element is one of the
2043above extractor types (subroutine reference, regular expression, or string).
2044The key of that element is the name of a class into which the successful
2045return value of the extractor will be blessed.
2046
2047If an extractor returns a defined value, that value is immediately
2048treated as the next extracted field and pushed onto the list of fields.
2049If the extractor was specified in a hash reference, the field is also
2050blessed into the appropriate class,
2051
2052If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2053assumed to have failed to extract.
2054If none of the extractor subroutines succeeds, then one
2055character is extracted from the start of the text and the extraction
2056subroutines reapplied. Characters which are thus removed are accumulated and
2057eventually become the next field (unless the fourth argument is true, in which
dd6316a9 2058case they are disgarded).
55a1c97c 2059
2060For example, the following extracts substrings that are valid Perl variables:
2061
2062 @fields = extract_multiple($text,
2063 [ sub { extract_variable($_[0]) } ],
2064 undef, 1);
2065
2066This example separates a text into fields which are quote delimited,
2067curly bracketed, and anything else. The delimited and bracketed
2068parts are also blessed to identify them (the "anything else" is unblessed):
2069
2070 @fields = extract_multiple($text,
2071 [
2072 { Delim => sub { extract_delimited($_[0],q{'"}) } },
2073 { Brack => sub { extract_bracketed($_[0],'{}') } },
2074 ]);
2075
2076This call extracts the next single substring that is a valid Perl quotelike
2077operator (and removes it from $text):
2078
2079 $quotelike = extract_multiple($text,
2080 [
2081 sub { extract_quotelike($_[0]) },
2082 ], undef, 1);
2083
2084Finally, here is yet another way to do comma-separated value parsing:
2085
2086 @fields = extract_multiple($csv_text,
2087 [
2088 sub { extract_delimited($_[0],q{'"}) },
2089 qr/([^,]+)(.*)/,
2090 ],
2091 undef,1);
2092
2093The list in the second argument means:
2094I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2095The undef third argument means:
2096I<"...as many times as possible...">,
2097and the true value in the fourth argument means
2098I<"...discarding anything else that appears (i.e. the commas)">.
2099
2100If you wanted the commas preserved as separate fields (i.e. like split
2101does if your split pattern has capturing parentheses), you would
2102just make the last parameter undefined (or remove it).
2103
2104
2105=head2 C<gen_delimited_pat>
2106
2107The C<gen_delimited_pat> subroutine takes a single (string) argument and
2108 > builds a Friedl-style optimized regex that matches a string delimited
2109by any one of the characters in the single argument. For example:
2110
2111 gen_delimited_pat(q{'"})
2112
2113returns the regex:
2114
2115 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2116
2117Note that the specified delimiters are automatically quotemeta'd.
2118
2119A typical use of C<gen_delimited_pat> would be to build special purpose tags
2120for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2121(which might contain quoted strings):
2122
2123 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2124
2125 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2126
2127
2128C<gen_delimited_pat> may also be called with an optional second argument,
2129which specifies the "escape" character(s) to be used for each delimiter.
2130For example to match a Pascal-style string (where ' is the delimiter
2131and '' is a literal ' within the string):
2132
2133 gen_delimited_pat(q{'},q{'});
2134
2135Different escape characters can be specified for different delimiters.
2136For example, to specify that '/' is the escape for single quotes
2137and '%' is the escape for double quotes:
2138
2139 gen_delimited_pat(q{'"},q{/%});
2140
2141If more delimiters than escape chars are specified, the last escape char
2142is used for the remaining delimiters.
2143If no escape char is specified for a given specified delimiter, '\' is used.
2144
49c03c89 2145=head2 C<delimited_pat>
2146
2147Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2148That name may still be used, but is now deprecated.
55a1c97c 2149
2150
2151=head1 DIAGNOSTICS
2152
2153In a list context, all the functions return C<(undef,$original_text)>
2154on failure. In a scalar context, failure is indicated by returning C<undef>
2155(in this case the input text is not modified in any way).
2156
2157In addition, on failure in I<any> context, the C<$@> variable is set.
2158Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2159below.
2160Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2161which the error was detected (although not necessarily where it occurred!)
2162Printing C<$@> directly produces the error message, with the offset appended.
2163On success, the C<$@> variable is guaranteed to be C<undef>.
2164
2165The available diagnostics are:
2166
2167=over 4
2168
2169=item C<Did not find a suitable bracket: "%s">
2170
2171The delimiter provided to C<extract_bracketed> was not one of
2172C<'()[]E<lt>E<gt>{}'>.
2173
2174=item C<Did not find prefix: /%s/>
2175
2176A non-optional prefix was specified but wasn't found at the start of the text.
2177
2178=item C<Did not find opening bracket after prefix: "%s">
2179
2180C<extract_bracketed> or C<extract_codeblock> was expecting a
2181particular kind of bracket at the start of the text, and didn't find it.
2182
2183=item C<No quotelike operator found after prefix: "%s">
2184
2185C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2186C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2187it was extracting.
2188
2189=item C<Unmatched closing bracket: "%c">
2190
2191C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2192a closing bracket where none was expected.
2193
2194=item C<Unmatched opening bracket(s): "%s">
2195
2196C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2197out of characters in the text before closing one or more levels of nested
2198brackets.
2199
2200=item C<Unmatched embedded quote (%s)>
2201
2202C<extract_bracketed> attempted to match an embedded quoted substring, but
2203failed to find a closing quote to match it.
2204
2205=item C<Did not find closing delimiter to match '%s'>
2206
2207C<extract_quotelike> was unable to find a closing delimiter to match the
2208one that opened the quote-like operation.
2209
2210=item C<Mismatched closing bracket: expected "%c" but found "%s">
2211
2212C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2213a valid bracket delimiter, but it was the wrong species. This usually
2214indicates a nesting error, but may indicate incorrect quoting or escaping.
2215
2216=item C<No block delimiter found after quotelike "%s">
2217
2218C<extract_quotelike> or C<extract_codeblock> found one of the
2219quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2220without a suitable block after it.
2221
2222=item C<Did not find leading dereferencer>
2223
2224C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2225a variable, but didn't find any of them.
2226
2227=item C<Bad identifier after dereferencer>
2228
2229C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2230character was not followed by a legal Perl identifier.
2231
2232=item C<Did not find expected opening bracket at %s>
2233
2234C<extract_codeblock> failed to find any of the outermost opening brackets
2235that were specified.
2236
2237=item C<Improperly nested codeblock at %s>
2238
2239A nested code block was found that started with a delimiter that was specified
2240as being only to be used as an outermost bracket.
2241
2242=item C<Missing second block for quotelike "%s">
2243
2244C<extract_codeblock> or C<extract_quotelike> found one of the
2245quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2246
2247=item C<No match found for opening bracket>
2248
2249C<extract_codeblock> failed to find a closing bracket to match the outermost
2250opening bracket.
2251
2252=item C<Did not find opening tag: /%s/>
2253
2254C<extract_tagged> did not find a suitable opening tag (after any specified
2255prefix was removed).
2256
2257=item C<Unable to construct closing tag to match: /%s/>
2258
2259C<extract_tagged> matched the specified opening tag and tried to
2260modify the matched text to produce a matching closing tag (because
2261none was specified). It failed to generate the closing tag, almost
2262certainly because the opening tag did not start with a
2263bracket of some kind.
2264
2265=item C<Found invalid nested tag: %s>
2266
2267C<extract_tagged> found a nested tag that appeared in the "reject" list
2268(and the failure mode was not "MAX" or "PARA").
2269
2270=item C<Found unbalanced nested tag: %s>
2271
2272C<extract_tagged> found a nested opening tag that was not matched by a
2273corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2274
2275=item C<Did not find closing tag>
2276
2277C<extract_tagged> reached the end of the text without finding a closing tag
2278to match the original opening tag (and the failure mode was not
2279"MAX" or "PARA").
2280
2281
2282
2283
2284=back
2285
2286
2287=head1 AUTHOR
2288
2289Damian Conway (damian@conway.org)
2290
2291
2292=head1 BUGS AND IRRITATIONS
2293
2294There are undoubtedly serious bugs lurking somewhere in this code, if
2295only because parts of it give the impression of understanding a great deal
2296more about Perl than they really do.
2297
2298Bug reports and other feedback are most welcome.
2299
2300
2301=head1 COPYRIGHT
2302
2303 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2304 This module is free software. It may be used, redistributed
2305 and/or modified under the same terms as Perl itself.