Upgrade to Locale::Codes 2.02.
[p5sagit/p5-mst-13.2.git] / lib / Text / Balanced.pm
CommitLineData
3270c621 1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4use 5.005;
5use strict;
6
7package Text::Balanced;
8
9use Exporter;
10use SelfLoader;
11use vars qw { $VERSION @ISA %EXPORT_TAGS };
12
a7602084 13$VERSION = '1.89';
3270c621 14@ISA = qw ( Exporter );
15
16%EXPORT_TAGS = ( ALL => [ qw(
17 &extract_delimited
18 &extract_bracketed
19 &extract_quotelike
20 &extract_codeblock
21 &extract_variable
22 &extract_tagged
23 &extract_multiple
24
25 &gen_delimited_pat
26 &gen_extract_tagged
27
28 &delimited_pat
29 ) ] );
30
31Exporter::export_ok_tags('ALL');
32
33# PROTOTYPES
34
35sub _match_bracketed($$$$$$);
36sub _match_variable($$);
37sub _match_codeblock($$$$$$$);
38sub _match_quotelike($$$$);
39
40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
42sub _failmsg {
43 my ($message, $pos) = @_;
44 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
45}
46
47sub _fail
48{
49 my ($wantarray, $textref, $message, $pos) = @_;
50 _failmsg $message, $pos if $message;
51 return ("",$$textref,"") if $wantarray;
52 return undef;
53}
54
55sub _succeed
56{
57 $@ = undef;
58 my ($wantarray,$textref) = splice @_, 0, 2;
59 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
60 my ($startlen) = $_[5];
61 my $remainderpos = $_[2];
62 if ($wantarray)
63 {
64 my @res;
65 while (my ($from, $len) = splice @_, 0, 2)
66 {
67 push @res, substr($$textref,$from,$len);
68 }
69 if ($extralen) { # CORRECT FILLET
70 my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
71 $res[1] = "$extra$res[1]";
72 eval { substr($$textref,$remainderpos,0) = $extra;
73 substr($$textref,$extrapos,$extralen,"\n")} ;
74 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
75 pos($$textref) = $remainderpos-$extralen+1; # RESET \G
76 }
77 else {
78 pos($$textref) = $remainderpos; # RESET \G
79 }
80 return @res;
81 }
82 else
83 {
84 my $match = substr($$textref,$_[0],$_[1]);
85 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
86 my $extra = $extralen
87 ? substr($$textref, $extrapos, $extralen)."\n" : "";
88 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
89 pos($$textref) = $_[4]; # RESET \G
90 return $match;
91 }
92}
93
94# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95
96sub gen_delimited_pat($;$) # ($delimiters;$escapes)
97{
98 my ($dels, $escs) = @_;
99 return "" unless $dels =~ /\S/;
100 $escs = '\\' unless $escs;
101 $escs .= substr($escs,-1) x (length($dels)-length($escs));
102 my @pat = ();
103 my $i;
104 for ($i=0; $i<length $dels; $i++)
105 {
106 my $del = quotemeta substr($dels,$i,1);
107 my $esc = quotemeta substr($escs,$i,1);
108 if ($del eq $esc)
109 {
110 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
111 }
112 else
113 {
114 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
115 }
116 }
117 my $pat = join '|', @pat;
118 return "(?:$pat)";
119}
120
121*delimited_pat = \&gen_delimited_pat;
122
123
124# THE EXTRACTION FUNCTIONS
125
126sub extract_delimited (;$$$$)
127{
128 my $textref = defined $_[0] ? \$_[0] : \$_;
129 my $wantarray = wantarray;
130 my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
131 my $pre = defined $_[2] ? $_[2] : '\s*';
132 my $esc = defined $_[3] ? $_[3] : qq{\\};
133 my $pat = gen_delimited_pat($del, $esc);
134 my $startpos = pos $$textref || 0;
135 return _fail($wantarray, $textref, "Not a delimited pattern", 0)
136 unless $$textref =~ m/\G($pre)($pat)/gc;
137 my $prelen = length($1);
138 my $matchpos = $startpos+$prelen;
139 my $endpos = pos $$textref;
140 return _succeed $wantarray, $textref,
141 $matchpos, $endpos-$matchpos, # MATCH
142 $endpos, length($$textref)-$endpos, # REMAINDER
143 $startpos, $prelen; # PREFIX
144}
145
146sub extract_bracketed (;$$$)
147{
148 my $textref = defined $_[0] ? \$_[0] : \$_;
149 my $ldel = defined $_[1] ? $_[1] : '{([<';
150 my $pre = defined $_[2] ? $_[2] : '\s*';
151 my $wantarray = wantarray;
152 my $qdel = "";
153 my $quotelike;
154 $ldel =~ s/'//g and $qdel .= q{'};
155 $ldel =~ s/"//g and $qdel .= q{"};
156 $ldel =~ s/`//g and $qdel .= q{`};
157 $ldel =~ s/q//g and $quotelike = 1;
158 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
159 my $rdel = $ldel;
160 unless ($rdel =~ tr/[({</])}>/)
161 {
162 return _fail $wantarray, $textref,
163 "Did not find a suitable bracket in delimiter: \"$_[1]\"",
164 0;
165 }
166 my $posbug = pos;
167 $ldel = join('|', map { quotemeta $_ } split('', $ldel));
168 $rdel = join('|', map { quotemeta $_ } split('', $rdel));
169 pos = $posbug;
170
171 my $startpos = pos $$textref || 0;
172 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
173
174 return _fail ($wantarray, $textref) unless @match;
175
176 return _succeed ( $wantarray, $textref,
177 $match[2], $match[5]+2, # MATCH
178 @match[8,9], # REMAINDER
179 @match[0,1], # PREFIX
180 );
181}
182
183sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
184{
185 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
186 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
187 unless ($$textref =~ m/\G$pre/gc)
188 {
189 _failmsg "Did not find prefix: /$pre/", $startpos;
190 return;
191 }
192
193 $ldelpos = pos $$textref;
194
195 unless ($$textref =~ m/\G($ldel)/gc)
196 {
197 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
198 pos $$textref;
199 pos $$textref = $startpos;
200 return;
201 }
202
203 my @nesting = ( $1 );
204 my $textlen = length $$textref;
205 while (pos $$textref < $textlen)
206 {
207 next if $$textref =~ m/\G\\./gcs;
208
209 if ($$textref =~ m/\G($ldel)/gc)
210 {
211 push @nesting, $1;
212 }
213 elsif ($$textref =~ m/\G($rdel)/gc)
214 {
215 my ($found, $brackettype) = ($1, $1);
216 if ($#nesting < 0)
217 {
218 _failmsg "Unmatched closing bracket: \"$found\"",
219 pos $$textref;
220 pos $$textref = $startpos;
221 return;
222 }
223 my $expected = pop(@nesting);
224 $expected =~ tr/({[</)}]>/;
225 if ($expected ne $brackettype)
226 {
227 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
228 pos $$textref;
229 pos $$textref = $startpos;
230 return;
231 }
232 last if $#nesting < 0;
233 }
234 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
235 {
9686a75b 236 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
3270c621 237 _failmsg "Unmatched embedded quote ($1)",
238 pos $$textref;
239 pos $$textref = $startpos;
240 return;
241 }
242 elsif ($quotelike && _match_quotelike($textref,"",1,0))
243 {
244 next;
245 }
246
247 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
248 }
249 if ($#nesting>=0)
250 {
251 _failmsg "Unmatched opening bracket(s): "
252 . join("..",@nesting)."..",
253 pos $$textref;
254 pos $$textref = $startpos;
255 return;
256 }
257
258 $endpos = pos $$textref;
259
260 return (
261 $startpos, $ldelpos-$startpos, # PREFIX
262 $ldelpos, 1, # OPENING BRACKET
263 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
264 $endpos-1, 1, # CLOSING BRACKET
265 $endpos, length($$textref)-$endpos, # REMAINDER
266 );
267}
268
269sub revbracket($)
270{
271 my $brack = reverse $_[0];
272 $brack =~ tr/[({</])}>/;
273 return $brack;
274}
275
276my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277
278sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
279{
280 my $textref = defined $_[0] ? \$_[0] : \$_;
281 my $ldel = $_[1];
282 my $rdel = $_[2];
283 my $pre = defined $_[3] ? $_[3] : '\s*';
284 my %options = defined $_[4] ? %{$_[4]} : ();
285 my $omode = defined $options{fail} ? $options{fail} : '';
286 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
287 : defined($options{reject}) ? $options{reject}
288 : ''
289 ;
290 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
291 : defined($options{ignore}) ? $options{ignore}
292 : ''
293 ;
294
295 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
296 $@ = undef;
297
298 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
299
300 return _fail(wantarray, $textref) unless @match;
301 return _succeed wantarray, $textref,
302 $match[2], $match[3]+$match[5]+$match[7], # MATCH
303 @match[8..9,0..1,2..7]; # REM, PRE, BITS
304}
305
306sub _match_tagged # ($$$$$$$)
307{
308 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
309 my $rdelspec;
310
311 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
312
313 unless ($$textref =~ m/\G($pre)/gc)
314 {
315 _failmsg "Did not find prefix: /$pre/", pos $$textref;
316 goto failed;
317 }
318
319 $opentagpos = pos($$textref);
320
321 unless ($$textref =~ m/\G$ldel/gc)
322 {
323 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
324 goto failed;
325 }
326
327 $textpos = pos($$textref);
328
329 if (!defined $rdel)
330 {
331 $rdelspec = $&;
332 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
333 {
334 _failmsg "Unable to construct closing tag to match: $rdel",
335 pos $$textref;
336 goto failed;
337 }
338 }
339 else
340 {
341 $rdelspec = eval "qq{$rdel}";
342 }
343
344 while (pos($$textref) < length($$textref))
345 {
346 next if $$textref =~ m/\G\\./gc;
347
348 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
349 {
350 $parapos = pos($$textref) - length($1)
351 unless defined $parapos;
352 }
353 elsif ($$textref =~ m/\G($rdelspec)/gc )
354 {
355 $closetagpos = pos($$textref)-length($1);
356 goto matched;
357 }
358 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
359 {
360 next;
361 }
362 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
363 {
364 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
365 goto short if ($omode eq 'PARA' || $omode eq 'MAX');
366 _failmsg "Found invalid nested tag: $1", pos $$textref;
367 goto failed;
368 }
369 elsif ($$textref =~ m/\G($ldel)/gc)
370 {
371 my $tag = $1;
372 pos($$textref) -= length($tag); # REWIND TO NESTED TAG
373 unless (_match_tagged(@_)) # MATCH NESTED TAG
374 {
375 goto short if $omode eq 'PARA' || $omode eq 'MAX';
376 _failmsg "Found unbalanced nested tag: $tag",
377 pos $$textref;
378 goto failed;
379 }
380 }
381 else { $$textref =~ m/./gcs }
382 }
383
384short:
385 $closetagpos = pos($$textref);
386 goto matched if $omode eq 'MAX';
387 goto failed unless $omode eq 'PARA';
388
389 if (defined $parapos) { pos($$textref) = $parapos }
390 else { $parapos = pos($$textref) }
391
392 return (
393 $startpos, $opentagpos-$startpos, # PREFIX
394 $opentagpos, $textpos-$opentagpos, # OPENING TAG
395 $textpos, $parapos-$textpos, # TEXT
396 $parapos, 0, # NO CLOSING TAG
397 $parapos, length($$textref)-$parapos, # REMAINDER
398 );
399
400matched:
401 $endpos = pos($$textref);
402 return (
403 $startpos, $opentagpos-$startpos, # PREFIX
404 $opentagpos, $textpos-$opentagpos, # OPENING TAG
405 $textpos, $closetagpos-$textpos, # TEXT
406 $closetagpos, $endpos-$closetagpos, # CLOSING TAG
407 $endpos, length($$textref)-$endpos, # REMAINDER
408 );
409
410failed:
411 _failmsg "Did not find closing tag", pos $$textref unless $@;
412 pos($$textref) = $startpos;
413 return;
414}
415
416sub extract_variable (;$$)
417{
418 my $textref = defined $_[0] ? \$_[0] : \$_;
419 return ("","","") unless defined $$textref;
420 my $pre = defined $_[1] ? $_[1] : '\s*';
421
422 my @match = _match_variable($textref,$pre);
423
424 return _fail wantarray, $textref unless @match;
425
426 return _succeed wantarray, $textref,
427 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
428}
429
430sub _match_variable($$)
431{
a7602084 432# $#
433# $^
434# $$
3270c621 435 my ($textref, $pre) = @_;
436 my $startpos = pos($$textref) = pos($$textref)||0;
437 unless ($$textref =~ m/\G($pre)/gc)
438 {
439 _failmsg "Did not find prefix: /$pre/", pos $$textref;
440 return;
441 }
442 my $varpos = pos($$textref);
a7602084 443 unless ($$textref =~ m{\G\$\s*(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
3270c621 444 {
a7602084 445 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
446 {
3270c621 447 _failmsg "Did not find leading dereferencer", pos $$textref;
448 pos $$textref = $startpos;
449 return;
a7602084 450 }
451 my $deref = $1;
3270c621 452
a7602084 453 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
454 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
455 or $deref eq '$#' or $deref eq '$$' )
456 {
3270c621 457 _failmsg "Bad identifier after dereferencer", pos $$textref;
458 pos $$textref = $startpos;
459 return;
a7602084 460 }
3270c621 461 }
462
463 while (1)
464 {
465 next if _match_codeblock($textref,
2f250b7c 466 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
3270c621 467 qr/[({[]/, qr/[)}\]]/,
468 qr/[({[]/, qr/[)}\]]/, 0);
469 next if _match_codeblock($textref,
470 qr/\s*/, qr/[{[]/, qr/[}\]]/,
471 qr/[{[]/, qr/[}\]]/, 0);
472 next if _match_variable($textref,'\s*->\s*');
473 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
474 last;
475 }
476
477 my $endpos = pos($$textref);
478 return ($startpos, $varpos-$startpos,
479 $varpos, $endpos-$varpos,
480 $endpos, length($$textref)-$endpos
481 );
482}
483
484sub extract_codeblock (;$$$$$)
485{
486 my $textref = defined $_[0] ? \$_[0] : \$_;
487 my $wantarray = wantarray;
488 my $ldel_inner = defined $_[1] ? $_[1] : '{';
489 my $pre = defined $_[2] ? $_[2] : '\s*';
490 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
491 my $rd = $_[4];
492 my $rdel_inner = $ldel_inner;
493 my $rdel_outer = $ldel_outer;
494 my $posbug = pos;
495 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
496 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
497 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
498 {
499 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
500 }
501 pos = $posbug;
502
503 my @match = _match_codeblock($textref, $pre,
504 $ldel_outer, $rdel_outer,
505 $ldel_inner, $rdel_inner,
506 $rd);
507 return _fail($wantarray, $textref) unless @match;
508 return _succeed($wantarray, $textref,
509 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
510 );
511
512}
513
514sub _match_codeblock($$$$$$$)
515{
516 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
517 my $startpos = pos($$textref) = pos($$textref) || 0;
518 unless ($$textref =~ m/\G($pre)/gc)
519 {
520 _failmsg qq{Did not match prefix /$pre/ at"} .
521 substr($$textref,pos($$textref),20) .
522 q{..."},
523 pos $$textref;
524 return;
525 }
526 my $codepos = pos($$textref);
527 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
528 {
529 _failmsg qq{Did not find expected opening bracket at "} .
530 substr($$textref,pos($$textref),20) .
531 q{..."},
532 pos $$textref;
533 pos $$textref = $startpos;
534 return;
535 }
536 my $closing = $1;
537 $closing =~ tr/([<{/)]>}/;
538 my $matched;
539 my $patvalid = 1;
540 while (pos($$textref) < length($$textref))
541 {
542 $matched = '';
543 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
544 {
545 $patvalid = 0;
546 next;
547 }
548
549 if ($$textref =~ m/\G\s*#.*/gc)
550 {
551 next;
552 }
553
554 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
555 {
556 unless ($matched = ($closing && $1 eq $closing) )
557 {
558 next if $1 eq '>'; # MIGHT BE A "LESS THAN"
559 _failmsg q{Mismatched closing bracket at "} .
560 substr($$textref,pos($$textref),20) .
561 qq{...". Expected '$closing'},
562 pos $$textref;
563 }
564 last;
565 }
566
567 if (_match_variable($textref,'\s*') ||
568 _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
569 {
570 $patvalid = 0;
571 next;
572 }
573
574
575 # NEED TO COVER MANY MORE CASES HERE!!!
576 if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
55a1c97c 577 | [!=]~
3270c621 578 | =(?!>)
579 | (\*\*|&&|\|\||<<|>>)=?
3270c621 580 | split|grep|map|return
581 )#gcx)
582 {
583 $patvalid = 1;
584 next;
585 }
586
587 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
588 {
589 $patvalid = 1;
590 next;
591 }
592
593 if ($$textref =~ m/\G\s*$ldel_outer/gc)
594 {
595 _failmsg q{Improperly nested codeblock at "} .
596 substr($$textref,pos($$textref),20) .
597 q{..."},
598 pos $$textref;
599 last;
600 }
601
602 $patvalid = 0;
603 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
604 }
605 continue { $@ = undef }
606
607 unless ($matched)
608 {
609 _failmsg 'No match found for opening bracket', pos $$textref
610 unless $@;
611 return;
612 }
613
614 my $endpos = pos($$textref);
615 return ( $startpos, $codepos-$startpos,
616 $codepos, $endpos-$codepos,
617 $endpos, length($$textref)-$endpos,
618 );
619}
620
621
622my %mods = (
623 'none' => '[cgimsox]*',
624 'm' => '[cgimsox]*',
625 's' => '[cegimsox]*',
626 'tr' => '[cds]*',
627 'y' => '[cds]*',
628 'qq' => '',
629 'qx' => '',
630 'qw' => '',
631 'qr' => '[imsx]*',
632 'q' => '',
633 );
634
635sub extract_quotelike (;$$)
636{
637 my $textref = $_[0] ? \$_[0] : \$_;
638 my $wantarray = wantarray;
639 my $pre = defined $_[1] ? $_[1] : '\s*';
640
641 my @match = _match_quotelike($textref,$pre,1,0);
642 return _fail($wantarray, $textref) unless @match;
643 return _succeed($wantarray, $textref,
644 $match[2], $match[18]-$match[2], # MATCH
645 @match[18,19], # REMAINDER
646 @match[0,1], # PREFIX
647 @match[2..17], # THE BITS
648 @match[20,21], # ANY FILLET?
649 );
650};
651
652sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
653{
654 my ($textref, $pre, $rawmatch, $qmark) = @_;
655
656 my ($textlen,$startpos,
657 $oppos,
658 $preld1pos,$ld1pos,$str1pos,$rd1pos,
659 $preld2pos,$ld2pos,$str2pos,$rd2pos,
660 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
661
662 unless ($$textref =~ m/\G($pre)/gc)
663 {
664 _failmsg qq{Did not find prefix /$pre/ at "} .
665 substr($$textref, pos($$textref), 20) .
666 q{..."},
667 pos $$textref;
668 return;
669 }
670 $oppos = pos($$textref);
671
672 my $initial = substr($$textref,$oppos,1);
673
674 if ($initial && $initial =~ m|^[\"\'\`]|
675 || $rawmatch && $initial =~ m|^/|
676 || $qmark && $initial =~ m|^\?|)
677 {
9686a75b 678 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
3270c621 679 {
680 _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
681 substr($$textref, $oppos, 20) .
682 q{..."},
683 pos $$textref;
684 pos $$textref = $startpos;
685 return;
686 }
687 $modpos= pos($$textref);
688 $rd1pos = $modpos-1;
689
690 if ($initial eq '/' || $initial eq '?')
691 {
692 $$textref =~ m/\G$mods{none}/gc
693 }
694
695 my $endpos = pos($$textref);
696 return (
697 $startpos, $oppos-$startpos, # PREFIX
698 $oppos, 0, # NO OPERATOR
699 $oppos, 1, # LEFT DEL
700 $oppos+1, $rd1pos-$oppos-1, # STR/PAT
701 $rd1pos, 1, # RIGHT DEL
702 $modpos, 0, # NO 2ND LDEL
703 $modpos, 0, # NO 2ND STR
704 $modpos, 0, # NO 2ND RDEL
705 $modpos, $endpos-$modpos, # MODIFIERS
706 $endpos, $textlen-$endpos, # REMAINDER
707 );
708 }
709
710 unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
711 {
712 _failmsg q{No quotelike operator found after prefix at "} .
713 substr($$textref, pos($$textref), 20) .
714 q{..."},
715 pos $$textref;
716 pos $$textref = $startpos;
717 return;
718 }
719
720 my $op = $1;
721 $preld1pos = pos($$textref);
722 if ($op eq '<<') {
723 $ld1pos = pos($$textref);
724 my $label;
725 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
726 $label = $1;
727 }
728 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
729 | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
730 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
9686a75b 731 }gcsx) {
3270c621 732 $label = $+;
733 }
734 else {
735 $label = "";
736 }
737 my $extrapos = pos($$textref);
738 $$textref =~ m{.*\n}gc;
739 $str1pos = pos($$textref);
740 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
741 _failmsg qq{Missing here doc terminator ('$label') after "} .
742 substr($$textref, $startpos, 20) .
743 q{..."},
744 pos $$textref;
745 pos $$textref = $startpos;
746 return;
747 }
748 $rd1pos = pos($$textref);
749 $$textref =~ m{$label\n}gc;
750 $ld2pos = pos($$textref);
751 return (
752 $startpos, $oppos-$startpos, # PREFIX
753 $oppos, length($op), # OPERATOR
754 $ld1pos, $extrapos-$ld1pos, # LEFT DEL
755 $str1pos, $rd1pos-$str1pos, # STR/PAT
756 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
757 $ld2pos, 0, # NO 2ND LDEL
758 $ld2pos, 0, # NO 2ND STR
759 $ld2pos, 0, # NO 2ND RDEL
760 $ld2pos, 0, # NO MODIFIERS
761 $ld2pos, $textlen-$ld2pos, # REMAINDER
762 $extrapos, $str1pos-$extrapos, # FILLETED BIT
763 );
764 }
765
766 $$textref =~ m/\G\s*/gc;
767 $ld1pos = pos($$textref);
768 $str1pos = $ld1pos+1;
769
770 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
771 {
772 _failmsg "No block delimiter found after quotelike $op",
773 pos $$textref;
774 pos $$textref = $startpos;
775 return;
776 }
777 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
778 my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
779 if ($ldel1 =~ /[[(<{]/)
780 {
781 $rdel1 =~ tr/[({</])}>/;
782 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
783 || do { pos $$textref = $startpos; return };
784 }
785 else
786 {
9686a75b 787 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
3270c621 788 || do { pos $$textref = $startpos; return };
789 }
790 $ld2pos = $rd1pos = pos($$textref)-1;
791
792 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
793 if ($second_arg)
794 {
795 my ($ldel2, $rdel2);
796 if ($ldel1 =~ /[[(<{]/)
797 {
798 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
799 {
800 _failmsg "Missing second block for quotelike $op",
801 pos $$textref;
802 pos $$textref = $startpos;
803 return;
804 }
805 $ldel2 = $rdel2 = "\Q$1";
806 $rdel2 =~ tr/[({</])}>/;
807 }
808 else
809 {
810 $ldel2 = $rdel2 = $ldel1;
811 }
812 $str2pos = $ld2pos+1;
813
814 if ($ldel2 =~ /[[(<{]/)
815 {
816 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
817 _match_bracketed($textref,"",$ldel2,"","",$rdel2)
818 || do { pos $$textref = $startpos; return };
819 }
820 else
821 {
9686a75b 822 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
3270c621 823 || do { pos $$textref = $startpos; return };
824 }
825 $rd2pos = pos($$textref)-1;
826 }
827 else
828 {
829 $ld2pos = $str2pos = $rd2pos = $rd1pos;
830 }
831
832 $modpos = pos $$textref;
833
834 $$textref =~ m/\G($mods{$op})/gc;
835 my $endpos = pos $$textref;
836
837 return (
838 $startpos, $oppos-$startpos, # PREFIX
839 $oppos, length($op), # OPERATOR
840 $ld1pos, 1, # LEFT DEL
841 $str1pos, $rd1pos-$str1pos, # STR/PAT
842 $rd1pos, 1, # RIGHT DEL
843 $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
844 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
845 $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
846 $modpos, $endpos-$modpos, # MODIFIERS
847 $endpos, $textlen-$endpos, # REMAINDER
848 );
849}
850
851my $def_func =
852[
853 sub { extract_variable($_[0], '') },
854 sub { extract_quotelike($_[0],'') },
855 sub { extract_codeblock($_[0],'{}','') },
856];
857
858sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
859{
860 my $textref = defined($_[0]) ? \$_[0] : \$_;
861 my $posbug = pos;
862 my ($lastpos, $firstpos);
863 my @fields = ();
864
a7602084 865 #for ($$textref)
3270c621 866 {
867 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
868 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
869 my $igunk = $_[3];
870
a7602084 871 pos $$textref ||= 0;
3270c621 872
873 unless (wantarray)
874 {
875 use Carp;
876 carp "extract_multiple reset maximal count to 1 in scalar context"
877 if $^W && defined($_[2]) && $max > 1;
878 $max = 1
879 }
880
881 my $unkpos;
882 my $func;
883 my $class;
884
885 my @class;
886 foreach $func ( @func )
887 {
888 if (ref($func) eq 'HASH')
889 {
890 push @class, (keys %$func)[0];
891 $func = (values %$func)[0];
892 }
893 else
894 {
895 push @class, undef;
896 }
897 }
898
a7602084 899 FIELD: while (pos($$textref) < length($$textref))
3270c621 900 {
901 my $field;
a7602084 902 my @bits;
3270c621 903 foreach my $i ( 0..$#func )
904 {
a7602084 905 my $pref;
3270c621 906 $func = $func[$i];
907 $class = $class[$i];
a7602084 908 $lastpos = pos $$textref;
3270c621 909 if (ref($func) eq 'CODE')
a7602084 910 { ($field,undef,$pref) = @bits = $func->($$textref) }
3270c621 911 elsif (ref($func) eq 'Text::Balanced::Extractor')
a7602084 912 { @bits = $field = $func->extract($$textref) }
913 elsif( $$textref =~ m/\G$func/gc )
914 { @bits = $field = defined($1) ? $1 : $& }
915 $pref ||= "";
3270c621 916 if (defined($field) && length($field))
917 {
a7602084 918 if (!$igunk) {
919 $unkpos = pos $$textref
920 if length($pref) && !defined($unkpos);
921 if (defined $unkpos)
922 {
923 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
924 $firstpos = $unkpos unless defined $firstpos;
925 undef $unkpos;
926 last FIELD if @fields == $max;
927 }
3270c621 928 }
a7602084 929 push @fields, $class
930 ? bless (\$field, $class)
3270c621 931 : $field;
932 $firstpos = $lastpos unless defined $firstpos;
a7602084 933 $lastpos = pos $$textref;
3270c621 934 last FIELD if @fields == $max;
935 next FIELD;
936 }
937 }
a7602084 938 if ($$textref =~ /\G(.)/gcs)
3270c621 939 {
a7602084 940 $unkpos = pos($$textref)-1
3270c621 941 unless $igunk || defined $unkpos;
942 }
943 }
944
945 if (defined $unkpos)
946 {
a7602084 947 push @fields, substr($$textref, $unkpos);
3270c621 948 $firstpos = $unkpos unless defined $firstpos;
a7602084 949 $lastpos = length $$textref;
3270c621 950 }
951 last;
952 }
953
954 pos $$textref = $lastpos;
955 return @fields if wantarray;
956
957 $firstpos ||= 0;
958 eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
959 pos $$textref = $firstpos };
960 return $fields[0];
961}
962
963
964sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
965{
966 my $ldel = $_[0];
967 my $rdel = $_[1];
968 my $pre = defined $_[2] ? $_[2] : '\s*';
969 my %options = defined $_[3] ? %{$_[3]} : ();
970 my $omode = defined $options{fail} ? $options{fail} : '';
971 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
972 : defined($options{reject}) ? $options{reject}
973 : ''
974 ;
975 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
976 : defined($options{ignore}) ? $options{ignore}
977 : ''
978 ;
979
980 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
981
982 my $posbug = pos;
983 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
984 pos = $posbug;
985
986 my $closure = sub
987 {
988 my $textref = defined $_[0] ? \$_[0] : \$_;
989 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
990
991 return _fail(wantarray, $textref) unless @match;
992 return _succeed wantarray, $textref,
993 $match[2], $match[3]+$match[5]+$match[7], # MATCH
994 @match[8..9,0..1,2..7]; # REM, PRE, BITS
995 };
996
997 bless $closure, 'Text::Balanced::Extractor';
998}
999
1000package Text::Balanced::Extractor;
1001
1002sub extract($$) # ($self, $text)
1003{
1004 &{$_[0]}($_[1]);
1005}
1006
1007package Text::Balanced::ErrorMsg;
1008
1009use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1010
10111;
55a1c97c 1012
1013__END__
1014
1015=head1 NAME
1016
1017Text::Balanced - Extract delimited text sequences from strings.
1018
1019
1020=head1 SYNOPSIS
1021
1022 use Text::Balanced qw (
1023 extract_delimited
1024 extract_bracketed
1025 extract_quotelike
1026 extract_codeblock
1027 extract_variable
1028 extract_tagged
1029 extract_multiple
1030
1031 gen_delimited_pat
1032 gen_extract_tagged
1033 );
1034
1035 # Extract the initial substring of $text that is delimited by
1036 # two (unescaped) instances of the first character in $delim.
1037
1038 ($extracted, $remainder) = extract_delimited($text,$delim);
1039
1040
1041 # Extract the initial substring of $text that is bracketed
1042 # with a delimiter(s) specified by $delim (where the string
1043 # in $delim contains one or more of '(){}[]<>').
1044
1045 ($extracted, $remainder) = extract_bracketed($text,$delim);
1046
1047
1048 # Extract the initial substring of $text that is bounded by
1049 # an HTML/XML tag.
1050
1051 ($extracted, $remainder) = extract_tagged($text);
1052
1053
1054 # Extract the initial substring of $text that is bounded by
1055 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1056
1057 ($extracted, $remainder) =
1058 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1059
1060
1061 # Extract the initial substring of $text that represents a
1062 # Perl "quote or quote-like operation"
1063
1064 ($extracted, $remainder) = extract_quotelike($text);
1065
1066
1067 # Extract the initial substring of $text that represents a block
1068 # of Perl code, bracketed by any of character(s) specified by $delim
1069 # (where the string $delim contains one or more of '(){}[]<>').
1070
1071 ($extracted, $remainder) = extract_codeblock($text,$delim);
1072
1073
1074 # Extract the initial substrings of $text that would be extracted by
1075 # one or more sequential applications of the specified functions
1076 # or regular expressions
1077
1078 @extracted = extract_multiple($text,
1079 [ \&extract_bracketed,
1080 \&extract_quotelike,
1081 \&some_other_extractor_sub,
1082 qr/[xyz]*/,
1083 'literal',
1084 ]);
1085
1086# Create a string representing an optimized pattern (a la Friedl)
1087# that matches a substring delimited by any of the specified characters
1088# (in this case: any type of quote or a slash)
1089
1090 $patstring = gen_delimited_pat(q{'"`/});
1091
1092
1093# Generate a reference to an anonymous sub that is just like extract_tagged
1094# but pre-compiled and optimized for a specific pair of tags, and consequently
1095# much faster (i.e. 3 times faster). It uses qr// for better performance on
1096# repeated calls, so it only works under Perl 5.005 or later.
1097
1098 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1099
1100 ($extracted, $remainder) = $extract_head->($text);
1101
1102
1103=head1 DESCRIPTION
1104
1105The various C<extract_...> subroutines may be used to extract a
1106delimited string (possibly after skipping a specified prefix string).
1107The search for the string always begins at the current C<pos>
1108location of the string's variable (or at index zero, if no C<pos>
1109position is defined).
1110
1111=head2 General behaviour in list contexts
1112
1113In a list context, all the subroutines return a list, the first three
1114elements of which are always:
1115
1116=over 4
1117
1118=item [0]
1119
1120The extracted string, including the specified delimiters.
1121If the extraction fails an empty string is returned.
1122
1123=item [1]
1124
1125The remainder of the input string (i.e. the characters after the
1126extracted string). On failure, the entire string is returned.
1127
1128=item [2]
1129
1130The skipped prefix (i.e. the characters before the extracted string).
1131On failure, the empty string is returned.
1132
1133=back
1134
1135Note that in a list context, the contents of the original input text (the first
1136argument) are not modified in any way.
1137
1138However, if the input text was passed in a variable, that variable's
1139C<pos> value is updated to point at the first character after the
1140extracted text. That means that in a list context the various
1141subroutines can be used much like regular expressions. For example:
1142
1143 while ( $next = (extract_quotelike($text))[0] )
1144 {
1145 # process next quote-like (in $next)
1146 }
1147
1148
1149=head2 General behaviour in scalar and void contexts
1150
1151In a scalar context, the extracted string is returned, having first been
1152removed from the input text. Thus, the following code also processes
1153each quote-like operation, but actually removes them from $text:
1154
1155 while ( $next = extract_quotelike($text) )
1156 {
1157 # process next quote-like (in $next)
1158 }
1159
1160Note that if the input text is a read-only string (i.e. a literal),
1161no attempt is made to remove the extracted text.
1162
1163In a void context the behaviour of the extraction subroutines is
1164exactly the same as in a scalar context, except (of course) that the
1165extracted substring is not returned.
1166
1167=head2 A note about prefixes
1168
1169Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1170This can bite you if you're expecting a prefix specification like
1171'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1172pattern will only succeed if the <H1> tag is on the current line, since
1173. normally doesn't match newlines.
1174
1175To overcome this limitation, you need to turn on /s matching within
1176the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1177
1178
1179=head2 C<extract_delimited>
1180
1181The C<extract_delimited> function formalizes the common idiom
1182of extracting a single-character-delimited substring from the start of
1183a string. For example, to extract a single-quote delimited string, the
1184following code is typically used:
1185
1186 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1187 $extracted = $1;
1188
1189but with C<extract_delimited> it can be simplified to:
1190
1191 ($extracted,$remainder) = extract_delimited($text, "'");
1192
1193C<extract_delimited> takes up to four scalars (the input text, the
1194delimiters, a prefix pattern to be skipped, and any escape characters)
1195and extracts the initial substring of the text that
1196is appropriately delimited. If the delimiter string has multiple
1197characters, the first one encountered in the text is taken to delimit
1198the substring.
1199The third argument specifies a prefix pattern that is to be skipped
1200(but must be present!) before the substring is extracted.
1201The final argument specifies the escape character to be used for each
1202delimiter.
1203
1204All arguments are optional. If the escape characters are not specified,
1205every delimiter is escaped with a backslash (C<\>).
1206If the prefix is not specified, the
1207pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1208is also not specified, the set C</["'`]/> is used. If the text to be processed
1209is not specified either, C<$_> is used.
1210
d1be9408 1211In list context, C<extract_delimited> returns an array of three
55a1c97c 1212elements, the extracted substring (I<including the surrounding
1213delimiters>), the remainder of the text, and the skipped prefix (if
1214any). If a suitable delimited substring is not found, the first
1215element of the array is the empty string, the second is the complete
1216original text, and the prefix returned in the third element is an
1217empty string.
1218
1219In a scalar context, just the extracted substring is returned. In
1220a void context, the extracted substring (and any prefix) are simply
1221removed from the beginning of the first argument.
1222
1223Examples:
1224
1225 # Remove a single-quoted substring from the very beginning of $text:
1226
1227 $substring = extract_delimited($text, "'", '');
1228
1229 # Remove a single-quoted Pascalish substring (i.e. one in which
1230 # doubling the quote character escapes it) from the very
1231 # beginning of $text:
1232
1233 $substring = extract_delimited($text, "'", '', "'");
1234
1235 # Extract a single- or double- quoted substring from the
1236 # beginning of $text, optionally after some whitespace
1237 # (note the list context to protect $text from modification):
1238
1239 ($substring) = extract_delimited $text, q{"'};
1240
1241
1242 # Delete the substring delimited by the first '/' in $text:
1243
1244 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1245
1246Note that this last example is I<not> the same as deleting the first
1247quote-like pattern. For instance, if C<$text> contained the string:
1248
1249 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1250
1251then after the deletion it would contain:
1252
1253 "if ('.$UNIXCMD/s) { $cmd = $1; }"
1254
1255not:
1256
1257 "if ('./cmd' =~ ms) { $cmd = $1; }"
1258
1259
1260See L<"extract_quotelike"> for a (partial) solution to this problem.
1261
1262
1263=head2 C<extract_bracketed>
1264
1265Like C<"extract_delimited">, the C<extract_bracketed> function takes
1266up to three optional scalar arguments: a string to extract from, a delimiter
1267specifier, and a prefix pattern. As before, a missing prefix defaults to
1268optional whitespace and a missing text defaults to C<$_>. However, a missing
1269delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1270
1271C<extract_bracketed> extracts a balanced-bracket-delimited
1272substring (using any one (or more) of the user-specified delimiter
1273brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1274respect quoted unbalanced brackets (see below).
1275
1276A "delimiter bracket" is a bracket in list of delimiters passed as
1277C<extract_bracketed>'s second argument. Delimiter brackets are
1278specified by giving either the left or right (or both!) versions
1279of the required bracket(s). Note that the order in which
1280two or more delimiter brackets are specified is not significant.
1281
1282A "balanced-bracket-delimited substring" is a substring bounded by
1283matched brackets, such that any other (left or right) delimiter
1284bracket I<within> the substring is also matched by an opposite
1285(right or left) delimiter bracket I<at the same level of nesting>. Any
1286type of bracket not in the delimiter list is treated as an ordinary
1287character.
1288
1289In other words, each type of bracket specified as a delimiter must be
1290balanced and correctly nested within the substring, and any other kind of
1291("non-delimiter") bracket in the substring is ignored.
1292
1293For example, given the string:
1294
1295 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1296
1297then a call to C<extract_bracketed> in a list context:
1298
1299 @result = extract_bracketed( $text, '{}' );
1300
1301would return:
1302
1303 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1304
1305since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1306(In a scalar context just the first element of the array would be returned. In
1307a void context, C<$text> would be replaced by an empty string.)
1308
1309Likewise the call in:
1310
1311 @result = extract_bracketed( $text, '{[' );
1312
1313would return the same result, since all sets of both types of specified
1314delimiter brackets are correctly nested and balanced.
1315
1316However, the call in:
1317
1318 @result = extract_bracketed( $text, '{([<' );
1319
1320would fail, returning:
1321
1322 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1323
1324because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1325the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1326return an empty string. In a void context, C<$text> would be unchanged.)
1327
1328Note that the embedded single-quotes in the string don't help in this
1329case, since they have not been specified as acceptable delimiters and are
1330therefore treated as non-delimiter characters (and ignored).
1331
1332However, if a particular species of quote character is included in the
1333delimiter specification, then that type of quote will be correctly handled.
1334for example, if C<$text> is:
1335
1336 $text = '<A HREF=">>>>">link</A>';
1337
1338then
1339
1340 @result = extract_bracketed( $text, '<">' );
1341
1342returns:
1343
1344 ( '<A HREF=">>>>">', 'link</A>', "" )
1345
1346as expected. Without the specification of C<"> as an embedded quoter:
1347
1348 @result = extract_bracketed( $text, '<>' );
1349
1350the result would be:
1351
1352 ( '<A HREF=">', '>>>">link</A>', "" )
1353
1354In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1355quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1356letter 'q' as a delimiter. Hence:
1357
1358 @result = extract_bracketed( $text, '<q>' );
1359
1360would correctly match something like this:
1361
1362 $text = '<leftop: conj /and/ conj>';
1363
1364See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1365
1366
1367=head2 C<extract_tagged>
1368
1369C<extract_tagged> extracts and segments text between (balanced)
1370specified tags.
1371
1372The subroutine takes up to five optional arguments:
1373
1374=over 4
1375
1376=item 1.
1377
1378A string to be processed (C<$_> if the string is omitted or C<undef>)
1379
1380=item 2.
1381
1382A string specifying a pattern to be matched as the opening tag.
1383If the pattern string is omitted (or C<undef>) then a pattern
1384that matches any standard HTML/XML tag is used.
1385
1386=item 3.
1387
1388A string specifying a pattern to be matched at the closing tag.
1389If the pattern string is omitted (or C<undef>) then the closing
1390tag is constructed by inserting a C</> after any leading bracket
1391characters in the actual opening tag that was matched (I<not> the pattern
1392that matched the tag). For example, if the opening tag pattern
1393is specified as C<'{{\w+}}'> and actually matched the opening tag
1394C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1395
1396=item 4.
1397
1398A string specifying a pattern to be matched as a prefix (which is to be
1399skipped). If omitted, optional whitespace is skipped.
1400
1401=item 5.
1402
1403A hash reference containing various parsing options (see below)
1404
1405=back
1406
1407The various options that can be specified are:
1408
1409=over 4
1410
1411=item C<reject =E<gt> $listref>
1412
1413The list reference contains one or more strings specifying patterns
1414that must I<not> appear within the tagged text.
1415
1416For example, to extract
1417an HTML link (which should not contain nested links) use:
1418
1419 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1420
1421=item C<ignore =E<gt> $listref>
1422
1423The list reference contains one or more strings specifying patterns
1424that are I<not> be be treated as nested tags within the tagged text
1425(even if they would match the start tag pattern).
1426
1427For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1428
1429 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1430
1431(also see L<"gen_delimited_pat"> below).
1432
1433
1434=item C<fail =E<gt> $str>
1435
1436The C<fail> option indicates the action to be taken if a matching end
1437tag is not encountered (i.e. before the end of the string or some
1438C<reject> pattern matches). By default, a failure to match a closing
1439tag causes C<extract_tagged> to immediately fail.
1440
1441However, if the string value associated with <reject> is "MAX", then
1442C<extract_tagged> returns the complete text up to the point of failure.
1443If the string is "PARA", C<extract_tagged> returns only the first paragraph
1444after the tag (up to the first line that is either empty or contains
1445only whitespace characters).
d1be9408 1446If the string is "", the default behaviour (i.e. failure) is reinstated.
55a1c97c 1447
1448For example, suppose the start tag "/para" introduces a paragraph, which then
1449continues until the next "/endpara" tag or until another "/para" tag is
1450encountered:
1451
1452 $text = "/para line 1\n\nline 3\n/para line 4";
1453
1454 extract_tagged($text, '/para', '/endpara', undef,
1455 {reject => '/para', fail => MAX );
1456
1457 # EXTRACTED: "/para line 1\n\nline 3\n"
1458
1459Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1460tag refers only to the immediately following paragraph:
1461
1462 $text = "/para line 1\n\nline 3\n/para line 4";
1463
1464 extract_tagged($text, '/para', '/endpara', undef,
1465 {reject => '/para', fail => MAX );
1466
1467 # EXTRACTED: "/para line 1\n"
1468
1469Note that the specified C<fail> behaviour applies to nested tags as well.
1470
1471=back
1472
1473On success in a list context, an array of 6 elements is returned. The elements are:
1474
1475=over 4
1476
1477=item [0]
1478
1479the extracted tagged substring (including the outermost tags),
1480
1481=item [1]
1482
1483the remainder of the input text,
1484
1485=item [2]
1486
1487the prefix substring (if any),
1488
1489=item [3]
1490
1491the opening tag
1492
1493=item [4]
1494
1495the text between the opening and closing tags
1496
1497=item [5]
1498
1499the closing tag (or "" if no closing tag was found)
1500
1501=back
1502
1503On failure, all of these values (except the remaining text) are C<undef>.
1504
1505In a scalar context, C<extract_tagged> returns just the complete
1506substring that matched a tagged text (including the start and end
1507tags). C<undef> is returned on failure. In addition, the original input
1508text has the returned substring (and any prefix) removed from it.
1509
1510In a void context, the input text just has the matched substring (and
1511any specified prefix) removed.
1512
1513
1514=head2 C<gen_extract_tagged>
1515
1516(Note: This subroutine is only available under Perl5.005)
1517
1518C<gen_extract_tagged> generates a new anonymous subroutine which
1519extracts text between (balanced) specified tags. In other words,
1520it generates a function identical in function to C<extract_tagged>.
1521
1522The difference between C<extract_tagged> and the anonymous
1523subroutines generated by
1524C<gen_extract_tagged>, is that those generated subroutines:
1525
1526=over 4
1527
1528=item *
1529
1530do not have to reparse tag specification or parsing options every time
1531they are called (whereas C<extract_tagged> has to effectively rebuild
1532its tag parser on every call);
1533
1534=item *
1535
1536make use of the new qr// construct to pre-compile the regexes they use
1537(whereas C<extract_tagged> uses standard string variable interpolation
1538to create tag-matching patterns).
1539
1540=back
1541
1542The subroutine takes up to four optional arguments (the same set as
1543C<extract_tagged> except for the string to be processed). It returns
1544a reference to a subroutine which in turn takes a single argument (the text to
1545be extracted from).
1546
1547In other words, the implementation of C<extract_tagged> is exactly
1548equivalent to:
1549
1550 sub extract_tagged
1551 {
1552 my $text = shift;
1553 $extractor = gen_extract_tagged(@_);
1554 return $extractor->($text);
1555 }
1556
1557(although C<extract_tagged> is not currently implemented that way, in order
1558to preserve pre-5.005 compatibility).
1559
1560Using C<gen_extract_tagged> to create extraction functions for specific tags
1561is a good idea if those functions are going to be called more than once, since
1562their performance is typically twice as good as the more general-purpose
1563C<extract_tagged>.
1564
1565
1566=head2 C<extract_quotelike>
1567
1568C<extract_quotelike> attempts to recognize, extract, and segment any
1569one of the various Perl quotes and quotelike operators (see
1570L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1571delimiters (for the quotelike operators), and trailing modifiers are
1572all caught. For example, in:
1573
1574 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1575
1576 extract_quotelike ' "You said, \"Use sed\"." '
1577
1578 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1579
1580 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1581
1582the full Perl quotelike operations are all extracted correctly.
1583
1584Note too that, when using the /x modifier on a regex, any comment
1585containing the current pattern delimiter will cause the regex to be
1586immediately terminated. In other words:
1587
1588 'm /
1589 (?i) # CASE INSENSITIVE
1590 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
1591 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1592 /x'
1593
1594will be extracted as if it were:
1595
1596 'm /
1597 (?i) # CASE INSENSITIVE
1598 [a-z_] # LEADING ALPHABETIC/'
1599
1600This behaviour is identical to that of the actual compiler.
1601
1602C<extract_quotelike> takes two arguments: the text to be processed and
1603a prefix to be matched at the very beginning of the text. If no prefix
1604is specified, optional whitespace is the default. If no text is given,
1605C<$_> is used.
1606
1607In a list context, an array of 11 elements is returned. The elements are:
1608
1609=over 4
1610
1611=item [0]
1612
1613the extracted quotelike substring (including trailing modifiers),
1614
1615=item [1]
1616
1617the remainder of the input text,
1618
1619=item [2]
1620
1621the prefix substring (if any),
1622
1623=item [3]
1624
1625the name of the quotelike operator (if any),
1626
1627=item [4]
1628
1629the left delimiter of the first block of the operation,
1630
1631=item [5]
1632
1633the text of the first block of the operation
1634(that is, the contents of
1635a quote, the regex of a match or substitution or the target list of a
1636translation),
1637
1638=item [6]
1639
1640the right delimiter of the first block of the operation,
1641
1642=item [7]
1643
1644the left delimiter of the second block of the operation
d1be9408 1645(that is, if it is an C<s>, C<tr>, or C<y>),
55a1c97c 1646
1647=item [8]
1648
1649the text of the second block of the operation
1650(that is, the replacement of a substitution or the translation list
1651of a translation),
1652
1653=item [9]
1654
1655the right delimiter of the second block of the operation (if any),
1656
1657=item [10]
1658
1659the trailing modifiers on the operation (if any).
1660
1661=back
1662
1663For each of the fields marked "(if any)" the default value on success is
1664an empty string.
1665On failure, all of these values (except the remaining text) are C<undef>.
1666
1667
1668In a scalar context, C<extract_quotelike> returns just the complete substring
1669that matched a quotelike operation (or C<undef> on failure). In a scalar or
1670void context, the input text has the same substring (and any specified
1671prefix) removed.
1672
1673Examples:
1674
1675 # Remove the first quotelike literal that appears in text
1676
1677 $quotelike = extract_quotelike($text,'.*?');
1678
1679 # Replace one or more leading whitespace-separated quotelike
1680 # literals in $_ with "<QLL>"
1681
1682 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1683
1684
1685 # Isolate the search pattern in a quotelike operation from $text
1686
1687 ($op,$pat) = (extract_quotelike $text)[3,5];
1688 if ($op =~ /[ms]/)
1689 {
1690 print "search pattern: $pat\n";
1691 }
1692 else
1693 {
1694 print "$op is not a pattern matching operation\n";
1695 }
1696
1697
1698=head2 C<extract_quotelike> and "here documents"
1699
1700C<extract_quotelike> can successfully extract "here documents" from an input
1701string, but with an important caveat in list contexts.
1702
1703Unlike other types of quote-like literals, a here document is rarely
1704a contiguous substring. For example, a typical piece of code using
1705here document might look like this:
1706
1707 <<'EOMSG' || die;
1708 This is the message.
1709 EOMSG
1710 exit;
1711
1712Given this as an input string in a scalar context, C<extract_quotelike>
1713would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1714leaving the string " || die;\nexit;" in the original variable. In other words,
1715the two separate pieces of the here document are successfully extracted and
1716concatenated.
1717
1718In a list context, C<extract_quotelike> would return the list
1719
1720=over 4
1721
1722=item [0]
1723
1724"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1725including fore and aft delimiters),
1726
1727=item [1]
1728
1729" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1730
1731=item [2]
1732
1733"" (i.e. the prefix substring -- trivial in this case),
1734
1735=item [3]
1736
1737"<<" (i.e. the "name" of the quotelike operator)
1738
1739=item [4]
1740
1741"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1742
1743=item [5]
1744
1745"This is the message.\n" (i.e. the text of the here document),
1746
1747=item [6]
1748
1749"EOMSG" (i.e. the right delimiter of the here document),
1750
1751=item [7..10]
1752
1753"" (a here document has no second left delimiter, second text, second right
1754delimiter, or trailing modifiers).
1755
1756=back
1757
1758However, the matching position of the input variable would be set to
1759"exit;" (i.e. I<after> the closing delimiter of the here document),
1760which would cause the earlier " || die;\nexit;" to be skipped in any
1761sequence of code fragment extractions.
1762
d1be9408 1763To avoid this problem, when it encounters a here document while
55a1c97c 1764extracting from a modifiable string, C<extract_quotelike> silently
1765rearranges the string to an equivalent piece of Perl:
1766
1767 <<'EOMSG'
1768 This is the message.
1769 EOMSG
1770 || die;
1771 exit;
1772
1773in which the here document I<is> contiguous. It still leaves the
1774matching position after the here document, but now the rest of the line
1775on which the here document starts is not skipped.
1776
1777To prevent <extract_quotelike> from mucking about with the input in this way
1778(this is the only case where a list-context C<extract_quotelike> does so),
1779you can pass the input variable as an interpolated literal:
1780
1781 $quotelike = extract_quotelike("$var");
1782
1783
1784=head2 C<extract_codeblock>
1785
1786C<extract_codeblock> attempts to recognize and extract a balanced
1787bracket delimited substring that may contain unbalanced brackets
1788inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1789is like a combination of C<"extract_bracketed"> and
1790C<"extract_quotelike">.
1791
1792C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1793a text to process, a set of delimiter brackets to look for, and a prefix to
1794match first. It also takes an optional fourth parameter, which allows the
1795outermost delimiter brackets to be specified separately (see below).
1796
1797Omitting the first argument (input text) means process C<$_> instead.
1798Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1799Omitting the third argument (prefix argument) implies optional whitespace at the start.
1800Omitting the fourth argument (outermost delimiter brackets) indicates that the
1801value of the second argument is to be used for the outermost delimiters.
1802
d1be9408 1803Once the prefix an the outermost opening delimiter bracket have been
55a1c97c 1804recognized, code blocks are extracted by stepping through the input text and
1805trying the following alternatives in sequence:
1806
1807=over 4
1808
1809=item 1.
1810
1811Try and match a closing delimiter bracket. If the bracket was the same
1812species as the last opening bracket, return the substring to that
1813point. If the bracket was mismatched, return an error.
1814
1815=item 2.
1816
1817Try to match a quote or quotelike operator. If found, call
1818C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1819the error it returned. Otherwise go back to step 1.
1820
1821=item 3.
1822
1823Try to match an opening delimiter bracket. If found, call
1824C<extract_codeblock> recursively to eat the embedded block. If the
1825recursive call fails, return an error. Otherwise, go back to step 1.
1826
1827=item 4.
1828
1829Unconditionally match a bareword or any other single character, and
1830then go back to step 1.
1831
1832=back
1833
1834
1835Examples:
1836
1837 # Find a while loop in the text
1838
1839 if ($text =~ s/.*?while\s*\{/{/)
1840 {
1841 $loop = "while " . extract_codeblock($text);
1842 }
1843
1844 # Remove the first round-bracketed list (which may include
1845 # round- or curly-bracketed code blocks or quotelike operators)
1846
1847 extract_codeblock $text, "(){}", '[^(]*';
1848
1849
1850The ability to specify a different outermost delimiter bracket is useful
1851in some circumstances. For example, in the Parse::RecDescent module,
1852parser actions which are to be performed only on a successful parse
1853are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1854
1855 sentence: subject verb object
1856 <defer: {$::theVerb = $item{verb}} >
1857
1858Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1859within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1860
1861A deferred action like this:
1862
1863 <defer: {if ($count>10) {$count--}} >
1864
1865will be incorrectly parsed as:
1866
1867 <defer: {if ($count>
1868
1869because the "less than" operator is interpreted as a closing delimiter.
1870
1871But, by extracting the directive using
1872S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1873the '>' character is only treated as a delimited at the outermost
1874level of the code block, so the directive is parsed correctly.
1875
1876=head2 C<extract_multiple>
1877
1878The C<extract_multiple> subroutine takes a string to be processed and a
1879list of extractors (subroutines or regular expressions) to apply to that string.
1880
1881In an array context C<extract_multiple> returns an array of substrings
1882of the original string, as extracted by the specified extractors.
1883In a scalar context, C<extract_multiple> returns the first
1884substring successfully extracted from the original string. In both
1885scalar and void contexts the original string has the first successfully
1886extracted substring removed from it. In all contexts
1887C<extract_multiple> starts at the current C<pos> of the string, and
1888sets that C<pos> appropriately after it matches.
1889
d1be9408 1890Hence, the aim of a call to C<extract_multiple> in a list context
55a1c97c 1891is to split the processed string into as many non-overlapping fields as
1892possible, by repeatedly applying each of the specified extractors
1893to the remainder of the string. Thus C<extract_multiple> is
1894a generalized form of Perl's C<split> subroutine.
1895
1896The subroutine takes up to four optional arguments:
1897
1898=over 4
1899
1900=item 1.
1901
1902A string to be processed (C<$_> if the string is omitted or C<undef>)
1903
1904=item 2.
1905
1906A reference to a list of subroutine references and/or qr// objects and/or
1907literal strings and/or hash references, specifying the extractors
1908to be used to split the string. If this argument is omitted (or
1909C<undef>) the list:
1910
1911 [
1912 sub { extract_variable($_[0], '') },
1913 sub { extract_quotelike($_[0],'') },
1914 sub { extract_codeblock($_[0],'{}','') },
1915 ]
1916
1917is used.
1918
1919
1920=item 3.
1921
d1be9408 1922A number specifying the maximum number of fields to return. If this
55a1c97c 1923argument is omitted (or C<undef>), split continues as long as possible.
1924
1925If the third argument is I<N>, then extraction continues until I<N> fields
1926have been successfully extracted, or until the string has been completely
1927processed.
1928
1929Note that in scalar and void contexts the value of this argument is
1930automatically reset to 1 (under C<-w>, a warning is issued if the argument
1931has to be reset).
1932
1933=item 4.
1934
1935A value indicating whether unmatched substrings (see below) within the
1936text should be skipped or returned as fields. If the value is true,
1937such substrings are skipped. Otherwise, they are returned.
1938
1939=back
1940
1941The extraction process works by applying each extractor in
a7602084 1942sequence to the text string.
1943
1944If the extractor is a subroutine it is called in a list context and is
1945expected to return a list of a single element, namely the extracted
1946text. It may optionally also return two further arguments: a string
1947representing the text left after extraction (like $' for a pattern
1948match), and a string representing any prefix skipped before the
1949extraction (like $` in a pattern match). Note that this is designed
1950to facilitate the use of other Text::Balanced subroutines with
1951C<extract_multiple>. Note too that the value returned by an extractor
1952subroutine need not bear any relationship to the corresponding substring
1953of the original text (see examples below).
55a1c97c 1954
1955If the extractor is a precompiled regular expression or a string,
1956it is matched against the text in a scalar context with a leading
1957'\G' and the gc modifiers enabled. The extracted value is either
1958$1 if that variable is defined after the match, or else the
1959complete match (i.e. $&).
1960
1961If the extractor is a hash reference, it must contain exactly one element.
1962The value of that element is one of the
1963above extractor types (subroutine reference, regular expression, or string).
1964The key of that element is the name of a class into which the successful
1965return value of the extractor will be blessed.
1966
1967If an extractor returns a defined value, that value is immediately
1968treated as the next extracted field and pushed onto the list of fields.
1969If the extractor was specified in a hash reference, the field is also
1970blessed into the appropriate class,
1971
1972If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
1973assumed to have failed to extract.
1974If none of the extractor subroutines succeeds, then one
1975character is extracted from the start of the text and the extraction
1976subroutines reapplied. Characters which are thus removed are accumulated and
1977eventually become the next field (unless the fourth argument is true, in which
d1be9408 1978case they are discarded).
55a1c97c 1979
1980For example, the following extracts substrings that are valid Perl variables:
1981
1982 @fields = extract_multiple($text,
1983 [ sub { extract_variable($_[0]) } ],
1984 undef, 1);
1985
1986This example separates a text into fields which are quote delimited,
1987curly bracketed, and anything else. The delimited and bracketed
1988parts are also blessed to identify them (the "anything else" is unblessed):
1989
1990 @fields = extract_multiple($text,
1991 [
1992 { Delim => sub { extract_delimited($_[0],q{'"}) } },
1993 { Brack => sub { extract_bracketed($_[0],'{}') } },
1994 ]);
1995
1996This call extracts the next single substring that is a valid Perl quotelike
1997operator (and removes it from $text):
1998
1999 $quotelike = extract_multiple($text,
2000 [
2001 sub { extract_quotelike($_[0]) },
2002 ], undef, 1);
2003
2004Finally, here is yet another way to do comma-separated value parsing:
2005
2006 @fields = extract_multiple($csv_text,
2007 [
2008 sub { extract_delimited($_[0],q{'"}) },
2009 qr/([^,]+)(.*)/,
2010 ],
2011 undef,1);
2012
2013The list in the second argument means:
2014I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2015The undef third argument means:
2016I<"...as many times as possible...">,
2017and the true value in the fourth argument means
2018I<"...discarding anything else that appears (i.e. the commas)">.
2019
2020If you wanted the commas preserved as separate fields (i.e. like split
2021does if your split pattern has capturing parentheses), you would
2022just make the last parameter undefined (or remove it).
2023
2024
2025=head2 C<gen_delimited_pat>
2026
2027The C<gen_delimited_pat> subroutine takes a single (string) argument and
2028 > builds a Friedl-style optimized regex that matches a string delimited
2029by any one of the characters in the single argument. For example:
2030
2031 gen_delimited_pat(q{'"})
2032
2033returns the regex:
2034
2035 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2036
2037Note that the specified delimiters are automatically quotemeta'd.
2038
2039A typical use of C<gen_delimited_pat> would be to build special purpose tags
2040for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2041(which might contain quoted strings):
2042
2043 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2044
2045 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2046
2047
2048C<gen_delimited_pat> may also be called with an optional second argument,
2049which specifies the "escape" character(s) to be used for each delimiter.
2050For example to match a Pascal-style string (where ' is the delimiter
2051and '' is a literal ' within the string):
2052
2053 gen_delimited_pat(q{'},q{'});
2054
2055Different escape characters can be specified for different delimiters.
2056For example, to specify that '/' is the escape for single quotes
2057and '%' is the escape for double quotes:
2058
2059 gen_delimited_pat(q{'"},q{/%});
2060
2061If more delimiters than escape chars are specified, the last escape char
2062is used for the remaining delimiters.
2063If no escape char is specified for a given specified delimiter, '\' is used.
2064
2065Note that
2066C<gen_delimited_pat> was previously called
2067C<delimited_pat>. That name may still be used, but is now deprecated.
2068
2069
2070=head1 DIAGNOSTICS
2071
2072In a list context, all the functions return C<(undef,$original_text)>
2073on failure. In a scalar context, failure is indicated by returning C<undef>
2074(in this case the input text is not modified in any way).
2075
2076In addition, on failure in I<any> context, the C<$@> variable is set.
2077Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2078below.
2079Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2080which the error was detected (although not necessarily where it occurred!)
2081Printing C<$@> directly produces the error message, with the offset appended.
2082On success, the C<$@> variable is guaranteed to be C<undef>.
2083
2084The available diagnostics are:
2085
2086=over 4
2087
2088=item C<Did not find a suitable bracket: "%s">
2089
2090The delimiter provided to C<extract_bracketed> was not one of
2091C<'()[]E<lt>E<gt>{}'>.
2092
2093=item C<Did not find prefix: /%s/>
2094
2095A non-optional prefix was specified but wasn't found at the start of the text.
2096
2097=item C<Did not find opening bracket after prefix: "%s">
2098
2099C<extract_bracketed> or C<extract_codeblock> was expecting a
2100particular kind of bracket at the start of the text, and didn't find it.
2101
2102=item C<No quotelike operator found after prefix: "%s">
2103
2104C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2105C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2106it was extracting.
2107
2108=item C<Unmatched closing bracket: "%c">
2109
2110C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2111a closing bracket where none was expected.
2112
2113=item C<Unmatched opening bracket(s): "%s">
2114
2115C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2116out of characters in the text before closing one or more levels of nested
2117brackets.
2118
2119=item C<Unmatched embedded quote (%s)>
2120
2121C<extract_bracketed> attempted to match an embedded quoted substring, but
2122failed to find a closing quote to match it.
2123
2124=item C<Did not find closing delimiter to match '%s'>
2125
2126C<extract_quotelike> was unable to find a closing delimiter to match the
2127one that opened the quote-like operation.
2128
2129=item C<Mismatched closing bracket: expected "%c" but found "%s">
2130
2131C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2132a valid bracket delimiter, but it was the wrong species. This usually
2133indicates a nesting error, but may indicate incorrect quoting or escaping.
2134
2135=item C<No block delimiter found after quotelike "%s">
2136
2137C<extract_quotelike> or C<extract_codeblock> found one of the
2138quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2139without a suitable block after it.
2140
2141=item C<Did not find leading dereferencer>
2142
2143C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2144a variable, but didn't find any of them.
2145
2146=item C<Bad identifier after dereferencer>
2147
2148C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2149character was not followed by a legal Perl identifier.
2150
2151=item C<Did not find expected opening bracket at %s>
2152
2153C<extract_codeblock> failed to find any of the outermost opening brackets
2154that were specified.
2155
2156=item C<Improperly nested codeblock at %s>
2157
2158A nested code block was found that started with a delimiter that was specified
2159as being only to be used as an outermost bracket.
2160
2161=item C<Missing second block for quotelike "%s">
2162
2163C<extract_codeblock> or C<extract_quotelike> found one of the
2164quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2165
2166=item C<No match found for opening bracket>
2167
2168C<extract_codeblock> failed to find a closing bracket to match the outermost
2169opening bracket.
2170
2171=item C<Did not find opening tag: /%s/>
2172
2173C<extract_tagged> did not find a suitable opening tag (after any specified
2174prefix was removed).
2175
2176=item C<Unable to construct closing tag to match: /%s/>
2177
2178C<extract_tagged> matched the specified opening tag and tried to
2179modify the matched text to produce a matching closing tag (because
2180none was specified). It failed to generate the closing tag, almost
2181certainly because the opening tag did not start with a
2182bracket of some kind.
2183
2184=item C<Found invalid nested tag: %s>
2185
2186C<extract_tagged> found a nested tag that appeared in the "reject" list
2187(and the failure mode was not "MAX" or "PARA").
2188
2189=item C<Found unbalanced nested tag: %s>
2190
2191C<extract_tagged> found a nested opening tag that was not matched by a
2192corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2193
2194=item C<Did not find closing tag>
2195
2196C<extract_tagged> reached the end of the text without finding a closing tag
2197to match the original opening tag (and the failure mode was not
2198"MAX" or "PARA").
2199
2200
2201
2202
2203=back
2204
2205
2206=head1 AUTHOR
2207
2208Damian Conway (damian@conway.org)
2209
2210
2211=head1 BUGS AND IRRITATIONS
2212
2213There are undoubtedly serious bugs lurking somewhere in this code, if
2214only because parts of it give the impression of understanding a great deal
2215more about Perl than they really do.
2216
2217Bug reports and other feedback are most welcome.
2218
2219
2220=head1 COPYRIGHT
2221
2222 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2223 This module is free software. It may be used, redistributed
2224 and/or modified under the same terms as Perl itself.