73d633241a76cc34151058715875313bc7dcc39a
[p5sagit/p5-mst-13.2.git] / lib / Text / Balanced.pm
1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2 # FOR FULL DOCUMENTATION SEE Balanced.pod
3
4 use 5.005;
5 use strict;
6
7 package Text::Balanced;
8
9 use Exporter;
10 use SelfLoader;
11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
12
13 $VERSION = '1.85';
14 @ISA            = qw ( Exporter );
15                      
16 %EXPORT_TAGS    = ( ALL => [ qw(
17                                 &extract_delimited
18                                 &extract_bracketed
19                                 &extract_quotelike
20                                 &extract_codeblock
21                                 &extract_variable
22                                 &extract_tagged
23                                 &extract_multiple
24
25                                 &gen_delimited_pat
26                                 &gen_extract_tagged
27
28                                 &delimited_pat
29                                ) ] );
30
31 Exporter::export_ok_tags('ALL');
32
33 # PROTOTYPES
34
35 sub _match_bracketed($$$$$$);
36 sub _match_variable($$);
37 sub _match_codeblock($$$$$$$);
38 sub _match_quotelike($$$$);
39
40 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
42 sub _failmsg {
43         my ($message, $pos) = @_;
44         $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
45 }
46
47 sub _fail
48 {
49         my ($wantarray, $textref, $message, $pos) = @_;
50         _failmsg $message, $pos if $message;
51         return ("",$$textref,"") if $wantarray;
52         return undef;
53 }
54
55 sub _succeed
56 {
57         $@ = undef;
58         my ($wantarray,$textref) = splice @_, 0, 2;
59         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
60         my ($startlen) = $_[5];
61         my $remainderpos = $_[2];
62         if ($wantarray)
63         {
64                 my @res;
65                 while (my ($from, $len) = splice @_, 0, 2)
66                 {
67                         push @res, substr($$textref,$from,$len);
68                 }
69                 if ($extralen) {        # CORRECT FILLET
70                         my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
71                         $res[1] = "$extra$res[1]";
72                         eval { substr($$textref,$remainderpos,0) = $extra;
73                                substr($$textref,$extrapos,$extralen,"\n")} ;
74                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
75                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
76                 }
77                 else {
78                         pos($$textref) = $remainderpos;             # RESET \G
79                 }
80                 return @res;
81         }
82         else
83         {
84                 my $match = substr($$textref,$_[0],$_[1]);
85                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
86                 my $extra = $extralen
87                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
88                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
89                 pos($$textref) = $_[4];                         # RESET \G
90                 return $match;
91         }
92 }
93
94 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95
96 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
97 {
98         my ($dels, $escs) = @_;
99         return "" unless $dels =~ /\S/;
100         $escs = '\\' unless $escs;
101         $escs .= substr($escs,-1) x (length($dels)-length($escs));
102         my @pat = ();
103         my $i;
104         for ($i=0; $i<length $dels; $i++)
105         {
106                 my $del = quotemeta substr($dels,$i,1);
107                 my $esc = quotemeta substr($escs,$i,1);
108                 if ($del eq $esc)
109                 {
110                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
111                 }
112                 else
113                 {
114                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
115                 }
116         }
117         my $pat = join '|', @pat;
118         return "(?:$pat)";
119 }
120
121 *delimited_pat = \&gen_delimited_pat;
122
123
124 # THE EXTRACTION FUNCTIONS
125
126 sub extract_delimited (;$$$$)
127 {
128         my $textref = defined $_[0] ? \$_[0] : \$_;
129         my $wantarray = wantarray;
130         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
131         my $pre  = defined $_[2] ? $_[2] : '\s*';
132         my $esc  = defined $_[3] ? $_[3] : qq{\\};
133         my $pat = gen_delimited_pat($del, $esc);
134         my $startpos = pos $$textref || 0;
135         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
136                 unless $$textref =~ m/\G($pre)($pat)/gc;
137         my $prelen = length($1);
138         my $matchpos = $startpos+$prelen;
139         my $endpos = pos $$textref;
140         return _succeed $wantarray, $textref,
141                         $matchpos, $endpos-$matchpos,           # MATCH
142                         $endpos,   length($$textref)-$endpos,   # REMAINDER
143                         $startpos, $prelen;                     # PREFIX
144 }
145
146 sub extract_bracketed (;$$$)
147 {
148         my $textref = defined $_[0] ? \$_[0] : \$_;
149         my $ldel = defined $_[1] ? $_[1] : '{([<';
150         my $pre  = defined $_[2] ? $_[2] : '\s*';
151         my $wantarray = wantarray;
152         my $qdel = "";
153         my $quotelike;
154         $ldel =~ s/'//g and $qdel .= q{'};
155         $ldel =~ s/"//g and $qdel .= q{"};
156         $ldel =~ s/`//g and $qdel .= q{`};
157         $ldel =~ s/q//g and $quotelike = 1;
158         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
159         my $rdel = $ldel;
160         unless ($rdel =~ tr/[({</])}>/)
161         {
162                 return _fail $wantarray, $textref,
163                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
164                              0;
165         }
166         my $posbug = pos;
167         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
168         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
169         pos = $posbug;
170
171         my $startpos = pos $$textref || 0;
172         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
173
174         return _fail ($wantarray, $textref) unless @match;
175
176         return _succeed ( $wantarray, $textref,
177                           $match[2], $match[5]+2,       # MATCH
178                           @match[8,9],                  # REMAINDER
179                           @match[0,1],                  # PREFIX
180                         );
181 }
182
183 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
184 {
185         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
186         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
187         unless ($$textref =~ m/\G$pre/gc)
188         {
189                 _failmsg "Did not find prefix: /$pre/", $startpos;
190                 return;
191         }
192
193         $ldelpos = pos $$textref;
194
195         unless ($$textref =~ m/\G($ldel)/gc)
196         {
197                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
198                          pos $$textref;
199                 pos $$textref = $startpos;
200                 return;
201         }
202
203         my @nesting = ( $1 );
204         my $textlen = length $$textref;
205         while (pos $$textref < $textlen)
206         {
207                 next if $$textref =~ m/\G\\./gcs;
208
209                 if ($$textref =~ m/\G($ldel)/gc)
210                 {
211                         push @nesting, $1;
212                 }
213                 elsif ($$textref =~ m/\G($rdel)/gc)
214                 {
215                         my ($found, $brackettype) = ($1, $1);
216                         if ($#nesting < 0)
217                         {
218                                 _failmsg "Unmatched closing bracket: \"$found\"",
219                                          pos $$textref;
220                                 pos $$textref = $startpos;
221                                 return;
222                         }
223                         my $expected = pop(@nesting);
224                         $expected =~ tr/({[</)}]>/;
225                         if ($expected ne $brackettype)
226                         {
227                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
228                                          pos $$textref;
229                                 pos $$textref = $startpos;
230                                 return;
231                         }
232                         last if $#nesting < 0;
233                 }
234                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
235                 {
236                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
237                         _failmsg "Unmatched embedded quote ($1)",
238                                  pos $$textref;
239                         pos $$textref = $startpos;
240                         return;
241                 }
242                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
243                 {
244                         next;
245                 }
246
247                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
248         }
249         if ($#nesting>=0)
250         {
251                 _failmsg "Unmatched opening bracket(s): "
252                                 . join("..",@nesting)."..",
253                          pos $$textref;
254                 pos $$textref = $startpos;
255                 return;
256         }
257
258         $endpos = pos $$textref;
259         
260         return (
261                 $startpos,  $ldelpos-$startpos,         # PREFIX
262                 $ldelpos,   1,                          # OPENING BRACKET
263                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
264                 $endpos-1,  1,                          # CLOSING BRACKET
265                 $endpos,    length($$textref)-$endpos,  # REMAINDER
266                );
267 }
268
269 sub revbracket($)
270 {
271         my $brack = reverse $_[0];
272         $brack =~ tr/[({</])}>/;
273         return $brack;
274 }
275
276 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277
278 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
279 {
280         my $textref = defined $_[0] ? \$_[0] : \$_;
281         my $ldel    = $_[1];
282         my $rdel    = $_[2];
283         my $pre     = defined $_[3] ? $_[3] : '\s*';
284         my %options = defined $_[4] ? %{$_[4]} : ();
285         my $omode   = defined $options{fail} ? $options{fail} : '';
286         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
287                     : defined($options{reject})        ? $options{reject}
288                     :                                    ''
289                     ;
290         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
291                     : defined($options{ignore})        ? $options{ignore}
292                     :                                    ''
293                     ;
294
295         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
296         $@ = undef;
297
298         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
299
300         return _fail(wantarray, $textref) unless @match;
301         return _succeed wantarray, $textref,
302                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
303                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
304 }
305
306 sub _match_tagged       # ($$$$$$$)
307 {
308         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
309         my $rdelspec;
310
311         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
312
313         unless ($$textref =~ m/\G($pre)/gc)
314         {
315                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
316                 goto failed;
317         }
318
319         $opentagpos = pos($$textref);
320
321         unless ($$textref =~ m/\G$ldel/gc)
322         {
323                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
324                 goto failed;
325         }
326
327         $textpos = pos($$textref);
328
329         if (!defined $rdel)
330         {
331                 $rdelspec = $&;
332                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
333                 {
334                         _failmsg "Unable to construct closing tag to match: $rdel",
335                                  pos $$textref;
336                         goto failed;
337                 }
338         }
339         else
340         {
341                 $rdelspec = eval "qq{$rdel}";
342         }
343
344         while (pos($$textref) < length($$textref))
345         {
346                 next if $$textref =~ m/\G\\./gc;
347
348                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
349                 {
350                         $parapos = pos($$textref) - length($1)
351                                 unless defined $parapos;
352                 }
353                 elsif ($$textref =~ m/\G($rdelspec)/gc )
354                 {
355                         $closetagpos = pos($$textref)-length($1);
356                         goto matched;
357                 }
358                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
359                 {
360                         next;
361                 }
362                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
363                 {
364                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
365                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
366                         _failmsg "Found invalid nested tag: $1", pos $$textref;
367                         goto failed;
368                 }
369                 elsif ($$textref =~ m/\G($ldel)/gc)
370                 {
371                         my $tag = $1;
372                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
373                         unless (_match_tagged(@_))      # MATCH NESTED TAG
374                         {
375                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
376                                 _failmsg "Found unbalanced nested tag: $tag",
377                                          pos $$textref;
378                                 goto failed;
379                         }
380                 }
381                 else { $$textref =~ m/./gcs }
382         }
383
384 short:
385         $closetagpos = pos($$textref);
386         goto matched if $omode eq 'MAX';
387         goto failed unless $omode eq 'PARA';
388
389         if (defined $parapos) { pos($$textref) = $parapos }
390         else                  { $parapos = pos($$textref) }
391
392         return (
393                 $startpos,    $opentagpos-$startpos,            # PREFIX
394                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
395                 $textpos,     $parapos-$textpos,                # TEXT
396                 $parapos,     0,                                # NO CLOSING TAG
397                 $parapos,     length($$textref)-$parapos,       # REMAINDER
398                );
399         
400 matched:
401         $endpos = pos($$textref);
402         return (
403                 $startpos,    $opentagpos-$startpos,            # PREFIX
404                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
405                 $textpos,     $closetagpos-$textpos,            # TEXT
406                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
407                 $endpos,      length($$textref)-$endpos,        # REMAINDER
408                );
409
410 failed:
411         _failmsg "Did not find closing tag", pos $$textref unless $@;
412         pos($$textref) = $startpos;
413         return;
414 }
415
416 sub extract_variable (;$$)
417 {
418         my $textref = defined $_[0] ? \$_[0] : \$_;
419         return ("","","") unless defined $$textref;
420         my $pre  = defined $_[1] ? $_[1] : '\s*';
421
422         my @match = _match_variable($textref,$pre);
423
424         return _fail wantarray, $textref unless @match;
425
426         return _succeed wantarray, $textref,
427                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
428 }
429
430 sub _match_variable($$)
431 {
432         my ($textref, $pre) = @_;
433         my $startpos = pos($$textref) = pos($$textref)||0;
434         unless ($$textref =~ m/\G($pre)/gc)
435         {
436                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
437                 return;
438         }
439         my $varpos = pos($$textref);
440         unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
441         {
442                 _failmsg "Did not find leading dereferencer", pos $$textref;
443                 pos $$textref = $startpos;
444                 return;
445         }
446
447         unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
448                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0))
449         {
450                 _failmsg "Bad identifier after dereferencer", pos $$textref;
451                 pos $$textref = $startpos;
452                 return;
453         }
454
455         while (1)
456         {
457                 next if _match_codeblock($textref,
458                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
459                                          qr/[({[]/, qr/[)}\]]/,
460                                          qr/[({[]/, qr/[)}\]]/, 0);
461                 next if _match_codeblock($textref,
462                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
463                                          qr/[{[]/, qr/[}\]]/, 0);
464                 next if _match_variable($textref,'\s*->\s*');
465                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
466                 last;
467         }
468         
469         my $endpos = pos($$textref);
470         return ($startpos, $varpos-$startpos,
471                 $varpos,   $endpos-$varpos,
472                 $endpos,   length($$textref)-$endpos
473                 );
474 }
475
476 sub extract_codeblock (;$$$$$)
477 {
478         my $textref = defined $_[0] ? \$_[0] : \$_;
479         my $wantarray = wantarray;
480         my $ldel_inner = defined $_[1] ? $_[1] : '{';
481         my $pre        = defined $_[2] ? $_[2] : '\s*';
482         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
483         my $rd         = $_[4];
484         my $rdel_inner = $ldel_inner;
485         my $rdel_outer = $ldel_outer;
486         my $posbug = pos;
487         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
488         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
489         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
490         {
491                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
492         }
493         pos = $posbug;
494
495         my @match = _match_codeblock($textref, $pre,
496                                      $ldel_outer, $rdel_outer,
497                                      $ldel_inner, $rdel_inner,
498                                      $rd);
499         return _fail($wantarray, $textref) unless @match;
500         return _succeed($wantarray, $textref,
501                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
502                        );
503
504 }
505
506 sub _match_codeblock($$$$$$$)
507 {
508         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
509         my $startpos = pos($$textref) = pos($$textref) || 0;
510         unless ($$textref =~ m/\G($pre)/gc)
511         {
512                 _failmsg qq{Did not match prefix /$pre/ at"} .
513                             substr($$textref,pos($$textref),20) .
514                             q{..."},
515                          pos $$textref;
516                 return; 
517         }
518         my $codepos = pos($$textref);
519         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
520         {
521                 _failmsg qq{Did not find expected opening bracket at "} .
522                              substr($$textref,pos($$textref),20) .
523                              q{..."},
524                          pos $$textref;
525                 pos $$textref = $startpos;
526                 return;
527         }
528         my $closing = $1;
529            $closing =~ tr/([<{/)]>}/;
530         my $matched;
531         my $patvalid = 1;
532         while (pos($$textref) < length($$textref))
533         {
534                 $matched = '';
535                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
536                 {
537                         $patvalid = 0;
538                         next;
539                 }
540
541                 if ($$textref =~ m/\G\s*#.*/gc)
542                 {
543                         next;
544                 }
545
546                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
547                 {
548                         unless ($matched = ($closing && $1 eq $closing) )
549                         {
550                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
551                                 _failmsg q{Mismatched closing bracket at "} .
552                                              substr($$textref,pos($$textref),20) .
553                                              qq{...". Expected '$closing'},
554                                          pos $$textref;
555                         }
556                         last;
557                 }
558
559                 if (_match_variable($textref,'\s*') ||
560                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
561                 {
562                         $patvalid = 0;
563                         next;
564                 }
565
566
567                 # NEED TO COVER MANY MORE CASES HERE!!!
568                 if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
569                                         | =(?!>)
570                                         | (\*\*|&&|\|\||<<|>>)=?
571                                         | [!=][~=]
572                                         | split|grep|map|return
573                                         )#gcx)
574                 {
575                         $patvalid = 1;
576                         next;
577                 }
578
579                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
580                 {
581                         $patvalid = 1;
582                         next;
583                 }
584
585                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
586                 {
587                         _failmsg q{Improperly nested codeblock at "} .
588                                      substr($$textref,pos($$textref),20) .
589                                      q{..."},
590                                  pos $$textref;
591                         last;
592                 }
593
594                 $patvalid = 0;
595                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
596         }
597         continue { $@ = undef }
598
599         unless ($matched)
600         {
601                 _failmsg 'No match found for opening bracket', pos $$textref
602                         unless $@;
603                 return;
604         }
605
606         my $endpos = pos($$textref);
607         return ( $startpos, $codepos-$startpos,
608                  $codepos, $endpos-$codepos,
609                  $endpos,  length($$textref)-$endpos,
610                );
611 }
612
613
614 my %mods   = (
615                 'none'  => '[cgimsox]*',
616                 'm'     => '[cgimsox]*',
617                 's'     => '[cegimsox]*',
618                 'tr'    => '[cds]*',
619                 'y'     => '[cds]*',
620                 'qq'    => '',
621                 'qx'    => '',
622                 'qw'    => '',
623                 'qr'    => '[imsx]*',
624                 'q'     => '',
625              );
626
627 sub extract_quotelike (;$$)
628 {
629         my $textref = $_[0] ? \$_[0] : \$_;
630         my $wantarray = wantarray;
631         my $pre  = defined $_[1] ? $_[1] : '\s*';
632
633         my @match = _match_quotelike($textref,$pre,1,0);
634         return _fail($wantarray, $textref) unless @match;
635         return _succeed($wantarray, $textref,
636                         $match[2], $match[18]-$match[2],        # MATCH
637                         @match[18,19],                          # REMAINDER
638                         @match[0,1],                            # PREFIX
639                         @match[2..17],                          # THE BITS
640                         @match[20,21],                          # ANY FILLET?
641                        );
642 };
643
644 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
645 {
646         my ($textref, $pre, $rawmatch, $qmark) = @_;
647
648         my ($textlen,$startpos,
649             $oppos,
650             $preld1pos,$ld1pos,$str1pos,$rd1pos,
651             $preld2pos,$ld2pos,$str2pos,$rd2pos,
652             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
653
654         unless ($$textref =~ m/\G($pre)/gc)
655         {
656                 _failmsg qq{Did not find prefix /$pre/ at "} .
657                              substr($$textref, pos($$textref), 20) .
658                              q{..."},
659                          pos $$textref;
660                 return; 
661         }
662         $oppos = pos($$textref);
663
664         my $initial = substr($$textref,$oppos,1);
665
666         if ($initial && $initial =~ m|^[\"\'\`]|
667                      || $rawmatch && $initial =~ m|^/|
668                      || $qmark && $initial =~ m|^\?|)
669         {
670                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
671                 {
672                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
673                                      substr($$textref, $oppos, 20) .
674                                      q{..."},
675                                  pos $$textref;
676                         pos $$textref = $startpos;
677                         return;
678                 }
679                 $modpos= pos($$textref);
680                 $rd1pos = $modpos-1;
681
682                 if ($initial eq '/' || $initial eq '?') 
683                 {
684                         $$textref =~ m/\G$mods{none}/gc
685                 }
686
687                 my $endpos = pos($$textref);
688                 return (
689                         $startpos,      $oppos-$startpos,       # PREFIX
690                         $oppos,         0,                      # NO OPERATOR
691                         $oppos,         1,                      # LEFT DEL
692                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
693                         $rd1pos,        1,                      # RIGHT DEL
694                         $modpos,        0,                      # NO 2ND LDEL
695                         $modpos,        0,                      # NO 2ND STR
696                         $modpos,        0,                      # NO 2ND RDEL
697                         $modpos,        $endpos-$modpos,        # MODIFIERS
698                         $endpos,        $textlen-$endpos,       # REMAINDER
699                        );
700         }
701
702         unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
703         {
704                 _failmsg q{No quotelike operator found after prefix at "} .
705                              substr($$textref, pos($$textref), 20) .
706                              q{..."},
707                          pos $$textref;
708                 pos $$textref = $startpos;
709                 return;
710         }
711
712         my $op = $1;
713         $preld1pos = pos($$textref);
714         if ($op eq '<<') {
715                 $ld1pos = pos($$textref);
716                 my $label;
717                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
718                         $label = $1;
719                 }
720                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
721                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
722                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
723                                      }gcsx) {
724                         $label = $+;
725                 }
726                 else {
727                         $label = "";
728                 }
729                 my $extrapos = pos($$textref);
730                 $$textref =~ m{.*\n}gc;
731                 $str1pos = pos($$textref);
732                 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
733                         _failmsg qq{Missing here doc terminator ('$label') after "} .
734                                      substr($$textref, $startpos, 20) .
735                                      q{..."},
736                                  pos $$textref;
737                         pos $$textref = $startpos;
738                         return;
739                 }
740                 $rd1pos = pos($$textref);
741                 $$textref =~ m{$label\n}gc;
742                 $ld2pos = pos($$textref);
743                 return (
744                         $startpos,      $oppos-$startpos,       # PREFIX
745                         $oppos,         length($op),            # OPERATOR
746                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
747                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
748                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
749                         $ld2pos,        0,                      # NO 2ND LDEL
750                         $ld2pos,        0,                      # NO 2ND STR
751                         $ld2pos,        0,                      # NO 2ND RDEL
752                         $ld2pos,        0,                      # NO MODIFIERS
753                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
754                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
755                        );
756         }
757
758         $$textref =~ m/\G\s*/gc;
759         $ld1pos = pos($$textref);
760         $str1pos = $ld1pos+1;
761
762         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
763         {
764                 _failmsg "No block delimiter found after quotelike $op",
765                          pos $$textref;
766                 pos $$textref = $startpos;
767                 return;
768         }
769         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
770         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
771         if ($ldel1 =~ /[[(<{]/)
772         {
773                 $rdel1 =~ tr/[({</])}>/;
774                 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
775                 || do { pos $$textref = $startpos; return };
776         }
777         else
778         {
779                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
780                 || do { pos $$textref = $startpos; return };
781         }
782         $ld2pos = $rd1pos = pos($$textref)-1;
783
784         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
785         if ($second_arg)
786         {
787                 my ($ldel2, $rdel2);
788                 if ($ldel1 =~ /[[(<{]/)
789                 {
790                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
791                         {
792                                 _failmsg "Missing second block for quotelike $op",
793                                          pos $$textref;
794                                 pos $$textref = $startpos;
795                                 return;
796                         }
797                         $ldel2 = $rdel2 = "\Q$1";
798                         $rdel2 =~ tr/[({</])}>/;
799                 }
800                 else
801                 {
802                         $ldel2 = $rdel2 = $ldel1;
803                 }
804                 $str2pos = $ld2pos+1;
805
806                 if ($ldel2 =~ /[[(<{]/)
807                 {
808                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD 
809                         _match_bracketed($textref,"",$ldel2,"","",$rdel2)
810                         || do { pos $$textref = $startpos; return };
811                 }
812                 else
813                 {
814                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
815                         || do { pos $$textref = $startpos; return };
816                 }
817                 $rd2pos = pos($$textref)-1;
818         }
819         else
820         {
821                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
822         }
823
824         $modpos = pos $$textref;
825
826         $$textref =~ m/\G($mods{$op})/gc;
827         my $endpos = pos $$textref;
828
829         return (
830                 $startpos,      $oppos-$startpos,       # PREFIX
831                 $oppos,         length($op),            # OPERATOR
832                 $ld1pos,        1,                      # LEFT DEL
833                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
834                 $rd1pos,        1,                      # RIGHT DEL
835                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
836                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
837                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
838                 $modpos,        $endpos-$modpos,        # MODIFIERS
839                 $endpos,        $textlen-$endpos,       # REMAINDER
840                );
841 }
842
843 my $def_func = 
844 [
845         sub { extract_variable($_[0], '') },
846         sub { extract_quotelike($_[0],'') },
847         sub { extract_codeblock($_[0],'{}','') },
848 ];
849
850 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
851 {
852         my $textref = defined($_[0]) ? \$_[0] : \$_;
853         my $posbug = pos;
854         my ($lastpos, $firstpos);
855         my @fields = ();
856
857         for ($$textref)
858         {
859                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
860                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
861                 my $igunk = $_[3];
862
863                 pos ||= 0;
864
865                 unless (wantarray)
866                 {
867                         use Carp;
868                         carp "extract_multiple reset maximal count to 1 in scalar context"
869                                 if $^W && defined($_[2]) && $max > 1;
870                         $max = 1
871                 }
872
873                 my $unkpos;
874                 my $func;
875                 my $class;
876
877                 my @class;
878                 foreach $func ( @func )
879                 {
880                         if (ref($func) eq 'HASH')
881                         {
882                                 push @class, (keys %$func)[0];
883                                 $func = (values %$func)[0];
884                         }
885                         else
886                         {
887                                 push @class, undef;
888                         }
889                 }
890
891                 FIELD: while (pos() < length())
892                 {
893                         my $field;
894                         foreach my $i ( 0..$#func )
895                         {
896                                 $func = $func[$i];
897                                 $class = $class[$i];
898                                 $lastpos = pos;
899                                 if (ref($func) eq 'CODE')
900                                         { ($field) = $func->($_) }
901                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
902                                         { $field = $func->extract($_) }
903                                 elsif( m/\G$func/gc )
904                                         { $field = defined($1) ? $1 : $& }
905
906                                 if (defined($field) && length($field))
907                                 {
908                                         if (defined($unkpos) && !$igunk)
909                                         {
910                                                 push @fields, substr($_, $unkpos, $lastpos-$unkpos);
911                                                 $firstpos = $unkpos unless defined $firstpos;
912                                                 undef $unkpos;
913                                                 last FIELD if @fields == $max;
914                                         }
915                                         push @fields, $class 
916                                                 ? bless(\$field, $class)
917                                                 : $field;
918                                         $firstpos = $lastpos unless defined $firstpos;
919                                         $lastpos = pos;
920                                         last FIELD if @fields == $max;
921                                         next FIELD;
922                                 }
923                         }
924                         if (/\G(.)/gcs)
925                         {
926                                 $unkpos = pos()-1
927                                         unless $igunk || defined $unkpos;
928                         }
929                 }
930                 
931                 if (defined $unkpos)
932                 {
933                         push @fields, substr($_, $unkpos);
934                         $firstpos = $unkpos unless defined $firstpos;
935                         $lastpos = length;
936                 }
937                 last;
938         }
939
940         pos $$textref = $lastpos;
941         return @fields if wantarray;
942
943         $firstpos ||= 0;
944         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
945                pos $$textref = $firstpos };
946         return $fields[0];
947 }
948
949
950 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
951 {
952         my $ldel    = $_[0];
953         my $rdel    = $_[1];
954         my $pre     = defined $_[2] ? $_[2] : '\s*';
955         my %options = defined $_[3] ? %{$_[3]} : ();
956         my $omode   = defined $options{fail} ? $options{fail} : '';
957         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
958                     : defined($options{reject})        ? $options{reject}
959                     :                                    ''
960                     ;
961         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
962                     : defined($options{ignore})        ? $options{ignore}
963                     :                                    ''
964                     ;
965
966         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
967
968         my $posbug = pos;
969         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
970         pos = $posbug;
971
972         my $closure = sub
973         {
974                 my $textref = defined $_[0] ? \$_[0] : \$_;
975                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
976
977                 return _fail(wantarray, $textref) unless @match;
978                 return _succeed wantarray, $textref,
979                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
980                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
981         };
982
983         bless $closure, 'Text::Balanced::Extractor';
984 }
985
986 package Text::Balanced::Extractor;
987
988 sub extract($$) # ($self, $text)
989 {
990         &{$_[0]}($_[1]);
991 }
992
993 package Text::Balanced::ErrorMsg;
994
995 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
996
997 1;