lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use SelfLoader;
  11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  12
  13 $VERSION = '1.95';
  14 @ISA            = qw ( Exporter );
  15
  16 %EXPORT_TAGS    = ( ALL => [ qw(
  17                                 &extract_delimited
  18                                 &extract_bracketed
  19                                 &extract_quotelike
  20                                 &extract_codeblock
  21                                 &extract_variable
  22                                 &extract_tagged
  23                                 &extract_multiple
  24
  25                                 &gen_delimited_pat
  26                                 &gen_extract_tagged
  27
  28                                 &delimited_pat
  29                                ) ] );
  30
  31 Exporter::export_ok_tags('ALL');
  32
  33 # PROTOTYPES
  34
  35 sub _match_bracketed($$$$$$);
  36 sub _match_variable($$);
  37 sub _match_codeblock($$$$$$$);
  38 sub _match_quotelike($$$$);
  39
  40 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  41
  42 sub _failmsg {
  43         my ($message, $pos) = @_;
  44         $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
  45 }
  46
  47 sub _fail
  48 {
  49         my ($wantarray, $textref, $message, $pos) = @_;
  50         _failmsg $message, $pos if $message;
  51         return ("",$$textref,"") if $wantarray;
  52         return undef;
  53 }
  54
  55 sub _succeed
  56 {
  57         $@ = undef;
  58         my ($wantarray,$textref) = splice @_, 0, 2;
  59         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  60         my ($startlen) = $_[5];
  61         my $remainderpos = $_[2];
  62         if ($wantarray)
  63         {
  64                 my @res;
  65                 while (my ($from, $len) = splice @_, 0, 2)
  66                 {
  67                         push @res, substr($$textref,$from,$len);
  68                 }
  69                 if ($extralen) {        # CORRECT FILLET
  70                         my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
  71                         $res[1] = "$extra$res[1]";
  72                         eval { substr($$textref,$remainderpos,0) = $extra;
  73                                substr($$textref,$extrapos,$extralen,"\n")} ;
  74                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  75                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  76                 }
  77                 else {
  78                         pos($$textref) = $remainderpos;             # RESET \G
  79                 }
  80                 return @res;
  81         }
  82         else
  83         {
  84                 my $match = substr($$textref,$_[0],$_[1]);
  85                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  86                 my $extra = $extralen
  87                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
  88                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  89                 pos($$textref) = $_[4];                         # RESET \G
  90                 return $match;
  91         }
  92 }
  93
  94 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
  95
  96 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
  97 {
  98         my ($dels, $escs) = @_;
  99         return "" unless $dels =~ /\S/;
 100         $escs = '\\' unless $escs;
 101         $escs .= substr($escs,-1) x (length($dels)-length($escs));
 102         my @pat = ();
 103         my $i;
 104         for ($i=0; $i<length $dels; $i++)
 105         {
 106                 my $del = quotemeta substr($dels,$i,1);
 107                 my $esc = quotemeta substr($escs,$i,1);
 108                 if ($del eq $esc)
 109                 {
 110                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 111                 }
 112                 else
 113                 {
 114                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 115                 }
 116         }
 117         my $pat = join '|', @pat;
 118         return "(?:$pat)";
 119 }
 120
 121 *delimited_pat = \&gen_delimited_pat;
 122
 123
 124 # THE EXTRACTION FUNCTIONS
 125
 126 sub extract_delimited (;$$$$)
 127 {
 128         my $textref = defined $_[0] ? \$_[0] : \$_;
 129         my $wantarray = wantarray;
 130         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 131         my $pre  = defined $_[2] ? $_[2] : '\s*';
 132         my $esc  = defined $_[3] ? $_[3] : qq{\\};
 133         my $pat = gen_delimited_pat($del, $esc);
 134         my $startpos = pos $$textref || 0;
 135         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 136                 unless $$textref =~ m/\G($pre)($pat)/gc;
 137         my $prelen = length($1);
 138         my $matchpos = $startpos+$prelen;
 139         my $endpos = pos $$textref;
 140         return _succeed $wantarray, $textref,
 141                         $matchpos, $endpos-$matchpos,           # MATCH
 142                         $endpos,   length($$textref)-$endpos,   # REMAINDER
 143                         $startpos, $prelen;                     # PREFIX
 144 }
 145
 146 sub extract_bracketed (;$$$)
 147 {
 148         my $textref = defined $_[0] ? \$_[0] : \$_;
 149         my $ldel = defined $_[1] ? $_[1] : '{([<';
 150         my $pre  = defined $_[2] ? $_[2] : '\s*';
 151         my $wantarray = wantarray;
 152         my $qdel = "";
 153         my $quotelike;
 154         $ldel =~ s/'//g and $qdel .= q{'};
 155         $ldel =~ s/"//g and $qdel .= q{"};
 156         $ldel =~ s/`//g and $qdel .= q{`};
 157         $ldel =~ s/q//g and $quotelike = 1;
 158         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 159         my $rdel = $ldel;
 160         unless ($rdel =~ tr/[({</])}>/)
 161         {
 162                 return _fail $wantarray, $textref,
 163                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 164                              0;
 165         }
 166         my $posbug = pos;
 167         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 168         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 169         pos = $posbug;
 170
 171         my $startpos = pos $$textref || 0;
 172         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 173
 174         return _fail ($wantarray, $textref) unless @match;
 175
 176         return _succeed ( $wantarray, $textref,
 177                           $match[2], $match[5]+2,       # MATCH
 178                           @match[8,9],                  # REMAINDER
 179                           @match[0,1],                  # PREFIX
 180                         );
 181 }
 182
 183 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 184 {
 185         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
 186         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 187         unless ($$textref =~ m/\G$pre/gc)
 188         {
 189                 _failmsg "Did not find prefix: /$pre/", $startpos;
 190                 return;
 191         }
 192
 193         $ldelpos = pos $$textref;
 194
 195         unless ($$textref =~ m/\G($ldel)/gc)
 196         {
 197                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 198                          pos $$textref;
 199                 pos $$textref = $startpos;
 200                 return;
 201         }
 202
 203         my @nesting = ( $1 );
 204         my $textlen = length $$textref;
 205         while (pos $$textref < $textlen)
 206         {
 207                 next if $$textref =~ m/\G\\./gcs;
 208
 209                 if ($$textref =~ m/\G($ldel)/gc)
 210                 {
 211                         push @nesting, $1;
 212                 }
 213                 elsif ($$textref =~ m/\G($rdel)/gc)
 214                 {
 215                         my ($found, $brackettype) = ($1, $1);
 216                         if ($#nesting < 0)
 217                         {
 218                                 _failmsg "Unmatched closing bracket: \"$found\"",
 219                                          pos $$textref;
 220                                 pos $$textref = $startpos;
 221                                 return;
 222                         }
 223                         my $expected = pop(@nesting);
 224                         $expected =~ tr/({[</)}]>/;
 225                         if ($expected ne $brackettype)
 226                         {
 227                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 228                                          pos $$textref;
 229                                 pos $$textref = $startpos;
 230                                 return;
 231                         }
 232                         last if $#nesting < 0;
 233                 }
 234                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 235                 {
 236                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 237                         _failmsg "Unmatched embedded quote ($1)",
 238                                  pos $$textref;
 239                         pos $$textref = $startpos;
 240                         return;
 241                 }
 242                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
 243                 {
 244                         next;
 245                 }
 246
 247                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 248         }
 249         if ($#nesting>=0)
 250         {
 251                 _failmsg "Unmatched opening bracket(s): "
 252                                 . join("..",@nesting)."..",
 253                          pos $$textref;
 254                 pos $$textref = $startpos;
 255                 return;
 256         }
 257
 258         $endpos = pos $$textref;
 259
 260         return (
 261                 $startpos,  $ldelpos-$startpos,         # PREFIX
 262                 $ldelpos,   1,                          # OPENING BRACKET
 263                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
 264                 $endpos-1,  1,                          # CLOSING BRACKET
 265                 $endpos,    length($$textref)-$endpos,  # REMAINDER
 266                );
 267 }
 268
 269 sub revbracket($)
 270 {
 271         my $brack = reverse $_[0];
 272         $brack =~ tr/[({</])}>/;
 273         return $brack;
 274 }
 275
 276 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 277
 278 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 279 {
 280         my $textref = defined $_[0] ? \$_[0] : \$_;
 281         my $ldel    = $_[1];
 282         my $rdel    = $_[2];
 283         my $pre     = defined $_[3] ? $_[3] : '\s*';
 284         my %options = defined $_[4] ? %{$_[4]} : ();
 285         my $omode   = defined $options{fail} ? $options{fail} : '';
 286         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 287                     : defined($options{reject})        ? $options{reject}
 288                     :                                    ''
 289                     ;
 290         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 291                     : defined($options{ignore})        ? $options{ignore}
 292                     :                                    ''
 293                     ;
 294
 295         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 296         $@ = undef;
 297
 298         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 299
 300         return _fail(wantarray, $textref) unless @match;
 301         return _succeed wantarray, $textref,
 302                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 303                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 304 }
 305
 306 sub _match_tagged       # ($$$$$$$)
 307 {
 308         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 309         my $rdelspec;
 310
 311         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 312
 313         unless ($$textref =~ m/\G($pre)/gc)
 314         {
 315                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 316                 goto failed;
 317         }
 318
 319         $opentagpos = pos($$textref);
 320
 321         unless ($$textref =~ m/\G$ldel/gc)
 322         {
 323                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 324                 goto failed;
 325         }
 326
 327         $textpos = pos($$textref);
 328
 329         if (!defined $rdel)
 330         {
 331                 $rdelspec = $&;
 332                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
 333                 {
 334                         _failmsg "Unable to construct closing tag to match: $rdel",
 335                                  pos $$textref;
 336                         goto failed;
 337                 }
 338         }
 339         else
 340         {
 341                 $rdelspec = eval "qq{$rdel}" || do {
 342                         my $del;
 343                         for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
 344                                 { next if $rdel =~ /\Q$_/; $del = $_; last }
 345                         unless ($del) {
 346                                 use Carp;
 347                                 croak "Can't interpolate right delimiter $rdel"
 348                         }
 349                         eval "qq$del$rdel$del";
 350                 };
 351         }
 352
 353         while (pos($$textref) < length($$textref))
 354         {
 355                 next if $$textref =~ m/\G\\./gc;
 356
 357                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 358                 {
 359                         $parapos = pos($$textref) - length($1)
 360                                 unless defined $parapos;
 361                 }
 362                 elsif ($$textref =~ m/\G($rdelspec)/gc )
 363                 {
 364                         $closetagpos = pos($$textref)-length($1);
 365                         goto matched;
 366                 }
 367                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 368                 {
 369                         next;
 370                 }
 371                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 372                 {
 373                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
 374                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 375                         _failmsg "Found invalid nested tag: $1", pos $$textref;
 376                         goto failed;
 377                 }
 378                 elsif ($$textref =~ m/\G($ldel)/gc)
 379                 {
 380                         my $tag = $1;
 381                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
 382                         unless (_match_tagged(@_))      # MATCH NESTED TAG
 383                         {
 384                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
 385                                 _failmsg "Found unbalanced nested tag: $tag",
 386                                          pos $$textref;
 387                                 goto failed;
 388                         }
 389                 }
 390                 else { $$textref =~ m/./gcs }
 391         }
 392
 393 short:
 394         $closetagpos = pos($$textref);
 395         goto matched if $omode eq 'MAX';
 396         goto failed unless $omode eq 'PARA';
 397
 398         if (defined $parapos) { pos($$textref) = $parapos }
 399         else                  { $parapos = pos($$textref) }
 400
 401         return (
 402                 $startpos,    $opentagpos-$startpos,            # PREFIX
 403                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 404                 $textpos,     $parapos-$textpos,                # TEXT
 405                 $parapos,     0,                                # NO CLOSING TAG
 406                 $parapos,     length($$textref)-$parapos,       # REMAINDER
 407                );
 408
 409 matched:
 410         $endpos = pos($$textref);
 411         return (
 412                 $startpos,    $opentagpos-$startpos,            # PREFIX
 413                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 414                 $textpos,     $closetagpos-$textpos,            # TEXT
 415                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
 416                 $endpos,      length($$textref)-$endpos,        # REMAINDER
 417                );
 418
 419 failed:
 420         _failmsg "Did not find closing tag", pos $$textref unless $@;
 421         pos($$textref) = $startpos;
 422         return;
 423 }
 424
 425 sub extract_variable (;$$)
 426 {
 427         my $textref = defined $_[0] ? \$_[0] : \$_;
 428         return ("","","") unless defined $$textref;
 429         my $pre  = defined $_[1] ? $_[1] : '\s*';
 430
 431         my @match = _match_variable($textref,$pre);
 432
 433         return _fail wantarray, $textref unless @match;
 434
 435         return _succeed wantarray, $textref,
 436                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
 437 }
 438
 439 sub _match_variable($$)
 440 {
 441 #  $#
 442 #  $^
 443 #  $$
 444         my ($textref, $pre) = @_;
 445         my $startpos = pos($$textref) = pos($$textref)||0;
 446         unless ($$textref =~ m/\G($pre)/gc)
 447         {
 448                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 449                 return;
 450         }
 451         my $varpos = pos($$textref);
 452         unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
 453         {
 454             unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
 455             {
 456                 _failmsg "Did not find leading dereferencer", pos $$textref;
 457                 pos $$textref = $startpos;
 458                 return;
 459             }
 460             my $deref = $1;
 461
 462             unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 463                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
 464                 or $deref eq '$#' or $deref eq '$$' )
 465             {
 466                 _failmsg "Bad identifier after dereferencer", pos $$textref;
 467                 pos $$textref = $startpos;
 468                 return;
 469             }
 470         }
 471
 472         while (1)
 473         {
 474                 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
 475                 next if _match_codeblock($textref,
 476                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 477                                          qr/[({[]/, qr/[)}\]]/,
 478                                          qr/[({[]/, qr/[)}\]]/, 0);
 479                 next if _match_codeblock($textref,
 480                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
 481                                          qr/[{[]/, qr/[}\]]/, 0);
 482                 next if _match_variable($textref,'\s*->\s*');
 483                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 484                 last;
 485         }
 486
 487         my $endpos = pos($$textref);
 488         return ($startpos, $varpos-$startpos,
 489                 $varpos,   $endpos-$varpos,
 490                 $endpos,   length($$textref)-$endpos
 491                 );
 492 }
 493
 494 sub extract_codeblock (;$$$$$)
 495 {
 496         my $textref = defined $_[0] ? \$_[0] : \$_;
 497         my $wantarray = wantarray;
 498         my $ldel_inner = defined $_[1] ? $_[1] : '{';
 499         my $pre        = defined $_[2] ? $_[2] : '\s*';
 500         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
 501         my $rd         = $_[4];
 502         my $rdel_inner = $ldel_inner;
 503         my $rdel_outer = $ldel_outer;
 504         my $posbug = pos;
 505         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
 506         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
 507         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
 508         {
 509                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
 510         }
 511         pos = $posbug;
 512
 513         my @match = _match_codeblock($textref, $pre,
 514                                      $ldel_outer, $rdel_outer,
 515                                      $ldel_inner, $rdel_inner,
 516                                      $rd);
 517         return _fail($wantarray, $textref) unless @match;
 518         return _succeed($wantarray, $textref,
 519                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
 520                        );
 521
 522 }
 523
 524 sub _match_codeblock($$$$$$$)
 525 {
 526         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 527         my $startpos = pos($$textref) = pos($$textref) || 0;
 528         unless ($$textref =~ m/\G($pre)/gc)
 529         {
 530                 _failmsg qq{Did not match prefix /$pre/ at"} .
 531                             substr($$textref,pos($$textref),20) .
 532                             q{..."},
 533                          pos $$textref;
 534                 return;
 535         }
 536         my $codepos = pos($$textref);
 537         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
 538         {
 539                 _failmsg qq{Did not find expected opening bracket at "} .
 540                              substr($$textref,pos($$textref),20) .
 541                              q{..."},
 542                          pos $$textref;
 543                 pos $$textref = $startpos;
 544                 return;
 545         }
 546         my $closing = $1;
 547            $closing =~ tr/([<{/)]>}/;
 548         my $matched;
 549         my $patvalid = 1;
 550         while (pos($$textref) < length($$textref))
 551         {
 552                 $matched = '';
 553                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 554                 {
 555                         $patvalid = 0;
 556                         next;
 557                 }
 558
 559                 if ($$textref =~ m/\G\s*#.*/gc)
 560                 {
 561                         next;
 562                 }
 563
 564                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 565                 {
 566                         unless ($matched = ($closing && $1 eq $closing) )
 567                         {
 568                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
 569                                 _failmsg q{Mismatched closing bracket at "} .
 570                                              substr($$textref,pos($$textref),20) .
 571                                              qq{...". Expected '$closing'},
 572                                          pos $$textref;
 573                         }
 574                         last;
 575                 }
 576
 577                 if (_match_variable($textref,'\s*') ||
 578                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 579                 {
 580                         $patvalid = 0;
 581                         next;
 582                 }
 583
 584
 585                 # NEED TO COVER MANY MORE CASES HERE!!!
 586                 if ($$textref =~ m#\G\s*(?!$ldel_inner)
 587                                         ( [-+*x/%^&|.]=?
 588                                         | [!=]~
 589                                         | =(?!>)
 590                                         | (\*\*|&&|\|\||<<|>>)=?
 591                                         | split|grep|map|return
 592                                         | [([]
 593                                         )#gcx)
 594                 {
 595                         $patvalid = 1;
 596                         next;
 597                 }
 598
 599                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 600                 {
 601                         $patvalid = 1;
 602                         next;
 603                 }
 604
 605                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
 606                 {
 607                         _failmsg q{Improperly nested codeblock at "} .
 608                                      substr($$textref,pos($$textref),20) .
 609                                      q{..."},
 610                                  pos $$textref;
 611                         last;
 612                 }
 613
 614                 $patvalid = 0;
 615                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 616         }
 617         continue { $@ = undef }
 618
 619         unless ($matched)
 620         {
 621                 _failmsg 'No match found for opening bracket', pos $$textref
 622                         unless $@;
 623                 return;
 624         }
 625
 626         my $endpos = pos($$textref);
 627         return ( $startpos, $codepos-$startpos,
 628                  $codepos, $endpos-$codepos,
 629                  $endpos,  length($$textref)-$endpos,
 630                );
 631 }
 632
 633
 634 my %mods   = (
 635                 'none'  => '[cgimsox]*',
 636                 'm'     => '[cgimsox]*',
 637                 's'     => '[cegimsox]*',
 638                 'tr'    => '[cds]*',
 639                 'y'     => '[cds]*',
 640                 'qq'    => '',
 641                 'qx'    => '',
 642                 'qw'    => '',
 643                 'qr'    => '[imsx]*',
 644                 'q'     => '',
 645              );
 646
 647 sub extract_quotelike (;$$)
 648 {
 649         my $textref = $_[0] ? \$_[0] : \$_;
 650         my $wantarray = wantarray;
 651         my $pre  = defined $_[1] ? $_[1] : '\s*';
 652
 653         my @match = _match_quotelike($textref,$pre,1,0);
 654         return _fail($wantarray, $textref) unless @match;
 655         return _succeed($wantarray, $textref,
 656                         $match[2], $match[18]-$match[2],        # MATCH
 657                         @match[18,19],                          # REMAINDER
 658                         @match[0,1],                            # PREFIX
 659                         @match[2..17],                          # THE BITS
 660                         @match[20,21],                          # ANY FILLET?
 661                        );
 662 };
 663
 664 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 665 {
 666         my ($textref, $pre, $rawmatch, $qmark) = @_;
 667
 668         my ($textlen,$startpos,
 669             $oppos,
 670             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 671             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 672             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 673
 674         unless ($$textref =~ m/\G($pre)/gc)
 675         {
 676                 _failmsg qq{Did not find prefix /$pre/ at "} .
 677                              substr($$textref, pos($$textref), 20) .
 678                              q{..."},
 679                          pos $$textref;
 680                 return;
 681         }
 682         $oppos = pos($$textref);
 683
 684         my $initial = substr($$textref,$oppos,1);
 685
 686         if ($initial && $initial =~ m|^[\"\'\`]|
 687                      || $rawmatch && $initial =~ m|^/|
 688                      || $qmark && $initial =~ m|^\?|)
 689         {
 690                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 691                 {
 692                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 693                                      substr($$textref, $oppos, 20) .
 694                                      q{..."},
 695                                  pos $$textref;
 696                         pos $$textref = $startpos;
 697                         return;
 698                 }
 699                 $modpos= pos($$textref);
 700                 $rd1pos = $modpos-1;
 701
 702                 if ($initial eq '/' || $initial eq '?')
 703                 {
 704                         $$textref =~ m/\G$mods{none}/gc
 705                 }
 706
 707                 my $endpos = pos($$textref);
 708                 return (
 709                         $startpos,      $oppos-$startpos,       # PREFIX
 710                         $oppos,         0,                      # NO OPERATOR
 711                         $oppos,         1,                      # LEFT DEL
 712                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 713                         $rd1pos,        1,                      # RIGHT DEL
 714                         $modpos,        0,                      # NO 2ND LDEL
 715                         $modpos,        0,                      # NO 2ND STR
 716                         $modpos,        0,                      # NO 2ND RDEL
 717                         $modpos,        $endpos-$modpos,        # MODIFIERS
 718                         $endpos,        $textlen-$endpos,       # REMAINDER
 719                        );
 720         }
 721
 722         unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
 723         {
 724                 _failmsg q{No quotelike operator found after prefix at "} .
 725                              substr($$textref, pos($$textref), 20) .
 726                              q{..."},
 727                          pos $$textref;
 728                 pos $$textref = $startpos;
 729                 return;
 730         }
 731
 732         my $op = $1;
 733         $preld1pos = pos($$textref);
 734         if ($op eq '<<') {
 735                 $ld1pos = pos($$textref);
 736                 my $label;
 737                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 738                         $label = $1;
 739                 }
 740                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 741                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 742                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 743                                      }gcsx) {
 744                         $label = $+;
 745                 }
 746                 else {
 747                         $label = "";
 748                 }
 749                 my $extrapos = pos($$textref);
 750                 $$textref =~ m{.*\n}gc;
 751                 $str1pos = pos($$textref);
 752                 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
 753                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 754                                      substr($$textref, $startpos, 20) .
 755                                      q{..."},
 756                                  pos $$textref;
 757                         pos $$textref = $startpos;
 758                         return;
 759                 }
 760                 $rd1pos = pos($$textref);
 761                 $$textref =~ m{$label\n}gc;
 762                 $ld2pos = pos($$textref);
 763                 return (
 764                         $startpos,      $oppos-$startpos,       # PREFIX
 765                         $oppos,         length($op),            # OPERATOR
 766                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 767                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 768                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 769                         $ld2pos,        0,                      # NO 2ND LDEL
 770                         $ld2pos,        0,                      # NO 2ND STR
 771                         $ld2pos,        0,                      # NO 2ND RDEL
 772                         $ld2pos,        0,                      # NO MODIFIERS
 773                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 774                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 775                        );
 776         }
 777
 778         $$textref =~ m/\G\s*/gc;
 779         $ld1pos = pos($$textref);
 780         $str1pos = $ld1pos+1;
 781
 782         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 783         {
 784                 _failmsg "No block delimiter found after quotelike $op",
 785                          pos $$textref;
 786                 pos $$textref = $startpos;
 787                 return;
 788         }
 789         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 790         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 791         if ($ldel1 =~ /[[(<{]/)
 792         {
 793                 $rdel1 =~ tr/[({</])}>/;
 794                 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
 795                 || do { pos $$textref = $startpos; return };
 796         }
 797         else
 798         {
 799                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 800                 || do { pos $$textref = $startpos; return };
 801         }
 802         $ld2pos = $rd1pos = pos($$textref)-1;
 803
 804         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 805         if ($second_arg)
 806         {
 807                 my ($ldel2, $rdel2);
 808                 if ($ldel1 =~ /[[(<{]/)
 809                 {
 810                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 811                         {
 812                                 _failmsg "Missing second block for quotelike $op",
 813                                          pos $$textref;
 814                                 pos $$textref = $startpos;
 815                                 return;
 816                         }
 817                         $ldel2 = $rdel2 = "\Q$1";
 818                         $rdel2 =~ tr/[({</])}>/;
 819                 }
 820                 else
 821                 {
 822                         $ldel2 = $rdel2 = $ldel1;
 823                 }
 824                 $str2pos = $ld2pos+1;
 825
 826                 if ($ldel2 =~ /[[(<{]/)
 827                 {
 828                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 829                         _match_bracketed($textref,"",$ldel2,"","",$rdel2)
 830                         || do { pos $$textref = $startpos; return };
 831                 }
 832                 else
 833                 {
 834                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 835                         || do { pos $$textref = $startpos; return };
 836                 }
 837                 $rd2pos = pos($$textref)-1;
 838         }
 839         else
 840         {
 841                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 842         }
 843
 844         $modpos = pos $$textref;
 845
 846         $$textref =~ m/\G($mods{$op})/gc;
 847         my $endpos = pos $$textref;
 848
 849         return (
 850                 $startpos,      $oppos-$startpos,       # PREFIX
 851                 $oppos,         length($op),            # OPERATOR
 852                 $ld1pos,        1,                      # LEFT DEL
 853                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 854                 $rd1pos,        1,                      # RIGHT DEL
 855                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 856                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 857                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 858                 $modpos,        $endpos-$modpos,        # MODIFIERS
 859                 $endpos,        $textlen-$endpos,       # REMAINDER
 860                );
 861 }
 862
 863 my $def_func =
 864 [
 865         sub { extract_variable($_[0], '') },
 866         sub { extract_quotelike($_[0],'') },
 867         sub { extract_codeblock($_[0],'{}','') },
 868 ];
 869
 870 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 871 {
 872         my $textref = defined($_[0]) ? \$_[0] : \$_;
 873         my $posbug = pos;
 874         my ($lastpos, $firstpos);
 875         my @fields = ();
 876
 877         #for ($$textref)
 878         {
 879                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 880                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 881                 my $igunk = $_[3];
 882
 883                 pos $$textref ||= 0;
 884
 885                 unless (wantarray)
 886                 {
 887                         use Carp;
 888                         carp "extract_multiple reset maximal count to 1 in scalar context"
 889                                 if $^W && defined($_[2]) && $max > 1;
 890                         $max = 1
 891                 }
 892
 893                 my $unkpos;
 894                 my $func;
 895                 my $class;
 896
 897                 my @class;
 898                 foreach $func ( @func )
 899                 {
 900                         if (ref($func) eq 'HASH')
 901                         {
 902                                 push @class, (keys %$func)[0];
 903                                 $func = (values %$func)[0];
 904                         }
 905                         else
 906                         {
 907                                 push @class, undef;
 908                         }
 909                 }
 910
 911                 FIELD: while (pos($$textref) < length($$textref))
 912                 {
 913                         my ($field, $rem);
 914                         my @bits;
 915                         foreach my $i ( 0..$#func )
 916                         {
 917                                 my $pref;
 918                                 $func = $func[$i];
 919                                 $class = $class[$i];
 920                                 $lastpos = pos $$textref;
 921                                 if (ref($func) eq 'CODE')
 922                                         { ($field,$rem,$pref) = @bits = $func->($$textref);
 923                                         # print "[$field|$rem]" if $field;
 924                                         }
 925                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 926                                         { @bits = $field = $func->extract($$textref) }
 927                                 elsif( $$textref =~ m/\G$func/gc )
 928                                         { @bits = $field = defined($1) ? $1 : $& }
 929                                 $pref ||= "";
 930                                 if (defined($field) && length($field))
 931                                 {
 932                                         if (!$igunk) {
 933                                                 $unkpos = pos $$textref
 934                                                         if length($pref) && !defined($unkpos);
 935                                                 if (defined $unkpos)
 936                                                 {
 937                                                         push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 938                                                         $firstpos = $unkpos unless defined $firstpos;
 939                                                         undef $unkpos;
 940                                                         last FIELD if @fields == $max;
 941                                                 }
 942                                         }
 943                                         push @fields, $class
 944                                                 ? bless (\$field, $class)
 945                                                 : $field;
 946                                         $firstpos = $lastpos unless defined $firstpos;
 947                                         $lastpos = pos $$textref;
 948                                         last FIELD if @fields == $max;
 949                                         next FIELD;
 950                                 }
 951                         }
 952                         if ($$textref =~ /\G(.)/gcs)
 953                         {
 954                                 $unkpos = pos($$textref)-1
 955                                         unless $igunk || defined $unkpos;
 956                         }
 957                 }
 958
 959                 if (defined $unkpos)
 960                 {
 961                         push @fields, substr($$textref, $unkpos);
 962                         $firstpos = $unkpos unless defined $firstpos;
 963                         $lastpos = length $$textref;
 964                 }
 965                 last;
 966         }
 967
 968         pos $$textref = $lastpos;
 969         return @fields if wantarray;
 970
 971         $firstpos ||= 0;
 972         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 973                pos $$textref = $firstpos };
 974         return $fields[0];
 975 }
 976
 977
 978 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 979 {
 980         my $ldel    = $_[0];
 981         my $rdel    = $_[1];
 982         my $pre     = defined $_[2] ? $_[2] : '\s*';
 983         my %options = defined $_[3] ? %{$_[3]} : ();
 984         my $omode   = defined $options{fail} ? $options{fail} : '';
 985         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 986                     : defined($options{reject})        ? $options{reject}
 987                     :                                    ''
 988                     ;
 989         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 990                     : defined($options{ignore})        ? $options{ignore}
 991                     :                                    ''
 992                     ;
 993
 994         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 995
 996         my $posbug = pos;
 997         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
 998         pos = $posbug;
 999
1000         my $closure = sub
1001         {
1002                 my $textref = defined $_[0] ? \$_[0] : \$_;
1003                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1004
1005                 return _fail(wantarray, $textref) unless @match;
1006                 return _succeed wantarray, $textref,
1007                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
1008                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
1009         };
1010
1011         bless $closure, 'Text::Balanced::Extractor';
1012 }
1013
package Text::Balanced::Extractor;

# Call the extractor object (a blessed code reference) on the given text.
# The text is handed on as $_[1] itself, so the callee sees the caller's
# own variable, and the caller's context is passed through unchanged.

sub extract($$) # ($self, $text)
{
        my $extractor = $_[0];
        return $extractor->($_[1]);
}
1020
package Text::Balanced::ErrorMsg;

# Objects of this class are hashes with 'error' and 'pos' entries; in string
# context they read as the message followed by the offset it refers to.

use overload
        '""' => sub
        {
                my ($self) = @_;
                return "$self->{error}, detected at offset $self->{pos}";
        };
1024
1025 1;
1026
1027 __END__
1028
1029 =head1 NAME
1030
1031 Text::Balanced - Extract delimited text sequences from strings.
1032
1033
1034 =head1 SYNOPSIS
1035
1036  use Text::Balanced qw (
1037                         extract_delimited
1038                         extract_bracketed
1039                         extract_quotelike
1040                         extract_codeblock
1041                         extract_variable
1042                         extract_tagged
1043                         extract_multiple
1044
1045                         gen_delimited_pat
1046                         gen_extract_tagged
1047                        );
1048
1049  # Extract the initial substring of $text that is delimited by
1050  # two (unescaped) instances of the first character in $delim.
1051
1052         ($extracted, $remainder) = extract_delimited($text,$delim);
1053
1054
1055  # Extract the initial substring of $text that is bracketed
1056  # with a delimiter(s) specified by $delim (where the string
1057  # in $delim contains one or more of '(){}[]<>').
1058
1059         ($extracted, $remainder) = extract_bracketed($text,$delim);
1060
1061
1062  # Extract the initial substring of $text that is bounded by
1063  # an XML tag.
1064
1065         ($extracted, $remainder) = extract_tagged($text);
1066
1067
 # Extract the initial substring of $text that is bounded by
 # a BEGIN...END pair. Don't allow nested BEGIN tags.

        ($extracted, $remainder) =
                extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]});
1073
1074
1075  # Extract the initial substring of $text that represents a
1076  # Perl "quote or quote-like operation"
1077
1078         ($extracted, $remainder) = extract_quotelike($text);
1079
1080
1081  # Extract the initial substring of $text that represents a block
1082  # of Perl code, bracketed by any of character(s) specified by $delim
1083  # (where the string $delim contains one or more of '(){}[]<>').
1084
1085         ($extracted, $remainder) = extract_codeblock($text,$delim);
1086
1087
1088  # Extract the initial substrings of $text that would be extracted by
1089  # one or more sequential applications of the specified functions
1090  # or regular expressions
1091
1092         @extracted = extract_multiple($text,
1093                                       [ \&extract_bracketed,
1094                                         \&extract_quotelike,
1095                                         \&some_other_extractor_sub,
1096                                         qr/[xyz]*/,
1097                                         'literal',
1098                                       ]);
1099
 # Create a string representing an optimized pattern (a la Friedl)
 # that matches a substring delimited by any of the specified characters
 # (in this case: any type of quote or a slash)
1103
1104         $patstring = gen_delimited_pat(q{'"`/});
1105
1106
 # Generate a reference to an anonymous sub that is just like extract_tagged
 # but pre-compiled and optimized for a specific pair of tags, and consequently
 # much faster (i.e. 3 times faster). It uses qr// for better performance on
 # repeated calls, so it only works under Perl 5.005 or later.
1111
1112         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1113
1114         ($extracted, $remainder) = $extract_head->($text);
1115
1116
1117 =head1 DESCRIPTION
1118
1119 The various C<extract_...> subroutines may be used to
1120 extract a delimited substring, possibly after skipping a
1121 specified prefix string. By default, that prefix is
1122 optional whitespace (C</\s*/>), but you can change it to whatever
1123 you wish (see below).
1124
1125 The substring to be extracted must appear at the
1126 current C<pos> location of the string's variable
1127 (or at index zero, if no C<pos> position is defined).
1128 In other words, the C<extract_...> subroutines I<don't>
extract the first occurrence of a substring anywhere
in a string (like an unanchored regex would). Rather,
they extract an occurrence of the substring appearing
1132 immediately at the current matching position in the
1133 string (like a C<\G>-anchored regex would).
1134
1135
1136
1137 =head2 General behaviour in list contexts
1138
1139 In a list context, all the subroutines return a list, the first three
1140 elements of which are always:
1141
1142 =over 4
1143
1144 =item [0]
1145
1146 The extracted string, including the specified delimiters.
1147 If the extraction fails an empty string is returned.
1148
1149 =item [1]
1150
1151 The remainder of the input string (i.e. the characters after the
1152 extracted string). On failure, the entire string is returned.
1153
1154 =item [2]
1155
1156 The skipped prefix (i.e. the characters before the extracted string).
1157 On failure, the empty string is returned.
1158
1159 =back
1160
1161 Note that in a list context, the contents of the original input text (the first
1162 argument) are not modified in any way.
1163
1164 However, if the input text was passed in a variable, that variable's
1165 C<pos> value is updated to point at the first character after the
1166 extracted text. That means that in a list context the various
1167 subroutines can be used much like regular expressions. For example:
1168
1169         while ( $next = (extract_quotelike($text))[0] )
1170         {
1171                 # process next quote-like (in $next)
1172         }
1173
1174
1175 =head2 General behaviour in scalar and void contexts
1176
1177 In a scalar context, the extracted string is returned, having first been
1178 removed from the input text. Thus, the following code also processes
1179 each quote-like operation, but actually removes them from $text:
1180
1181         while ( $next = extract_quotelike($text) )
1182         {
1183                 # process next quote-like (in $next)
1184         }
1185
1186 Note that if the input text is a read-only string (i.e. a literal),
1187 no attempt is made to remove the extracted text.
1188
1189 In a void context the behaviour of the extraction subroutines is
1190 exactly the same as in a scalar context, except (of course) that the
1191 extracted substring is not returned.
1192
1193 =head2 A note about prefixes
1194
1195 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1196 This can bite you if you're expecting a prefix specification like
1197 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1198 pattern will only succeed if the <H1> tag is on the current line, since
1199 . normally doesn't match newlines.
1200
1201 To overcome this limitation, you need to turn on /s matching within
1202 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1203
1204
1205 =head2 C<extract_delimited>
1206
1207 The C<extract_delimited> function formalizes the common idiom
1208 of extracting a single-character-delimited substring from the start of
1209 a string. For example, to extract a single-quote delimited string, the
1210 following code is typically used:
1211
1212         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1213         $extracted = $1;
1214
1215 but with C<extract_delimited> it can be simplified to:
1216
1217         ($extracted,$remainder) = extract_delimited($text, "'");
1218
1219 C<extract_delimited> takes up to four scalars (the input text, the
1220 delimiters, a prefix pattern to be skipped, and any escape characters)
1221 and extracts the initial substring of the text that
1222 is appropriately delimited. If the delimiter string has multiple
1223 characters, the first one encountered in the text is taken to delimit
1224 the substring.
1225 The third argument specifies a prefix pattern that is to be skipped
1226 (but must be present!) before the substring is extracted.
1227 The final argument specifies the escape character to be used for each
1228 delimiter.
1229
1230 All arguments are optional. If the escape characters are not specified,
1231 every delimiter is escaped with a backslash (C<\>).
1232 If the prefix is not specified, the
1233 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1234 is also not specified, the set C</["'`]/> is used. If the text to be processed
1235 is not specified either, C<$_> is used.
1236
In list context, C<extract_delimited> returns an array of three
1238 elements, the extracted substring (I<including the surrounding
1239 delimiters>), the remainder of the text, and the skipped prefix (if
1240 any). If a suitable delimited substring is not found, the first
1241 element of the array is the empty string, the second is the complete
1242 original text, and the prefix returned in the third element is an
1243 empty string.
1244
1245 In a scalar context, just the extracted substring is returned. In
1246 a void context, the extracted substring (and any prefix) are simply
1247 removed from the beginning of the first argument.
1248
1249 Examples:
1250
1251         # Remove a single-quoted substring from the very beginning of $text:
1252
1253                 $substring = extract_delimited($text, "'", '');
1254
1255         # Remove a single-quoted Pascalish substring (i.e. one in which
1256         # doubling the quote character escapes it) from the very
1257         # beginning of $text:
1258
1259                 $substring = extract_delimited($text, "'", '', "'");
1260
1261         # Extract a single- or double- quoted substring from the
1262         # beginning of $text, optionally after some whitespace
1263         # (note the list context to protect $text from modification):
1264
1265                 ($substring) = extract_delimited $text, q{"'};
1266
1267
1268         # Delete the substring delimited by the first '/' in $text:
1269
                $text = join '', (extract_delimited($text,'/','[^/]*'))[2,1];
1271
1272 Note that this last example is I<not> the same as deleting the first
1273 quote-like pattern. For instance, if C<$text> contained the string:
1274
1275         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1276
1277 then after the deletion it would contain:
1278
1279         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1280
1281 not:
1282
1283         "if ('./cmd' =~ ms) { $cmd = $1; }"
1284
1285
1286 See L<"extract_quotelike"> for a (partial) solution to this problem.
1287
1288
1289 =head2 C<extract_bracketed>
1290
1291 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1292 up to three optional scalar arguments: a string to extract from, a delimiter
1293 specifier, and a prefix pattern. As before, a missing prefix defaults to
1294 optional whitespace and a missing text defaults to C<$_>. However, a missing
1295 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1296
1297 C<extract_bracketed> extracts a balanced-bracket-delimited
1298 substring (using any one (or more) of the user-specified delimiter
1299 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1300 respect quoted unbalanced brackets (see below).
1301
A "delimiter bracket" is a bracket in the list of delimiters passed as
1303 C<extract_bracketed>'s second argument. Delimiter brackets are
1304 specified by giving either the left or right (or both!) versions
1305 of the required bracket(s). Note that the order in which
1306 two or more delimiter brackets are specified is not significant.
1307
1308 A "balanced-bracket-delimited substring" is a substring bounded by
1309 matched brackets, such that any other (left or right) delimiter
1310 bracket I<within> the substring is also matched by an opposite
1311 (right or left) delimiter bracket I<at the same level of nesting>. Any
1312 type of bracket not in the delimiter list is treated as an ordinary
1313 character.
1314
1315 In other words, each type of bracket specified as a delimiter must be
1316 balanced and correctly nested within the substring, and any other kind of
1317 ("non-delimiter") bracket in the substring is ignored.
1318
1319 For example, given the string:
1320
1321         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1322
1323 then a call to C<extract_bracketed> in a list context:
1324
1325         @result = extract_bracketed( $text, '{}' );
1326
1327 would return:
1328
1329         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1330
1331 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1332 (In a scalar context just the first element of the array would be returned. In
1333 a void context, C<$text> would be replaced by an empty string.)
1334
1335 Likewise the call in:
1336
1337         @result = extract_bracketed( $text, '{[' );
1338
1339 would return the same result, since all sets of both types of specified
1340 delimiter brackets are correctly nested and balanced.
1341
1342 However, the call in:
1343
1344         @result = extract_bracketed( $text, '{([<' );
1345
1346 would fail, returning:
1347
1348         ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1349
1350 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1351 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1352 return an empty string. In a void context, C<$text> would be unchanged.)
1353
1354 Note that the embedded single-quotes in the string don't help in this
1355 case, since they have not been specified as acceptable delimiters and are
1356 therefore treated as non-delimiter characters (and ignored).
1357
1358 However, if a particular species of quote character is included in the
1359 delimiter specification, then that type of quote will be correctly handled.
For example, if C<$text> is:
1361
1362         $text = '<A HREF=">>>>">link</A>';
1363
1364 then
1365
1366         @result = extract_bracketed( $text, '<">' );
1367
1368 returns:
1369
1370         ( '<A HREF=">>>>">', 'link</A>', "" )
1371
1372 as expected. Without the specification of C<"> as an embedded quoter:
1373
1374         @result = extract_bracketed( $text, '<>' );
1375
1376 the result would be:
1377
1378         ( '<A HREF=">', '>>>">link</A>', "" )
1379
1380 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1381 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1382 letter 'q' as a delimiter. Hence:
1383
1384         @result = extract_bracketed( $text, '<q>' );
1385
1386 would correctly match something like this:
1387
1388         $text = '<leftop: conj /and/ conj>';
1389
1390 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1391
1392
1393 =head2 C<extract_variable>
1394
1395 C<extract_variable> extracts any valid Perl variable or
1396 variable-involved expression, including scalars, arrays, hashes, array
accesses, hash look-ups, method calls through objects, subroutine calls
1398 through subroutine references, etc.
1399
1400 The subroutine takes up to two optional arguments:
1401
1402 =over 4
1403
1404 =item 1.
1405
1406 A string to be processed (C<$_> if the string is omitted or C<undef>)
1407
1408 =item 2.
1409
1410 A string specifying a pattern to be matched as a prefix (which is to be
1411 skipped). If omitted, optional whitespace is skipped.
1412
1413 =back
1414
1415 On success in a list context, an array of 3 elements is returned. The
1416 elements are:
1417
1418 =over 4
1419
1420 =item [0]
1421
1422 the extracted variable, or variablish expression
1423
1424 =item [1]
1425
1426 the remainder of the input text,
1427
1428 =item [2]
1429
1430 the prefix substring (if any),
1431
1432 =back
1433
1434 On failure, all of these values (except the remaining text) are C<undef>.
1435
1436 In a scalar context, C<extract_variable> returns just the complete
1437 substring that matched a variablish expression. C<undef> is returned on
1438 failure. In addition, the original input text has the returned substring
1439 (and any prefix) removed from it.
1440
1441 In a void context, the input text just has the matched substring (and
1442 any specified prefix) removed.
1443
1444
1445 =head2 C<extract_tagged>
1446
1447 C<extract_tagged> extracts and segments text between (balanced)
1448 specified tags.
1449
1450 The subroutine takes up to five optional arguments:
1451
1452 =over 4
1453
1454 =item 1.
1455
1456 A string to be processed (C<$_> if the string is omitted or C<undef>)
1457
1458 =item 2.
1459
1460 A string specifying a pattern to be matched as the opening tag.
1461 If the pattern string is omitted (or C<undef>) then a pattern
1462 that matches any standard XML tag is used.
1463
1464 =item 3.
1465
1466 A string specifying a pattern to be matched at the closing tag.
1467 If the pattern string is omitted (or C<undef>) then the closing
1468 tag is constructed by inserting a C</> after any leading bracket
1469 characters in the actual opening tag that was matched (I<not> the pattern
1470 that matched the tag). For example, if the opening tag pattern
1471 is specified as C<'{{\w+}}'> and actually matched the opening tag
1472 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1473
1474 =item 4.
1475
1476 A string specifying a pattern to be matched as a prefix (which is to be
1477 skipped). If omitted, optional whitespace is skipped.
1478
1479 =item 5.
1480
1481 A hash reference containing various parsing options (see below)
1482
1483 =back
1484
1485 The various options that can be specified are:
1486
1487 =over 4
1488
1489 =item C<reject =E<gt> $listref>
1490
1491 The list reference contains one or more strings specifying patterns
1492 that must I<not> appear within the tagged text.
1493
1494 For example, to extract
1495 an HTML link (which should not contain nested links) use:
1496
1497         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1498
1499 =item C<ignore =E<gt> $listref>
1500
1501 The list reference contains one or more strings specifying patterns
that are I<not> to be treated as nested tags within the tagged text
1503 (even if they would match the start tag pattern).
1504
1505 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1506
1507         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1508
1509 (also see L<"gen_delimited_pat"> below).
1510
1511
1512 =item C<fail =E<gt> $str>
1513
1514 The C<fail> option indicates the action to be taken if a matching end
1515 tag is not encountered (i.e. before the end of the string or some
1516 C<reject> pattern matches). By default, a failure to match a closing
1517 tag causes C<extract_tagged> to immediately fail.
1518
However, if the string value associated with C<fail> is "MAX", then
1520 C<extract_tagged> returns the complete text up to the point of failure.
1521 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1522 after the tag (up to the first line that is either empty or contains
1523 only whitespace characters).
If the string is "", the default behaviour (i.e. failure) is reinstated.
1525
1526 For example, suppose the start tag "/para" introduces a paragraph, which then
1527 continues until the next "/endpara" tag or until another "/para" tag is
1528 encountered:
1529
1530         $text = "/para line 1\n\nline 3\n/para line 4";
1531
        extract_tagged($text, '/para', '/endpara', undef,
                                {reject => '/para', fail => 'MAX'} );
1534
1535         # EXTRACTED: "/para line 1\n\nline 3\n"
1536
1537 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1538 tag refers only to the immediately following paragraph:
1539
1540         $text = "/para line 1\n\nline 3\n/para line 4";
1541
        extract_tagged($text, '/para', '/endpara', undef,
                        {reject => '/para', fail => 'PARA'} );
1544
1545         # EXTRACTED: "/para line 1\n"
1546
1547 Note that the specified C<fail> behaviour applies to nested tags as well.
1548
1549 =back
1550
1551 On success in a list context, an array of 6 elements is returned. The elements are:
1552
1553 =over 4
1554
1555 =item [0]
1556
1557 the extracted tagged substring (including the outermost tags),
1558
1559 =item [1]
1560
1561 the remainder of the input text,
1562
1563 =item [2]
1564
1565 the prefix substring (if any),
1566
1567 =item [3]
1568
1569 the opening tag
1570
1571 =item [4]
1572
1573 the text between the opening and closing tags
1574
1575 =item [5]
1576
1577 the closing tag (or "" if no closing tag was found)
1578
1579 =back
1580
1581 On failure, all of these values (except the remaining text) are C<undef>.
1582
1583 In a scalar context, C<extract_tagged> returns just the complete
1584 substring that matched a tagged text (including the start and end
1585 tags). C<undef> is returned on failure. In addition, the original input
1586 text has the returned substring (and any prefix) removed from it.
1587
1588 In a void context, the input text just has the matched substring (and
1589 any specified prefix) removed.
1590
1591
1592 =head2 C<gen_extract_tagged>
1593
1594 (Note: This subroutine is only available under Perl5.005)
1595
1596 C<gen_extract_tagged> generates a new anonymous subroutine which
1597 extracts text between (balanced) specified tags. In other words,
1598 it generates a function identical in function to C<extract_tagged>.
1599
1600 The difference between C<extract_tagged> and the anonymous
1601 subroutines generated by
1602 C<gen_extract_tagged>, is that those generated subroutines:
1603
1604 =over 4
1605
1606 =item *
1607
1608 do not have to reparse tag specification or parsing options every time
1609 they are called (whereas C<extract_tagged> has to effectively rebuild
1610 its tag parser on every call);
1611
1612 =item *
1613
1614 make use of the new qr// construct to pre-compile the regexes they use
1615 (whereas C<extract_tagged> uses standard string variable interpolation
1616 to create tag-matching patterns).
1617
1618 =back
1619
1620 The subroutine takes up to four optional arguments (the same set as
1621 C<extract_tagged> except for the string to be processed). It returns
1622 a reference to a subroutine which in turn takes a single argument (the text to
1623 be extracted from).
1624
1625 In other words, the implementation of C<extract_tagged> is exactly
1626 equivalent to:
1627
1628         sub extract_tagged
1629         {
1630                 my $text = shift;
1631                 $extractor = gen_extract_tagged(@_);
1632                 return $extractor->($text);
1633         }
1634
1635 (although C<extract_tagged> is not currently implemented that way, in order
1636 to preserve pre-5.005 compatibility).
1637
1638 Using C<gen_extract_tagged> to create extraction functions for specific tags
1639 is a good idea if those functions are going to be called more than once, since
1640 their performance is typically twice as good as the more general-purpose
1641 C<extract_tagged>.
1642
1643
1644 =head2 C<extract_quotelike>
1645
1646 C<extract_quotelike> attempts to recognize, extract, and segment any
1647 one of the various Perl quotes and quotelike operators (see
1648 L<perlop(3)>). Nested backslashed delimiters, embedded balanced bracket
1649 delimiters (for the quotelike operators), and trailing modifiers are
1650 all caught. For example, in:
1651
1652         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1653
1654         extract_quotelike '  "You said, \"Use sed\"."  '
1655
1656         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1657
1658         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1659
1660 the full Perl quotelike operations are all extracted correctly.
1661
1662 Note too that, when using the /x modifier on a regex, any comment
1663 containing the current pattern delimiter will cause the regex to be
1664 immediately terminated. In other words:
1665
1666         'm /
1667                 (?i)            # CASE INSENSITIVE
1668                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1669                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1670            /x'
1671
1672 will be extracted as if it were:
1673
1674         'm /
1675                 (?i)            # CASE INSENSITIVE
1676                 [a-z_]          # LEADING ALPHABETIC/'
1677
1678 This behaviour is identical to that of the actual compiler.
1679
1680 C<extract_quotelike> takes two arguments: the text to be processed and
1681 a prefix to be matched at the very beginning of the text. If no prefix
1682 is specified, optional whitespace is the default. If no text is given,
1683 C<$_> is used.
1684
1685 In a list context, an array of 11 elements is returned. The elements are:
1686
1687 =over 4
1688
1689 =item [0]
1690
1691 the extracted quotelike substring (including trailing modifiers),
1692
1693 =item [1]
1694
1695 the remainder of the input text,
1696
1697 =item [2]
1698
1699 the prefix substring (if any),
1700
1701 =item [3]
1702
1703 the name of the quotelike operator (if any),
1704
1705 =item [4]
1706
1707 the left delimiter of the first block of the operation,
1708
1709 =item [5]
1710
1711 the text of the first block of the operation
1712 (that is, the contents of
1713 a quote, the regex of a match or substitution or the target list of a
1714 translation),
1715
1716 =item [6]
1717
1718 the right delimiter of the first block of the operation,
1719
1720 =item [7]
1721
1722 the left delimiter of the second block of the operation
1723 (that is, if it is a C<s>, C<tr>, or C<y>),
1724
1725 =item [8]
1726
1727 the text of the second block of the operation
1728 (that is, the replacement of a substitution or the translation list
1729 of a translation),
1730
1731 =item [9]
1732
1733 the right delimiter of the second block of the operation (if any),
1734
1735 =item [10]
1736
1737 the trailing modifiers on the operation (if any).
1738
1739 =back
1740
1741 For each of the fields marked "(if any)" the default value on success is
1742 an empty string.
1743 On failure, all of these values (except the remaining text) are C<undef>.
1744
1745
1746 In a scalar context, C<extract_quotelike> returns just the complete substring
1747 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1748 void context, the input text has the same substring (and any specified
1749 prefix) removed.
1750
1751 Examples:
1752
1753         # Remove the first quotelike literal that appears in text
1754
1755                 $quotelike = extract_quotelike($text,'.*?');
1756
1757         # Replace one or more leading whitespace-separated quotelike
1758         # literals in $_ with "<QLL>"
1759
1760                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1761
1762
1763         # Isolate the search pattern in a quotelike operation from $text
1764
1765                 ($op,$pat) = (extract_quotelike $text)[3,5];
1766                 if ($op =~ /[ms]/)
1767                 {
1768                         print "search pattern: $pat\n";
1769                 }
1770                 else
1771                 {
1772                         print "$op is not a pattern matching operation\n";
1773                 }
1774
1775
1776 =head2 C<extract_quotelike> and "here documents"
1777
1778 C<extract_quotelike> can successfully extract "here documents" from an input
1779 string, but with an important caveat in list contexts.
1780
1781 Unlike other types of quote-like literals, a here document is rarely
1782 a contiguous substring. For example, a typical piece of code using
1783 a here document might look like this:
1784
1785         <<'EOMSG' || die;
1786         This is the message.
1787         EOMSG
1788         exit;
1789
1790 Given this as an input string in a scalar context, C<extract_quotelike>
1791 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1792 leaving the string " || die;\nexit;" in the original variable. In other words,
1793 the two separate pieces of the here document are successfully extracted and
1794 concatenated.
1795
1796 In a list context, C<extract_quotelike> would return the list
1797
1798 =over 4
1799
1800 =item [0]
1801
1802 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1803 including fore and aft delimiters),
1804
1805 =item [1]
1806
1807 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1808
1809 =item [2]
1810
1811 "" (i.e. the prefix substring -- trivial in this case),
1812
1813 =item [3]
1814
1815 "<<" (i.e. the "name" of the quotelike operator)
1816
1817 =item [4]
1818
1819 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1820
1821 =item [5]
1822
1823 "This is the message.\n" (i.e. the text of the here document),
1824
1825 =item [6]
1826
1827 "EOMSG" (i.e. the right delimiter of the here document),
1828
1829 =item [7..10]
1830
1831 "" (a here document has no second left delimiter, second text, second right
1832 delimiter, or trailing modifiers).
1833
1834 =back
1835
1836 However, the matching position of the input variable would be set to
1837 "exit;" (i.e. I<after> the closing delimiter of the here document),
1838 which would cause the earlier " || die;\nexit;" to be skipped in any
1839 sequence of code fragment extractions.
1840
1841 To avoid this problem, when it encounters a here document whilst
1842 extracting from a modifiable string, C<extract_quotelike> silently
1843 rearranges the string to an equivalent piece of Perl:
1844
1845         <<'EOMSG'
1846         This is the message.
1847         EOMSG
1848         || die;
1849         exit;
1850
1851 in which the here document I<is> contiguous. It still leaves the
1852 matching position after the here document, but now the rest of the line
1853 on which the here document starts is not skipped.
1854
1855 To prevent C<extract_quotelike> from mucking about with the input in this way
1856 (this is the only case where a list-context C<extract_quotelike> does so),
1857 you can pass the input variable as an interpolated literal:
1858
1859         $quotelike = extract_quotelike("$var");
1860
1861
1862 =head2 C<extract_codeblock>
1863
1864 C<extract_codeblock> attempts to recognize and extract a balanced
1865 bracket delimited substring that may contain unbalanced brackets
1866 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1867 is like a combination of C<"extract_bracketed"> and
1868 C<"extract_quotelike">.
1869
1870 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1871 a text to process, a set of delimiter brackets to look for, and a prefix to
1872 match first. It also takes an optional fourth parameter, which allows the
1873 outermost delimiter brackets to be specified separately (see below).
1874
1875 Omitting the first argument (input text) means process C<$_> instead.
1876 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1877 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1878 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1879 value of the second argument is to be used for the outermost delimiters.
1880
1881 Once the prefix and the outermost opening delimiter bracket have been
1882 recognized, code blocks are extracted by stepping through the input text and
1883 trying the following alternatives in sequence:
1884
1885 =over 4
1886
1887 =item 1.
1888
1889 Try and match a closing delimiter bracket. If the bracket was the same
1890 species as the last opening bracket, return the substring to that
1891 point. If the bracket was mismatched, return an error.
1892
1893 =item 2.
1894
1895 Try to match a quote or quotelike operator. If found, call
1896 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1897 the error it returned. Otherwise go back to step 1.
1898
1899 =item 3.
1900
1901 Try to match an opening delimiter bracket. If found, call
1902 C<extract_codeblock> recursively to eat the embedded block. If the
1903 recursive call fails, return an error. Otherwise, go back to step 1.
1904
1905 =item 4.
1906
1907 Unconditionally match a bareword or any other single character, and
1908 then go back to step 1.
1909
1910 =back
1911
1912
1913 Examples:
1914
1915         # Find a while loop in the text
1916
1917                 if ($text =~ s/.*?while\s*\{/{/)
1918                 {
1919                         $loop = "while " . extract_codeblock($text);
1920                 }
1921
1922         # Remove the first round-bracketed list (which may include
1923         # round- or curly-bracketed code blocks or quotelike operators)
1924
1925                 extract_codeblock $text, "(){}", '[^(]*';
1926
1927
1928 The ability to specify a different outermost delimiter bracket is useful
1929 in some circumstances. For example, in the Parse::RecDescent module,
1930 parser actions which are to be performed only on a successful parse
1931 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1932
1933         sentence: subject verb object
1934                         <defer: {$::theVerb = $item{verb}} >
1935
1936 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1937 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1938
1939 A deferred action like this:
1940
1941                         <defer: {if ($count>10) {$count--}} >
1942
1943 will be incorrectly parsed as:
1944
1945                         <defer: {if ($count>
1946
1947 because the "greater than" operator is interpreted as a closing delimiter.
1948
1949 But, by extracting the directive using
1950 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1951 the '>' character is only treated as a delimiter at the outermost
1952 level of the code block, so the directive is parsed correctly.
1953
1954 =head2 C<extract_multiple>
1955
1956 The C<extract_multiple> subroutine takes a string to be processed and a
1957 list of extractors (subroutines or regular expressions) to apply to that string.
1958
1959 In an array context C<extract_multiple> returns an array of substrings
1960 of the original string, as extracted by the specified extractors.
1961 In a scalar context, C<extract_multiple> returns the first
1962 substring successfully extracted from the original string. In both
1963 scalar and void contexts the original string has the first successfully
1964 extracted substring removed from it. In all contexts
1965 C<extract_multiple> starts at the current C<pos> of the string, and
1966 sets that C<pos> appropriately after it matches.
1967
1968 Hence, the aim of a call to C<extract_multiple> in a list context
1969 is to split the processed string into as many non-overlapping fields as
1970 possible, by repeatedly applying each of the specified extractors
1971 to the remainder of the string. Thus C<extract_multiple> is
1972 a generalized form of Perl's C<split> subroutine.
1973
1974 The subroutine takes up to four optional arguments:
1975
1976 =over 4
1977
1978 =item 1.
1979
1980 A string to be processed (C<$_> if the string is omitted or C<undef>)
1981
1982 =item 2.
1983
1984 A reference to a list of subroutine references and/or qr// objects and/or
1985 literal strings and/or hash references, specifying the extractors
1986 to be used to split the string. If this argument is omitted (or
1987 C<undef>) the list:
1988
1989         [
1990                 sub { extract_variable($_[0], '') },
1991                 sub { extract_quotelike($_[0],'') },
1992                 sub { extract_codeblock($_[0],'{}','') },
1993         ]
1994
1995 is used.
1996
1997
1998 =item 3.
1999
2000 A number specifying the maximum number of fields to return. If this
2001 argument is omitted (or C<undef>), split continues as long as possible.
2002
2003 If the third argument is I<N>, then extraction continues until I<N> fields
2004 have been successfully extracted, or until the string has been completely
2005 processed.
2006
2007 Note that in scalar and void contexts the value of this argument is
2008 automatically reset to 1 (under C<-w>, a warning is issued if the argument
2009 has to be reset).
2010
2011 =item 4.
2012
2013 A value indicating whether unmatched substrings (see below) within the
2014 text should be skipped or returned as fields. If the value is true,
2015 such substrings are skipped. Otherwise, they are returned.
2016
2017 =back
2018
2019 The extraction process works by applying each extractor in
2020 sequence to the text string.
2021
2022 If the extractor is a subroutine it is called in a list context and is
2023 expected to return a list of a single element, namely the extracted
2024 text. It may optionally also return two further arguments: a string
2025 representing the text left after extraction (like $' for a pattern
2026 match), and a string representing any prefix skipped before the
2027 extraction (like $` in a pattern match). Note that this is designed
2028 to facilitate the use of other Text::Balanced subroutines with
2029 C<extract_multiple>. Note too that the value returned by an extractor
2030 subroutine need not bear any relationship to the corresponding substring
2031 of the original text (see examples below).
2032
2033 If the extractor is a precompiled regular expression or a string,
2034 it is matched against the text in a scalar context with a leading
2035 '\G' and the gc modifiers enabled. The extracted value is either
2036 $1 if that variable is defined after the match, or else the
2037 complete match (i.e. $&).
2038
2039 If the extractor is a hash reference, it must contain exactly one element.
2040 The value of that element is one of the
2041 above extractor types (subroutine reference, regular expression, or string).
2042 The key of that element is the name of a class into which the successful
2043 return value of the extractor will be blessed.
2044
2045 If an extractor returns a defined value, that value is immediately
2046 treated as the next extracted field and pushed onto the list of fields.
2047 If the extractor was specified in a hash reference, the field is also
2048 blessed into the appropriate class.
2049
2050 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2051 assumed to have failed to extract.
2052 If none of the extractor subroutines succeeds, then one
2053 character is extracted from the start of the text and the extraction
2054 subroutines reapplied. Characters which are thus removed are accumulated and
2055 eventually become the next field (unless the fourth argument is true, in which
2056 case they are discarded).
2057
2058 For example, the following extracts substrings that are valid Perl variables:
2059
2060         @fields = extract_multiple($text,
2061                                    [ sub { extract_variable($_[0]) } ],
2062                                    undef, 1);
2063
2064 This example separates a text into fields which are quote delimited,
2065 curly bracketed, and anything else. The delimited and bracketed
2066 parts are also blessed to identify them (the "anything else" is unblessed):
2067
2068         @fields = extract_multiple($text,
2069                    [
2070                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
2071                         { Brack => sub { extract_bracketed($_[0],'{}') } },
2072                    ]);
2073
2074 This call extracts the next single substring that is a valid Perl quotelike
2075 operator (and removes it from $text):
2076
2077         $quotelike = extract_multiple($text,
2078                                       [
2079                                         sub { extract_quotelike($_[0]) },
2080                                       ], undef, 1);
2081
2082 Finally, here is yet another way to do comma-separated value parsing:
2083
2084         @fields = extract_multiple($csv_text,
2085                                   [
2086                                         sub { extract_delimited($_[0],q{'"}) },
2087                                         qr/([^,]+)(.*)/,
2088                                   ],
2089                                   undef,1);
2090
2091 The list in the second argument means:
2092 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2093 The undef third argument means:
2094 I<"...as many times as possible...">,
2095 and the true value in the fourth argument means
2096 I<"...discarding anything else that appears (i.e. the commas)">.
2097
2098 If you wanted the commas preserved as separate fields (i.e. like split
2099 does if your split pattern has capturing parentheses), you would
2100 just make the last parameter undefined (or remove it).
2101
2102
2103 =head2 C<gen_delimited_pat>
2104
2105 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2106 builds a Friedl-style optimized regex that matches a string delimited
2107 by any one of the characters in the single argument. For example:
2108
2109         gen_delimited_pat(q{'"})
2110
2111 returns the regex:
2112
2113         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2114
2115 Note that the specified delimiters are automatically quotemeta'd.
2116
2117 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2118 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2119 (which might contain quoted strings):
2120
2121         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2122
2123         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2124
2125
2126 C<gen_delimited_pat> may also be called with an optional second argument,
2127 which specifies the "escape" character(s) to be used for each delimiter.
2128 For example to match a Pascal-style string (where ' is the delimiter
2129 and '' is a literal ' within the string):
2130
2131         gen_delimited_pat(q{'},q{'});
2132
2133 Different escape characters can be specified for different delimiters.
2134 For example, to specify that '/' is the escape for single quotes
2135 and '%' is the escape for double quotes:
2136
2137         gen_delimited_pat(q{'"},q{/%});
2138
2139 If more delimiters than escape chars are specified, the last escape char
2140 is used for the remaining delimiters.
2141 If no escape char is specified for a given specified delimiter, '\' is used.
2142
2143 Note that
2144 C<gen_delimited_pat> was previously called
2145 C<delimited_pat>. That name may still be used, but is now deprecated.
2146
2147
2148 =head1 DIAGNOSTICS
2149
2150 In a list context, all the functions return C<(undef,$original_text)>
2151 on failure. In a scalar context, failure is indicated by returning C<undef>
2152 (in this case the input text is not modified in any way).
2153
2154 In addition, on failure in I<any> context, the C<$@> variable is set.
2155 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2156 below.
2157 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2158 which the error was detected (although not necessarily where it occurred!)
2159 Printing C<$@> directly produces the error message, with the offset appended.
2160 On success, the C<$@> variable is guaranteed to be C<undef>.
2161
2162 The available diagnostics are:
2163
2164 =over 4
2165
2166 =item  C<Did not find a suitable bracket: "%s">
2167
2168 The delimiter provided to C<extract_bracketed> was not one of
2169 C<'()[]E<lt>E<gt>{}'>.
2170
2171 =item  C<Did not find prefix: /%s/>
2172
2173 A non-optional prefix was specified but wasn't found at the start of the text.
2174
2175 =item  C<Did not find opening bracket after prefix: "%s">
2176
2177 C<extract_bracketed> or C<extract_codeblock> was expecting a
2178 particular kind of bracket at the start of the text, and didn't find it.
2179
2180 =item  C<No quotelike operator found after prefix: "%s">
2181
2182 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2183 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2184 it was extracting.
2185
2186 =item  C<Unmatched closing bracket: "%c">
2187
2188 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2189 a closing bracket where none was expected.
2190
2191 =item  C<Unmatched opening bracket(s): "%s">
2192
2193 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2194 out of characters in the text before closing one or more levels of nested
2195 brackets.
2196
2197 =item C<Unmatched embedded quote (%s)>
2198
2199 C<extract_bracketed> attempted to match an embedded quoted substring, but
2200 failed to find a closing quote to match it.
2201
2202 =item C<Did not find closing delimiter to match '%s'>
2203
2204 C<extract_quotelike> was unable to find a closing delimiter to match the
2205 one that opened the quote-like operation.
2206
2207 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2208
2209 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2210 a valid bracket delimiter, but it was the wrong species. This usually
2211 indicates a nesting error, but may indicate incorrect quoting or escaping.
2212
2213 =item  C<No block delimiter found after quotelike "%s">
2214
2215 C<extract_quotelike> or C<extract_codeblock> found one of the
2216 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2217 without a suitable block after it.
2218
2219 =item C<Did not find leading dereferencer>
2220
2221 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2222 a variable, but didn't find any of them.
2223
2224 =item C<Bad identifier after dereferencer>
2225
2226 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2227 character was not followed by a legal Perl identifier.
2228
2229 =item C<Did not find expected opening bracket at %s>
2230
2231 C<extract_codeblock> failed to find any of the outermost opening brackets
2232 that were specified.
2233
2234 =item C<Improperly nested codeblock at %s>
2235
2236 A nested code block was found that started with a delimiter that was specified
2237 as being only to be used as an outermost bracket.
2238
2239 =item  C<Missing second block for quotelike "%s">
2240
2241 C<extract_codeblock> or C<extract_quotelike> found one of the
2242 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2243
2244 =item C<No match found for opening bracket>
2245
2246 C<extract_codeblock> failed to find a closing bracket to match the outermost
2247 opening bracket.
2248
2249 =item C<Did not find opening tag: /%s/>
2250
2251 C<extract_tagged> did not find a suitable opening tag (after any specified
2252 prefix was removed).
2253
2254 =item C<Unable to construct closing tag to match: /%s/>
2255
2256 C<extract_tagged> matched the specified opening tag and tried to
2257 modify the matched text to produce a matching closing tag (because
2258 none was specified). It failed to generate the closing tag, almost
2259 certainly because the opening tag did not start with a
2260 bracket of some kind.
2261
2262 =item C<Found invalid nested tag: %s>
2263
2264 C<extract_tagged> found a nested tag that appeared in the "reject" list
2265 (and the failure mode was not "MAX" or "PARA").
2266
2267 =item C<Found unbalanced nested tag: %s>
2268
2269 C<extract_tagged> found a nested opening tag that was not matched by a
2270 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2271
2272 =item C<Did not find closing tag>
2273
2274 C<extract_tagged> reached the end of the text without finding a closing tag
2275 to match the original opening tag (and the failure mode was not
2276 "MAX" or "PARA").
2277
2278
2279
2280
2281 =back
2282
2283
2284 =head1 AUTHOR
2285
2286 Damian Conway (damian@conway.org)
2287
2288
2289 =head1 BUGS AND IRRITATIONS
2290
2291 There are undoubtedly serious bugs lurking somewhere in this code, if
2292 only because parts of it give the impression of understanding a great deal
2293 more about Perl than they really do.
2294
2295 Bug reports and other feedback are most welcome.
2296
2297
2298 =head1 COPYRIGHT
2299
2300  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2301  This module is free software. It may be used, redistributed
2302      and/or modified under the same terms as Perl itself.