lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  11
  12 $VERSION = '1.95_01';
  13 @ISA            = qw ( Exporter );
  14
  15 %EXPORT_TAGS    = ( ALL => [ qw(
  16                                 &extract_delimited
  17                                 &extract_bracketed
  18                                 &extract_quotelike
  19                                 &extract_codeblock
  20                                 &extract_variable
  21                                 &extract_tagged
  22                                 &extract_multiple
  23
  24                                 &gen_delimited_pat
  25                                 &gen_extract_tagged
  26
  27                                 &delimited_pat
  28                                ) ] );
  29
  30 Exporter::export_ok_tags('ALL');
  31
  32 # PROTOTYPES
  33
  34 sub _match_bracketed($$$$$$);
  35 sub _match_variable($$);
  36 sub _match_codeblock($$$$$$$);
  37 sub _match_quotelike($$$$);
  38
  39 sub carp {
  40   require Carp; goto &Carp::carp;
  41 }
  42
  43 sub croak {
  44   require Carp; goto &Carp::croak;
  45 }
  46
  47 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  48
  49 sub _failmsg {
  50         my ($message, $pos) = @_;
  51         $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
  52 }
  53
  54 sub _fail
  55 {
  56         my ($wantarray, $textref, $message, $pos) = @_;
  57         _failmsg $message, $pos if $message;
  58         return ("",$$textref,"") if $wantarray;
  59         return undef;
  60 }
  61
  62 sub _succeed
  63 {
  64         $@ = undef;
  65         my ($wantarray,$textref) = splice @_, 0, 2;
  66         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  67         my ($startlen) = $_[5];
  68         my $oppos = $_[6];
  69         my $remainderpos = $_[2];
  70         if ($wantarray)
  71         {
  72                 my @res;
  73                 while (my ($from, $len) = splice @_, 0, 2)
  74                 {
  75                         push @res, substr($$textref,$from,$len);
  76                 }
  77                 if ($extralen) {        # CORRECT FILLET
  78                         my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
  79                         $res[1] = "$extra$res[1]";
  80                         eval { substr($$textref,$remainderpos,0) = $extra;
  81                                substr($$textref,$extrapos,$extralen,"\n")} ;
  82                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  83                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  84                 }
  85                 else {
  86                         pos($$textref) = $remainderpos;             # RESET \G
  87                 }
  88                 return @res;
  89         }
  90         else
  91         {
  92                 my $match = substr($$textref,$_[0],$_[1]);
  93                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  94                 my $extra = $extralen
  95                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
  96                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  97                 pos($$textref) = $_[4];                         # RESET \G
  98                 return $match;
  99         }
 100 }
 101
 102 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
 103
 104 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
 105 {
 106         my ($dels, $escs) = @_;
 107         return "" unless $dels =~ /\S/;
 108         $escs = '\\' unless $escs;
 109         $escs .= substr($escs,-1) x (length($dels)-length($escs));
 110         my @pat = ();
 111         my $i;
 112         for ($i=0; $i<length $dels; $i++)
 113         {
 114                 my $del = quotemeta substr($dels,$i,1);
 115                 my $esc = quotemeta substr($escs,$i,1);
 116                 if ($del eq $esc)
 117                 {
 118                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 119                 }
 120                 else
 121                 {
 122                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 123                 }
 124         }
 125         my $pat = join '|', @pat;
 126         return "(?:$pat)";
 127 }
 128
 129 *delimited_pat = \&gen_delimited_pat;
 130
 131
 132 # THE EXTRACTION FUNCTIONS
 133
 134 sub extract_delimited (;$$$$)
 135 {
 136         my $textref = defined $_[0] ? \$_[0] : \$_;
 137         my $wantarray = wantarray;
 138         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 139         my $pre  = defined $_[2] ? $_[2] : '\s*';
 140         my $esc  = defined $_[3] ? $_[3] : qq{\\};
 141         my $pat = gen_delimited_pat($del, $esc);
 142         my $startpos = pos $$textref || 0;
 143         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 144                 unless $$textref =~ m/\G($pre)($pat)/gc;
 145         my $prelen = length($1);
 146         my $matchpos = $startpos+$prelen;
 147         my $endpos = pos $$textref;
 148         return _succeed $wantarray, $textref,
 149                         $matchpos, $endpos-$matchpos,           # MATCH
 150                         $endpos,   length($$textref)-$endpos,   # REMAINDER
 151                         $startpos, $prelen;                     # PREFIX
 152 }
 153
 154 sub extract_bracketed (;$$$)
 155 {
 156         my $textref = defined $_[0] ? \$_[0] : \$_;
 157         my $ldel = defined $_[1] ? $_[1] : '{([<';
 158         my $pre  = defined $_[2] ? $_[2] : '\s*';
 159         my $wantarray = wantarray;
 160         my $qdel = "";
 161         my $quotelike;
 162         $ldel =~ s/'//g and $qdel .= q{'};
 163         $ldel =~ s/"//g and $qdel .= q{"};
 164         $ldel =~ s/`//g and $qdel .= q{`};
 165         $ldel =~ s/q//g and $quotelike = 1;
 166         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 167         my $rdel = $ldel;
 168         unless ($rdel =~ tr/[({</])}>/)
 169         {
 170                 return _fail $wantarray, $textref,
 171                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 172                              0;
 173         }
 174         my $posbug = pos;
 175         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 176         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 177         pos = $posbug;
 178
 179         my $startpos = pos $$textref || 0;
 180         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 181
 182         return _fail ($wantarray, $textref) unless @match;
 183
 184         return _succeed ( $wantarray, $textref,
 185                           $match[2], $match[5]+2,       # MATCH
 186                           @match[8,9],                  # REMAINDER
 187                           @match[0,1],                  # PREFIX
 188                         );
 189 }
 190
 191 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 192 {
 193         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
 194         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 195         unless ($$textref =~ m/\G$pre/gc)
 196         {
 197                 _failmsg "Did not find prefix: /$pre/", $startpos;
 198                 return;
 199         }
 200
 201         $ldelpos = pos $$textref;
 202
 203         unless ($$textref =~ m/\G($ldel)/gc)
 204         {
 205                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 206                          pos $$textref;
 207                 pos $$textref = $startpos;
 208                 return;
 209         }
 210
 211         my @nesting = ( $1 );
 212         my $textlen = length $$textref;
 213         while (pos $$textref < $textlen)
 214         {
 215                 next if $$textref =~ m/\G\\./gcs;
 216
 217                 if ($$textref =~ m/\G($ldel)/gc)
 218                 {
 219                         push @nesting, $1;
 220                 }
 221                 elsif ($$textref =~ m/\G($rdel)/gc)
 222                 {
 223                         my ($found, $brackettype) = ($1, $1);
 224                         if ($#nesting < 0)
 225                         {
 226                                 _failmsg "Unmatched closing bracket: \"$found\"",
 227                                          pos $$textref;
 228                                 pos $$textref = $startpos;
 229                                 return;
 230                         }
 231                         my $expected = pop(@nesting);
 232                         $expected =~ tr/({[</)}]>/;
 233                         if ($expected ne $brackettype)
 234                         {
 235                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 236                                          pos $$textref;
 237                                 pos $$textref = $startpos;
 238                                 return;
 239                         }
 240                         last if $#nesting < 0;
 241                 }
 242                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 243                 {
 244                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 245                         _failmsg "Unmatched embedded quote ($1)",
 246                                  pos $$textref;
 247                         pos $$textref = $startpos;
 248                         return;
 249                 }
 250                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
 251                 {
 252                         next;
 253                 }
 254
 255                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 256         }
 257         if ($#nesting>=0)
 258         {
 259                 _failmsg "Unmatched opening bracket(s): "
 260                                 . join("..",@nesting)."..",
 261                          pos $$textref;
 262                 pos $$textref = $startpos;
 263                 return;
 264         }
 265
 266         $endpos = pos $$textref;
 267
 268         return (
 269                 $startpos,  $ldelpos-$startpos,         # PREFIX
 270                 $ldelpos,   1,                          # OPENING BRACKET
 271                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
 272                 $endpos-1,  1,                          # CLOSING BRACKET
 273                 $endpos,    length($$textref)-$endpos,  # REMAINDER
 274                );
 275 }
 276
 277 sub revbracket($)
 278 {
 279         my $brack = reverse $_[0];
 280         $brack =~ tr/[({</])}>/;
 281         return $brack;
 282 }
 283
 284 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 285
 286 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 287 {
 288         my $textref = defined $_[0] ? \$_[0] : \$_;
 289         my $ldel    = $_[1];
 290         my $rdel    = $_[2];
 291         my $pre     = defined $_[3] ? $_[3] : '\s*';
 292         my %options = defined $_[4] ? %{$_[4]} : ();
 293         my $omode   = defined $options{fail} ? $options{fail} : '';
 294         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 295                     : defined($options{reject})        ? $options{reject}
 296                     :                                    ''
 297                     ;
 298         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 299                     : defined($options{ignore})        ? $options{ignore}
 300                     :                                    ''
 301                     ;
 302
 303         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 304         $@ = undef;
 305
 306         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 307
 308         return _fail(wantarray, $textref) unless @match;
 309         return _succeed wantarray, $textref,
 310                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 311                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 312 }
 313
 314 sub _match_tagged       # ($$$$$$$)
 315 {
 316         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 317         my $rdelspec;
 318
 319         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 320
 321         unless ($$textref =~ m/\G($pre)/gc)
 322         {
 323                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 324                 goto failed;
 325         }
 326
 327         $opentagpos = pos($$textref);
 328
 329         unless ($$textref =~ m/\G$ldel/gc)
 330         {
 331                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 332                 goto failed;
 333         }
 334
 335         $textpos = pos($$textref);
 336
 337         if (!defined $rdel)
 338         {
 339                 $rdelspec = $&;
 340                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
 341                 {
 342                         _failmsg "Unable to construct closing tag to match: $rdel",
 343                                  pos $$textref;
 344                         goto failed;
 345                 }
 346         }
 347         else
 348         {
 349                 $rdelspec = eval "qq{$rdel}" || do {
 350                         my $del;
 351                         for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
 352                                 { next if $rdel =~ /\Q$_/; $del = $_; last }
 353                         unless ($del) {
 354                                 croak ("Can't interpolate right delimiter $rdel")
 355                         }
 356                         eval "qq$del$rdel$del";
 357                 };
 358         }
 359
 360         while (pos($$textref) < length($$textref))
 361         {
 362                 next if $$textref =~ m/\G\\./gc;
 363
 364                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 365                 {
 366                         $parapos = pos($$textref) - length($1)
 367                                 unless defined $parapos;
 368                 }
 369                 elsif ($$textref =~ m/\G($rdelspec)/gc )
 370                 {
 371                         $closetagpos = pos($$textref)-length($1);
 372                         goto matched;
 373                 }
 374                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 375                 {
 376                         next;
 377                 }
 378                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 379                 {
 380                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
 381                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 382                         _failmsg "Found invalid nested tag: $1", pos $$textref;
 383                         goto failed;
 384                 }
 385                 elsif ($$textref =~ m/\G($ldel)/gc)
 386                 {
 387                         my $tag = $1;
 388                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
 389                         unless (_match_tagged(@_))      # MATCH NESTED TAG
 390                         {
 391                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
 392                                 _failmsg "Found unbalanced nested tag: $tag",
 393                                          pos $$textref;
 394                                 goto failed;
 395                         }
 396                 }
 397                 else { $$textref =~ m/./gcs }
 398         }
 399
 400 short:
 401         $closetagpos = pos($$textref);
 402         goto matched if $omode eq 'MAX';
 403         goto failed unless $omode eq 'PARA';
 404
 405         if (defined $parapos) { pos($$textref) = $parapos }
 406         else                  { $parapos = pos($$textref) }
 407
 408         return (
 409                 $startpos,    $opentagpos-$startpos,            # PREFIX
 410                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 411                 $textpos,     $parapos-$textpos,                # TEXT
 412                 $parapos,     0,                                # NO CLOSING TAG
 413                 $parapos,     length($$textref)-$parapos,       # REMAINDER
 414                );
 415
 416 matched:
 417         $endpos = pos($$textref);
 418         return (
 419                 $startpos,    $opentagpos-$startpos,            # PREFIX
 420                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 421                 $textpos,     $closetagpos-$textpos,            # TEXT
 422                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
 423                 $endpos,      length($$textref)-$endpos,        # REMAINDER
 424                );
 425
 426 failed:
 427         _failmsg "Did not find closing tag", pos $$textref unless $@;
 428         pos($$textref) = $startpos;
 429         return;
 430 }
 431
 432 sub extract_variable (;$$)
 433 {
 434         my $textref = defined $_[0] ? \$_[0] : \$_;
 435         return ("","","") unless defined $$textref;
 436         my $pre  = defined $_[1] ? $_[1] : '\s*';
 437
 438         my @match = _match_variable($textref,$pre);
 439
 440         return _fail wantarray, $textref unless @match;
 441
 442         return _succeed wantarray, $textref,
 443                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
 444 }
 445
 446 sub _match_variable($$)
 447 {
 448 #  $#
 449 #  $^
 450 #  $$
 451         my ($textref, $pre) = @_;
 452         my $startpos = pos($$textref) = pos($$textref)||0;
 453         unless ($$textref =~ m/\G($pre)/gc)
 454         {
 455                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 456                 return;
 457         }
 458         my $varpos = pos($$textref);
 459         unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
 460         {
 461             unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
 462             {
 463                 _failmsg "Did not find leading dereferencer", pos $$textref;
 464                 pos $$textref = $startpos;
 465                 return;
 466             }
 467             my $deref = $1;
 468
 469             unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 470                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
 471                 or $deref eq '$#' or $deref eq '$$' )
 472             {
 473                 _failmsg "Bad identifier after dereferencer", pos $$textref;
 474                 pos $$textref = $startpos;
 475                 return;
 476             }
 477         }
 478
 479         while (1)
 480         {
 481                 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
 482                 next if _match_codeblock($textref,
 483                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 484                                          qr/[({[]/, qr/[)}\]]/,
 485                                          qr/[({[]/, qr/[)}\]]/, 0);
 486                 next if _match_codeblock($textref,
 487                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
 488                                          qr/[{[]/, qr/[}\]]/, 0);
 489                 next if _match_variable($textref,'\s*->\s*');
 490                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 491                 last;
 492         }
 493
 494         my $endpos = pos($$textref);
 495         return ($startpos, $varpos-$startpos,
 496                 $varpos,   $endpos-$varpos,
 497                 $endpos,   length($$textref)-$endpos
 498                 );
 499 }
 500
 501 sub extract_codeblock (;$$$$$)
 502 {
 503         my $textref = defined $_[0] ? \$_[0] : \$_;
 504         my $wantarray = wantarray;
 505         my $ldel_inner = defined $_[1] ? $_[1] : '{';
 506         my $pre        = defined $_[2] ? $_[2] : '\s*';
 507         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
 508         my $rd         = $_[4];
 509         my $rdel_inner = $ldel_inner;
 510         my $rdel_outer = $ldel_outer;
 511         my $posbug = pos;
 512         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
 513         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
 514         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
 515         {
 516                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
 517         }
 518         pos = $posbug;
 519
 520         my @match = _match_codeblock($textref, $pre,
 521                                      $ldel_outer, $rdel_outer,
 522                                      $ldel_inner, $rdel_inner,
 523                                      $rd);
 524         return _fail($wantarray, $textref) unless @match;
 525         return _succeed($wantarray, $textref,
 526                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
 527                        );
 528
 529 }
 530
 531 sub _match_codeblock($$$$$$$)
 532 {
 533         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 534         my $startpos = pos($$textref) = pos($$textref) || 0;
 535         unless ($$textref =~ m/\G($pre)/gc)
 536         {
 537                 _failmsg qq{Did not match prefix /$pre/ at"} .
 538                             substr($$textref,pos($$textref),20) .
 539                             q{..."},
 540                          pos $$textref;
 541                 return;
 542         }
 543         my $codepos = pos($$textref);
 544         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
 545         {
 546                 _failmsg qq{Did not find expected opening bracket at "} .
 547                              substr($$textref,pos($$textref),20) .
 548                              q{..."},
 549                          pos $$textref;
 550                 pos $$textref = $startpos;
 551                 return;
 552         }
 553         my $closing = $1;
 554            $closing =~ tr/([<{/)]>}/;
 555         my $matched;
 556         my $patvalid = 1;
 557         while (pos($$textref) < length($$textref))
 558         {
 559                 $matched = '';
 560                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 561                 {
 562                         $patvalid = 0;
 563                         next;
 564                 }
 565
 566                 if ($$textref =~ m/\G\s*#.*/gc)
 567                 {
 568                         next;
 569                 }
 570
 571                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 572                 {
 573                         unless ($matched = ($closing && $1 eq $closing) )
 574                         {
 575                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
 576                                 _failmsg q{Mismatched closing bracket at "} .
 577                                              substr($$textref,pos($$textref),20) .
 578                                              qq{...". Expected '$closing'},
 579                                          pos $$textref;
 580                         }
 581                         last;
 582                 }
 583
 584                 if (_match_variable($textref,'\s*') ||
 585                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 586                 {
 587                         $patvalid = 0;
 588                         next;
 589                 }
 590
 591
 592                 # NEED TO COVER MANY MORE CASES HERE!!!
 593                 # NB 'case' is included here, because in Switch.pm,
 594                 # it's followed by a term, not an op
 595
 596                 if ($$textref =~ m#\G\s*(?!$ldel_inner)
 597                                         ( [-+*x/%^&|.]=?
 598                                         | [!=]~
 599                                         | =(?!>)
 600                                         | (\*\*|&&|\|\||<<|>>)=?
 601                                         | case|split|grep|map|return
 602                                         | [([]
 603                                         )#gcx)
 604                 {
 605                         $patvalid = 1;
 606                         next;
 607                 }
 608
 609                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 610                 {
 611                         $patvalid = 1;
 612                         next;
 613                 }
 614
 615                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
 616                 {
 617                         _failmsg q{Improperly nested codeblock at "} .
 618                                      substr($$textref,pos($$textref),20) .
 619                                      q{..."},
 620                                  pos $$textref;
 621                         last;
 622                 }
 623
 624                 $patvalid = 0;
 625                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 626         }
 627         continue { $@ = undef }
 628
 629         unless ($matched)
 630         {
 631                 _failmsg 'No match found for opening bracket', pos $$textref
 632                         unless $@;
 633                 return;
 634         }
 635
 636         my $endpos = pos($$textref);
 637         return ( $startpos, $codepos-$startpos,
 638                  $codepos, $endpos-$codepos,
 639                  $endpos,  length($$textref)-$endpos,
 640                );
 641 }
 642
 643
 644 my %mods   = (
 645                 'none'  => '[cgimsox]*',
 646                 'm'     => '[cgimsox]*',
 647                 's'     => '[cegimsox]*',
 648                 'tr'    => '[cds]*',
 649                 'y'     => '[cds]*',
 650                 'qq'    => '',
 651                 'qx'    => '',
 652                 'qw'    => '',
 653                 'qr'    => '[imsx]*',
 654                 'q'     => '',
 655              );
 656
 657 sub extract_quotelike (;$$)
 658 {
 659         my $textref = $_[0] ? \$_[0] : \$_;
 660         my $wantarray = wantarray;
 661         my $pre  = defined $_[1] ? $_[1] : '\s*';
 662
 663         my @match = _match_quotelike($textref,$pre,1,0);
 664         return _fail($wantarray, $textref) unless @match;
 665         return _succeed($wantarray, $textref,
 666                         $match[2], $match[18]-$match[2],        # MATCH
 667                         @match[18,19],                          # REMAINDER
 668                         @match[0,1],                            # PREFIX
 669                         @match[2..17],                          # THE BITS
 670                         @match[20,21],                          # ANY FILLET?
 671                        );
 672 };
 673
 674 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 675 {
 676         my ($textref, $pre, $rawmatch, $qmark) = @_;
 677
 678         my ($textlen,$startpos,
 679             $oppos,
 680             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 681             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 682             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 683
 684         unless ($$textref =~ m/\G($pre)/gc)
 685         {
 686                 _failmsg qq{Did not find prefix /$pre/ at "} .
 687                              substr($$textref, pos($$textref), 20) .
 688                              q{..."},
 689                          pos $$textref;
 690                 return;
 691         }
 692         $oppos = pos($$textref);
 693
 694         my $initial = substr($$textref,$oppos,1);
 695
 696         if ($initial && $initial =~ m|^[\"\'\`]|
 697                      || $rawmatch && $initial =~ m|^/|
 698                      || $qmark && $initial =~ m|^\?|)
 699         {
 700                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 701                 {
 702                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 703                                      substr($$textref, $oppos, 20) .
 704                                      q{..."},
 705                                  pos $$textref;
 706                         pos $$textref = $startpos;
 707                         return;
 708                 }
 709                 $modpos= pos($$textref);
 710                 $rd1pos = $modpos-1;
 711
 712                 if ($initial eq '/' || $initial eq '?')
 713                 {
 714                         $$textref =~ m/\G$mods{none}/gc
 715                 }
 716
 717                 my $endpos = pos($$textref);
 718                 return (
 719                         $startpos,      $oppos-$startpos,       # PREFIX
 720                         $oppos,         0,                      # NO OPERATOR
 721                         $oppos,         1,                      # LEFT DEL
 722                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 723                         $rd1pos,        1,                      # RIGHT DEL
 724                         $modpos,        0,                      # NO 2ND LDEL
 725                         $modpos,        0,                      # NO 2ND STR
 726                         $modpos,        0,                      # NO 2ND RDEL
 727                         $modpos,        $endpos-$modpos,        # MODIFIERS
 728                         $endpos,        $textlen-$endpos,       # REMAINDER
 729                        );
 730         }
 731
 732         unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
 733         {
 734                 _failmsg q{No quotelike operator found after prefix at "} .
 735                              substr($$textref, pos($$textref), 20) .
 736                              q{..."},
 737                          pos $$textref;
 738                 pos $$textref = $startpos;
 739                 return;
 740         }
 741
 742         my $op = $1;
 743         $preld1pos = pos($$textref);
 744         if ($op eq '<<') {
 745                 $ld1pos = pos($$textref);
 746                 my $label;
 747                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 748                         $label = $1;
 749                 }
 750                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 751                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 752                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 753                                      }gcsx) {
 754                         $label = $+;
 755                 }
 756                 else {
 757                         $label = "";
 758                 }
 759                 my $extrapos = pos($$textref);
 760                 $$textref =~ m{.*\n}gc;
 761                 $str1pos = pos($$textref)--;
 762                 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
 763                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 764                                      substr($$textref, $startpos, 20) .
 765                                      q{..."},
 766                                  pos $$textref;
 767                         pos $$textref = $startpos;
 768                         return;
 769                 }
 770                 $rd1pos = pos($$textref);
 771                 $$textref =~ m{\Q$label\E\n}gc;
 772                 $ld2pos = pos($$textref);
 773                 return (
 774                         $startpos,      $oppos-$startpos,       # PREFIX
 775                         $oppos,         length($op),            # OPERATOR
 776                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 777                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 778                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 779                         $ld2pos,        0,                      # NO 2ND LDEL
 780                         $ld2pos,        0,                      # NO 2ND STR
 781                         $ld2pos,        0,                      # NO 2ND RDEL
 782                         $ld2pos,        0,                      # NO MODIFIERS
 783                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 784                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 785                        );
 786         }
 787
 788         $$textref =~ m/\G\s*/gc;
 789         $ld1pos = pos($$textref);
 790         $str1pos = $ld1pos+1;
 791
 792         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 793         {
 794                 _failmsg "No block delimiter found after quotelike $op",
 795                          pos $$textref;
 796                 pos $$textref = $startpos;
 797                 return;
 798         }
 799         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 800         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 801         if ($ldel1 =~ /[[(<{]/)
 802         {
 803                 $rdel1 =~ tr/[({</])}>/;
 804                 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
 805                 || do { pos $$textref = $startpos; return };
 806         }
 807         else
 808         {
 809                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 810                 || do { pos $$textref = $startpos; return };
 811         }
 812         $ld2pos = $rd1pos = pos($$textref)-1;
 813
 814         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 815         if ($second_arg)
 816         {
 817                 my ($ldel2, $rdel2);
 818                 if ($ldel1 =~ /[[(<{]/)
 819                 {
 820                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 821                         {
 822                                 _failmsg "Missing second block for quotelike $op",
 823                                          pos $$textref;
 824                                 pos $$textref = $startpos;
 825                                 return;
 826                         }
 827                         $ldel2 = $rdel2 = "\Q$1";
 828                         $rdel2 =~ tr/[({</])}>/;
 829                 }
 830                 else
 831                 {
 832                         $ldel2 = $rdel2 = $ldel1;
 833                 }
 834                 $str2pos = $ld2pos+1;
 835
 836                 if ($ldel2 =~ /[[(<{]/)
 837                 {
 838                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 839                         defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
 840                         || do { pos $$textref = $startpos; return };
 841                 }
 842                 else
 843                 {
 844                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 845                         || do { pos $$textref = $startpos; return };
 846                 }
 847                 $rd2pos = pos($$textref)-1;
 848         }
 849         else
 850         {
 851                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 852         }
 853
 854         $modpos = pos $$textref;
 855
 856         $$textref =~ m/\G($mods{$op})/gc;
 857         my $endpos = pos $$textref;
 858
 859         return (
 860                 $startpos,      $oppos-$startpos,       # PREFIX
 861                 $oppos,         length($op),            # OPERATOR
 862                 $ld1pos,        1,                      # LEFT DEL
 863                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 864                 $rd1pos,        1,                      # RIGHT DEL
 865                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 866                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 867                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 868                 $modpos,        $endpos-$modpos,        # MODIFIERS
 869                 $endpos,        $textlen-$endpos,       # REMAINDER
 870                );
 871 }
 872
 873 my $def_func =
 874 [
 875         sub { extract_variable($_[0], '') },
 876         sub { extract_quotelike($_[0],'') },
 877         sub { extract_codeblock($_[0],'{}','') },
 878 ];
 879
 880 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 881 {
 882         my $textref = defined($_[0]) ? \$_[0] : \$_;
 883         my $posbug = pos;
 884         my ($lastpos, $firstpos);
 885         my @fields = ();
 886
 887         #for ($$textref)
 888         {
 889                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 890                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 891                 my $igunk = $_[3];
 892
 893                 pos $$textref ||= 0;
 894
 895                 unless (wantarray)
 896                 {
 897                         carp ("extract_multiple reset maximal count to 1 in scalar context")
 898                                 if $^W && defined($_[2]) && $max > 1;
 899                         $max = 1
 900                 }
 901
 902                 my $unkpos;
 903                 my $func;
 904                 my $class;
 905
 906                 my @class;
 907                 foreach $func ( @func )
 908                 {
 909                         if (ref($func) eq 'HASH')
 910                         {
 911                                 push @class, (keys %$func)[0];
 912                                 $func = (values %$func)[0];
 913                         }
 914                         else
 915                         {
 916                                 push @class, undef;
 917                         }
 918                 }
 919
 920                 FIELD: while (pos($$textref) < length($$textref))
 921                 {
 922                         my ($field, $rem);
 923                         my @bits;
 924                         foreach my $i ( 0..$#func )
 925                         {
 926                                 my $pref;
 927                                 $func = $func[$i];
 928                                 $class = $class[$i];
 929                                 $lastpos = pos $$textref;
 930                                 if (ref($func) eq 'CODE')
 931                                         { ($field,$rem,$pref) = @bits = $func->($$textref);
 932                                         # print "[$field|$rem]" if $field;
 933                                         }
 934                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 935                                         { @bits = $field = $func->extract($$textref) }
 936                                 elsif( $$textref =~ m/\G$func/gc )
 937                                         { @bits = $field = defined($1) ? $1 : $& }
 938                                 $pref ||= "";
 939                                 if (defined($field) && length($field))
 940                                 {
 941                                         if (!$igunk) {
 942                                                 $unkpos = $lastpos
 943                                                         if length($pref) && !defined($unkpos);
 944                                                 if (defined $unkpos)
 945                                                 {
 946                                                         push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 947                                                         $firstpos = $unkpos unless defined $firstpos;
 948                                                         undef $unkpos;
 949                                                         last FIELD if @fields == $max;
 950                                                 }
 951                                         }
 952                                         push @fields, $class
 953                                                 ? bless (\$field, $class)
 954                                                 : $field;
 955                                         $firstpos = $lastpos unless defined $firstpos;
 956                                         $lastpos = pos $$textref;
 957                                         last FIELD if @fields == $max;
 958                                         next FIELD;
 959                                 }
 960                         }
 961                         if ($$textref =~ /\G(.)/gcs)
 962                         {
 963                                 $unkpos = pos($$textref)-1
 964                                         unless $igunk || defined $unkpos;
 965                         }
 966                 }
 967
 968                 if (defined $unkpos)
 969                 {
 970                         push @fields, substr($$textref, $unkpos);
 971                         $firstpos = $unkpos unless defined $firstpos;
 972                         $lastpos = length $$textref;
 973                 }
 974                 last;
 975         }
 976
 977         pos $$textref = $lastpos;
 978         return @fields if wantarray;
 979
 980         $firstpos ||= 0;
 981         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 982                pos $$textref = $firstpos };
 983         return $fields[0];
 984 }
 985
 986
 987 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 988 {
 989         my $ldel    = $_[0];
 990         my $rdel    = $_[1];
 991         my $pre     = defined $_[2] ? $_[2] : '\s*';
 992         my %options = defined $_[3] ? %{$_[3]} : ();
 993         my $omode   = defined $options{fail} ? $options{fail} : '';
 994         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 995                     : defined($options{reject})        ? $options{reject}
 996                     :                                    ''
 997                     ;
 998         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 999                     : defined($options{ignore})        ? $options{ignore}
1000                     :                                    ''
1001                     ;
1002
1003         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
1004
1005         my $posbug = pos;
1006         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1007         pos = $posbug;
1008
1009         my $closure = sub
1010         {
1011                 my $textref = defined $_[0] ? \$_[0] : \$_;
1012                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1013
1014                 return _fail(wantarray, $textref) unless @match;
1015                 return _succeed wantarray, $textref,
1016                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
1017                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
1018         };
1019
1020         bless $closure, 'Text::Balanced::Extractor';
1021 }
1022
1023 package Text::Balanced::Extractor;
1024
1025 sub extract($$) # ($self, $text)
1026 {
1027         &{$_[0]}($_[1]);
1028 }
1029
1030 package Text::Balanced::ErrorMsg;
1031
1032 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1033
1034 1;
1035
1036 __END__
1037
1038 =head1 NAME
1039
1040 Text::Balanced - Extract delimited text sequences from strings.
1041
1042
1043 =head1 SYNOPSIS
1044
1045  use Text::Balanced qw (
1046                         extract_delimited
1047                         extract_bracketed
1048                         extract_quotelike
1049                         extract_codeblock
1050                         extract_variable
1051                         extract_tagged
1052                         extract_multiple
1053
1054                         gen_delimited_pat
1055                         gen_extract_tagged
1056                        );
1057
1058  # Extract the initial substring of $text that is delimited by
1059  # two (unescaped) instances of the first character in $delim.
1060
1061         ($extracted, $remainder) = extract_delimited($text,$delim);
1062
1063
1064  # Extract the initial substring of $text that is bracketed
1065  # with a delimiter(s) specified by $delim (where the string
1066  # in $delim contains one or more of '(){}[]<>').
1067
1068         ($extracted, $remainder) = extract_bracketed($text,$delim);
1069
1070
1071  # Extract the initial substring of $text that is bounded by
1072  # an XML tag.
1073
1074         ($extracted, $remainder) = extract_tagged($text);
1075
1076
1077  # Extract the initial substring of $text that is bounded by
1078  # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1079
1080         ($extracted, $remainder) =
1081                 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1082
1083
1084  # Extract the initial substring of $text that represents a
1085  # Perl "quote or quote-like operation"
1086
1087         ($extracted, $remainder) = extract_quotelike($text);
1088
1089
1090  # Extract the initial substring of $text that represents a block
1091  # of Perl code, bracketed by any of character(s) specified by $delim
1092  # (where the string $delim contains one or more of '(){}[]<>').
1093
1094         ($extracted, $remainder) = extract_codeblock($text,$delim);
1095
1096
1097  # Extract the initial substrings of $text that would be extracted by
1098  # one or more sequential applications of the specified functions
1099  # or regular expressions
1100
1101         @extracted = extract_multiple($text,
1102                                       [ \&extract_bracketed,
1103                                         \&extract_quotelike,
1104                                         \&some_other_extractor_sub,
1105                                         qr/[xyz]*/,
1106                                         'literal',
1107                                       ]);
1108
1109 # Create a string representing an optimized pattern (a la Friedl)
1110 # that matches a substring delimited by any of the specified characters
1111 # (in this case: any type of quote or a slash)
1112
1113         $patstring = gen_delimited_pat(q{'"`/});
1114
1115
1116 # Generate a reference to an anonymous sub that is just like extract_tagged
1117 # but pre-compiled and optimized for a specific pair of tags, and consequently
1118 # much faster (i.e. 3 times faster). It uses qr// for better performance on
1119 # repeated calls, so it only works under Perl 5.005 or later.
1120
1121         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1122
1123         ($extracted, $remainder) = $extract_head->($text);
1124
1125
1126 =head1 DESCRIPTION
1127
1128 The various C<extract_...> subroutines may be used to
1129 extract a delimited substring, possibly after skipping a
1130 specified prefix string. By default, that prefix is
1131 optional whitespace (C</\s*/>), but you can change it to whatever
1132 you wish (see below).
1133
1134 The substring to be extracted must appear at the
1135 current C<pos> location of the string's variable
1136 (or at index zero, if no C<pos> position is defined).
1137 In other words, the C<extract_...> subroutines I<don't>
1138 extract the first occurrence of a substring anywhere
1139 in a string (like an unanchored regex would). Rather,
1140 they extract an occurrence of the substring appearing
1141 immediately at the current matching position in the
1142 string (like a C<\G>-anchored regex would).
1143
1144
1145
1146 =head2 General behaviour in list contexts
1147
1148 In a list context, all the subroutines return a list, the first three
1149 elements of which are always:
1150
1151 =over 4
1152
1153 =item [0]
1154
1155 The extracted string, including the specified delimiters.
1156 If the extraction fails an empty string is returned.
1157
1158 =item [1]
1159
1160 The remainder of the input string (i.e. the characters after the
1161 extracted string). On failure, the entire string is returned.
1162
1163 =item [2]
1164
1165 The skipped prefix (i.e. the characters before the extracted string).
1166 On failure, the empty string is returned.
1167
1168 =back
1169
1170 Note that in a list context, the contents of the original input text (the first
1171 argument) are not modified in any way.
1172
1173 However, if the input text was passed in a variable, that variable's
1174 C<pos> value is updated to point at the first character after the
1175 extracted text. That means that in a list context the various
1176 subroutines can be used much like regular expressions. For example:
1177
1178         while ( $next = (extract_quotelike($text))[0] )
1179         {
1180                 # process next quote-like (in $next)
1181         }
1182
1183
1184 =head2 General behaviour in scalar and void contexts
1185
1186 In a scalar context, the extracted string is returned, having first been
1187 removed from the input text. Thus, the following code also processes
1188 each quote-like operation, but actually removes them from $text:
1189
1190         while ( $next = extract_quotelike($text) )
1191         {
1192                 # process next quote-like (in $next)
1193         }
1194
1195 Note that if the input text is a read-only string (i.e. a literal),
1196 no attempt is made to remove the extracted text.
1197
1198 In a void context the behaviour of the extraction subroutines is
1199 exactly the same as in a scalar context, except (of course) that the
1200 extracted substring is not returned.
1201
1202 =head2 A note about prefixes
1203
1204 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1205 This can bite you if you're expecting a prefix specification like
1206 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1207 pattern will only succeed if the <H1> tag is on the current line, since
1208 . normally doesn't match newlines.
1209
1210 To overcome this limitation, you need to turn on /s matching within
1211 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1212
1213
1214 =head2 C<extract_delimited>
1215
1216 The C<extract_delimited> function formalizes the common idiom
1217 of extracting a single-character-delimited substring from the start of
1218 a string. For example, to extract a single-quote delimited string, the
1219 following code is typically used:
1220
1221         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1222         $extracted = $1;
1223
1224 but with C<extract_delimited> it can be simplified to:
1225
1226         ($extracted,$remainder) = extract_delimited($text, "'");
1227
1228 C<extract_delimited> takes up to four scalars (the input text, the
1229 delimiters, a prefix pattern to be skipped, and any escape characters)
1230 and extracts the initial substring of the text that
1231 is appropriately delimited. If the delimiter string has multiple
1232 characters, the first one encountered in the text is taken to delimit
1233 the substring.
1234 The third argument specifies a prefix pattern that is to be skipped
1235 (but must be present!) before the substring is extracted.
1236 The final argument specifies the escape character to be used for each
1237 delimiter.
1238
1239 All arguments are optional. If the escape characters are not specified,
1240 every delimiter is escaped with a backslash (C<\>).
1241 If the prefix is not specified, the
1242 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1243 is also not specified, the set C</["'`]/> is used. If the text to be processed
1244 is not specified either, C<$_> is used.
1245
1246 In list context, C<extract_delimited> returns a array of three
1247 elements, the extracted substring (I<including the surrounding
1248 delimiters>), the remainder of the text, and the skipped prefix (if
1249 any). If a suitable delimited substring is not found, the first
1250 element of the array is the empty string, the second is the complete
1251 original text, and the prefix returned in the third element is an
1252 empty string.
1253
1254 In a scalar context, just the extracted substring is returned. In
1255 a void context, the extracted substring (and any prefix) are simply
1256 removed from the beginning of the first argument.
1257
1258 Examples:
1259
1260         # Remove a single-quoted substring from the very beginning of $text:
1261
1262                 $substring = extract_delimited($text, "'", '');
1263
1264         # Remove a single-quoted Pascalish substring (i.e. one in which
1265         # doubling the quote character escapes it) from the very
1266         # beginning of $text:
1267
1268                 $substring = extract_delimited($text, "'", '', "'");
1269
1270         # Extract a single- or double- quoted substring from the
1271         # beginning of $text, optionally after some whitespace
1272         # (note the list context to protect $text from modification):
1273
1274                 ($substring) = extract_delimited $text, q{"'};
1275
1276
1277         # Delete the substring delimited by the first '/' in $text:
1278
1279                 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1280
1281 Note that this last example is I<not> the same as deleting the first
1282 quote-like pattern. For instance, if C<$text> contained the string:
1283
1284         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1285
1286 then after the deletion it would contain:
1287
1288         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1289
1290 not:
1291
1292         "if ('./cmd' =~ ms) { $cmd = $1; }"
1293
1294
1295 See L<"extract_quotelike"> for a (partial) solution to this problem.
1296
1297
1298 =head2 C<extract_bracketed>
1299
1300 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1301 up to three optional scalar arguments: a string to extract from, a delimiter
1302 specifier, and a prefix pattern. As before, a missing prefix defaults to
1303 optional whitespace and a missing text defaults to C<$_>. However, a missing
1304 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1305
1306 C<extract_bracketed> extracts a balanced-bracket-delimited
1307 substring (using any one (or more) of the user-specified delimiter
1308 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1309 respect quoted unbalanced brackets (see below).
1310
1311 A "delimiter bracket" is a bracket in list of delimiters passed as
1312 C<extract_bracketed>'s second argument. Delimiter brackets are
1313 specified by giving either the left or right (or both!) versions
1314 of the required bracket(s). Note that the order in which
1315 two or more delimiter brackets are specified is not significant.
1316
1317 A "balanced-bracket-delimited substring" is a substring bounded by
1318 matched brackets, such that any other (left or right) delimiter
1319 bracket I<within> the substring is also matched by an opposite
1320 (right or left) delimiter bracket I<at the same level of nesting>. Any
1321 type of bracket not in the delimiter list is treated as an ordinary
1322 character.
1323
1324 In other words, each type of bracket specified as a delimiter must be
1325 balanced and correctly nested within the substring, and any other kind of
1326 ("non-delimiter") bracket in the substring is ignored.
1327
1328 For example, given the string:
1329
1330         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1331
1332 then a call to C<extract_bracketed> in a list context:
1333
1334         @result = extract_bracketed( $text, '{}' );
1335
1336 would return:
1337
1338         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1339
1340 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1341 (In a scalar context just the first element of the array would be returned. In
1342 a void context, C<$text> would be replaced by an empty string.)
1343
1344 Likewise the call in:
1345
1346         @result = extract_bracketed( $text, '{[' );
1347
1348 would return the same result, since all sets of both types of specified
1349 delimiter brackets are correctly nested and balanced.
1350
1351 However, the call in:
1352
1353         @result = extract_bracketed( $text, '{([<' );
1354
1355 would fail, returning:
1356
1357         ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1358
1359 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1360 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1361 return an empty string. In a void context, C<$text> would be unchanged.)
1362
1363 Note that the embedded single-quotes in the string don't help in this
1364 case, since they have not been specified as acceptable delimiters and are
1365 therefore treated as non-delimiter characters (and ignored).
1366
1367 However, if a particular species of quote character is included in the
1368 delimiter specification, then that type of quote will be correctly handled.
1369 for example, if C<$text> is:
1370
1371         $text = '<A HREF=">>>>">link</A>';
1372
1373 then
1374
1375         @result = extract_bracketed( $text, '<">' );
1376
1377 returns:
1378
1379         ( '<A HREF=">>>>">', 'link</A>', "" )
1380
1381 as expected. Without the specification of C<"> as an embedded quoter:
1382
1383         @result = extract_bracketed( $text, '<>' );
1384
1385 the result would be:
1386
1387         ( '<A HREF=">', '>>>">link</A>', "" )
1388
1389 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1390 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1391 letter 'q' as a delimiter. Hence:
1392
1393         @result = extract_bracketed( $text, '<q>' );
1394
1395 would correctly match something like this:
1396
1397         $text = '<leftop: conj /and/ conj>';
1398
1399 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1400
1401
1402 =head2 C<extract_variable>
1403
1404 C<extract_variable> extracts any valid Perl variable or
1405 variable-involved expression, including scalars, arrays, hashes, array
1406 accesses, hash look-ups, method calls through objects, subroutine calls
1407 through subroutine references, etc.
1408
1409 The subroutine takes up to two optional arguments:
1410
1411 =over 4
1412
1413 =item 1.
1414
1415 A string to be processed (C<$_> if the string is omitted or C<undef>)
1416
1417 =item 2.
1418
1419 A string specifying a pattern to be matched as a prefix (which is to be
1420 skipped). If omitted, optional whitespace is skipped.
1421
1422 =back
1423
1424 On success in a list context, an array of 3 elements is returned. The
1425 elements are:
1426
1427 =over 4
1428
1429 =item [0]
1430
1431 the extracted variable, or variablish expression
1432
1433 =item [1]
1434
1435 the remainder of the input text,
1436
1437 =item [2]
1438
1439 the prefix substring (if any),
1440
1441 =back
1442
1443 On failure, all of these values (except the remaining text) are C<undef>.
1444
1445 In a scalar context, C<extract_variable> returns just the complete
1446 substring that matched a variablish expression. C<undef> is returned on
1447 failure. In addition, the original input text has the returned substring
1448 (and any prefix) removed from it.
1449
1450 In a void context, the input text just has the matched substring (and
1451 any specified prefix) removed.
1452
1453
1454 =head2 C<extract_tagged>
1455
1456 C<extract_tagged> extracts and segments text between (balanced)
1457 specified tags.
1458
1459 The subroutine takes up to five optional arguments:
1460
1461 =over 4
1462
1463 =item 1.
1464
1465 A string to be processed (C<$_> if the string is omitted or C<undef>)
1466
1467 =item 2.
1468
1469 A string specifying a pattern to be matched as the opening tag.
1470 If the pattern string is omitted (or C<undef>) then a pattern
1471 that matches any standard XML tag is used.
1472
1473 =item 3.
1474
1475 A string specifying a pattern to be matched at the closing tag.
1476 If the pattern string is omitted (or C<undef>) then the closing
1477 tag is constructed by inserting a C</> after any leading bracket
1478 characters in the actual opening tag that was matched (I<not> the pattern
1479 that matched the tag). For example, if the opening tag pattern
1480 is specified as C<'{{\w+}}'> and actually matched the opening tag
1481 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1482
1483 =item 4.
1484
1485 A string specifying a pattern to be matched as a prefix (which is to be
1486 skipped). If omitted, optional whitespace is skipped.
1487
1488 =item 5.
1489
1490 A hash reference containing various parsing options (see below)
1491
1492 =back
1493
1494 The various options that can be specified are:
1495
1496 =over 4
1497
1498 =item C<reject =E<gt> $listref>
1499
1500 The list reference contains one or more strings specifying patterns
1501 that must I<not> appear within the tagged text.
1502
1503 For example, to extract
1504 an HTML link (which should not contain nested links) use:
1505
1506         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1507
1508 =item C<ignore =E<gt> $listref>
1509
1510 The list reference contains one or more strings specifying patterns
1511 that are I<not> be be treated as nested tags within the tagged text
1512 (even if they would match the start tag pattern).
1513
1514 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1515
1516         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1517
1518 (also see L<"gen_delimited_pat"> below).
1519
1520
1521 =item C<fail =E<gt> $str>
1522
1523 The C<fail> option indicates the action to be taken if a matching end
1524 tag is not encountered (i.e. before the end of the string or some
1525 C<reject> pattern matches). By default, a failure to match a closing
1526 tag causes C<extract_tagged> to immediately fail.
1527
1528 However, if the string value associated with <reject> is "MAX", then
1529 C<extract_tagged> returns the complete text up to the point of failure.
1530 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1531 after the tag (up to the first line that is either empty or contains
1532 only whitespace characters).
1533 If the string is "", the the default behaviour (i.e. failure) is reinstated.
1534
1535 For example, suppose the start tag "/para" introduces a paragraph, which then
1536 continues until the next "/endpara" tag or until another "/para" tag is
1537 encountered:
1538
1539         $text = "/para line 1\n\nline 3\n/para line 4";
1540
1541         extract_tagged($text, '/para', '/endpara', undef,
1542                                 {reject => '/para', fail => MAX );
1543
1544         # EXTRACTED: "/para line 1\n\nline 3\n"
1545
1546 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1547 tag refers only to the immediately following paragraph:
1548
1549         $text = "/para line 1\n\nline 3\n/para line 4";
1550
1551         extract_tagged($text, '/para', '/endpara', undef,
1552                         {reject => '/para', fail => MAX );
1553
1554         # EXTRACTED: "/para line 1\n"
1555
1556 Note that the specified C<fail> behaviour applies to nested tags as well.
1557
1558 =back
1559
1560 On success in a list context, an array of 6 elements is returned. The elements are:
1561
1562 =over 4
1563
1564 =item [0]
1565
1566 the extracted tagged substring (including the outermost tags),
1567
1568 =item [1]
1569
1570 the remainder of the input text,
1571
1572 =item [2]
1573
1574 the prefix substring (if any),
1575
1576 =item [3]
1577
1578 the opening tag
1579
1580 =item [4]
1581
1582 the text between the opening and closing tags
1583
1584 =item [5]
1585
1586 the closing tag (or "" if no closing tag was found)
1587
1588 =back
1589
1590 On failure, all of these values (except the remaining text) are C<undef>.
1591
1592 In a scalar context, C<extract_tagged> returns just the complete
1593 substring that matched a tagged text (including the start and end
1594 tags). C<undef> is returned on failure. In addition, the original input
1595 text has the returned substring (and any prefix) removed from it.
1596
1597 In a void context, the input text just has the matched substring (and
1598 any specified prefix) removed.
1599
1600
1601 =head2 C<gen_extract_tagged>
1602
1603 (Note: This subroutine is only available under Perl5.005)
1604
1605 C<gen_extract_tagged> generates a new anonymous subroutine which
1606 extracts text between (balanced) specified tags. In other words,
1607 it generates a function identical in function to C<extract_tagged>.
1608
1609 The difference between C<extract_tagged> and the anonymous
1610 subroutines generated by
1611 C<gen_extract_tagged>, is that those generated subroutines:
1612
1613 =over 4
1614
1615 =item *
1616
1617 do not have to reparse tag specification or parsing options every time
1618 they are called (whereas C<extract_tagged> has to effectively rebuild
1619 its tag parser on every call);
1620
1621 =item *
1622
1623 make use of the new qr// construct to pre-compile the regexes they use
1624 (whereas C<extract_tagged> uses standard string variable interpolation
1625 to create tag-matching patterns).
1626
1627 =back
1628
1629 The subroutine takes up to four optional arguments (the same set as
1630 C<extract_tagged> except for the string to be processed). It returns
1631 a reference to a subroutine which in turn takes a single argument (the text to
1632 be extracted from).
1633
1634 In other words, the implementation of C<extract_tagged> is exactly
1635 equivalent to:
1636
1637         sub extract_tagged
1638         {
1639                 my $text = shift;
1640                 $extractor = gen_extract_tagged(@_);
1641                 return $extractor->($text);
1642         }
1643
1644 (although C<extract_tagged> is not currently implemented that way, in order
1645 to preserve pre-5.005 compatibility).
1646
1647 Using C<gen_extract_tagged> to create extraction functions for specific tags
1648 is a good idea if those functions are going to be called more than once, since
1649 their performance is typically twice as good as the more general-purpose
1650 C<extract_tagged>.
1651
1652
1653 =head2 C<extract_quotelike>
1654
1655 C<extract_quotelike> attempts to recognize, extract, and segment any
1656 one of the various Perl quotes and quotelike operators (see
1657 L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1658 delimiters (for the quotelike operators), and trailing modifiers are
1659 all caught. For example, in:
1660
1661         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1662
1663         extract_quotelike '  "You said, \"Use sed\"."  '
1664
1665         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1666
1667         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1668
1669 the full Perl quotelike operations are all extracted correctly.
1670
1671 Note too that, when using the /x modifier on a regex, any comment
1672 containing the current pattern delimiter will cause the regex to be
1673 immediately terminated. In other words:
1674
1675         'm /
1676                 (?i)            # CASE INSENSITIVE
1677                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1678                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1679            /x'
1680
1681 will be extracted as if it were:
1682
1683         'm /
1684                 (?i)            # CASE INSENSITIVE
1685                 [a-z_]          # LEADING ALPHABETIC/'
1686
1687 This behaviour is identical to that of the actual compiler.
1688
1689 C<extract_quotelike> takes two arguments: the text to be processed and
1690 a prefix to be matched at the very beginning of the text. If no prefix
1691 is specified, optional whitespace is the default. If no text is given,
1692 C<$_> is used.
1693
1694 In a list context, an array of 11 elements is returned. The elements are:
1695
1696 =over 4
1697
1698 =item [0]
1699
1700 the extracted quotelike substring (including trailing modifiers),
1701
1702 =item [1]
1703
1704 the remainder of the input text,
1705
1706 =item [2]
1707
1708 the prefix substring (if any),
1709
1710 =item [3]
1711
1712 the name of the quotelike operator (if any),
1713
1714 =item [4]
1715
1716 the left delimiter of the first block of the operation,
1717
1718 =item [5]
1719
1720 the text of the first block of the operation
1721 (that is, the contents of
1722 a quote, the regex of a match or substitution or the target list of a
1723 translation),
1724
1725 =item [6]
1726
1727 the right delimiter of the first block of the operation,
1728
1729 =item [7]
1730
1731 the left delimiter of the second block of the operation
1732 (that is, if it is a C<s>, C<tr>, or C<y>),
1733
1734 =item [8]
1735
1736 the text of the second block of the operation
1737 (that is, the replacement of a substitution or the translation list
1738 of a translation),
1739
1740 =item [9]
1741
1742 the right delimiter of the second block of the operation (if any),
1743
1744 =item [10]
1745
1746 the trailing modifiers on the operation (if any).
1747
1748 =back
1749
1750 For each of the fields marked "(if any)" the default value on success is
1751 an empty string.
1752 On failure, all of these values (except the remaining text) are C<undef>.
1753
1754
1755 In a scalar context, C<extract_quotelike> returns just the complete substring
1756 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1757 void context, the input text has the same substring (and any specified
1758 prefix) removed.
1759
1760 Examples:
1761
1762         # Remove the first quotelike literal that appears in text
1763
1764                 $quotelike = extract_quotelike($text,'.*?');
1765
1766         # Replace one or more leading whitespace-separated quotelike
1767         # literals in $_ with "<QLL>"
1768
1769                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1770
1771
1772         # Isolate the search pattern in a quotelike operation from $text
1773
1774                 ($op,$pat) = (extract_quotelike $text)[3,5];
1775                 if ($op =~ /[ms]/)
1776                 {
1777                         print "search pattern: $pat\n";
1778                 }
1779                 else
1780                 {
1781                         print "$op is not a pattern matching operation\n";
1782                 }
1783
1784
1785 =head2 C<extract_quotelike> and "here documents"
1786
1787 C<extract_quotelike> can successfully extract "here documents" from an input
1788 string, but with an important caveat in list contexts.
1789
1790 Unlike other types of quote-like literals, a here document is rarely
1791 a contiguous substring. For example, a typical piece of code using
1792 here document might look like this:
1793
1794         <<'EOMSG' || die;
1795         This is the message.
1796         EOMSG
1797         exit;
1798
1799 Given this as an input string in a scalar context, C<extract_quotelike>
1800 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1801 leaving the string " || die;\nexit;" in the original variable. In other words,
1802 the two separate pieces of the here document are successfully extracted and
1803 concatenated.
1804
1805 In a list context, C<extract_quotelike> would return the list
1806
1807 =over 4
1808
1809 =item [0]
1810
1811 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1812 including fore and aft delimiters),
1813
1814 =item [1]
1815
1816 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1817
1818 =item [2]
1819
1820 "" (i.e. the prefix substring -- trivial in this case),
1821
1822 =item [3]
1823
1824 "<<" (i.e. the "name" of the quotelike operator)
1825
1826 =item [4]
1827
1828 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1829
1830 =item [5]
1831
1832 "This is the message.\n" (i.e. the text of the here document),
1833
1834 =item [6]
1835
1836 "EOMSG" (i.e. the right delimiter of the here document),
1837
1838 =item [7..10]
1839
1840 "" (a here document has no second left delimiter, second text, second right
1841 delimiter, or trailing modifiers).
1842
1843 =back
1844
1845 However, the matching position of the input variable would be set to
1846 "exit;" (i.e. I<after> the closing delimiter of the here document),
1847 which would cause the earlier " || die;\nexit;" to be skipped in any
1848 sequence of code fragment extractions.
1849
1850 To avoid this problem, when it encounters a here document whilst
1851 extracting from a modifiable string, C<extract_quotelike> silently
1852 rearranges the string to an equivalent piece of Perl:
1853
1854         <<'EOMSG'
1855         This is the message.
1856         EOMSG
1857         || die;
1858         exit;
1859
1860 in which the here document I<is> contiguous. It still leaves the
1861 matching position after the here document, but now the rest of the line
1862 on which the here document starts is not skipped.
1863
1864 To prevent <extract_quotelike> from mucking about with the input in this way
1865 (this is the only case where a list-context C<extract_quotelike> does so),
1866 you can pass the input variable as an interpolated literal:
1867
1868         $quotelike = extract_quotelike("$var");
1869
1870
1871 =head2 C<extract_codeblock>
1872
1873 C<extract_codeblock> attempts to recognize and extract a balanced
1874 bracket delimited substring that may contain unbalanced brackets
1875 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1876 is like a combination of C<"extract_bracketed"> and
1877 C<"extract_quotelike">.
1878
1879 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1880 a text to process, a set of delimiter brackets to look for, and a prefix to
1881 match first. It also takes an optional fourth parameter, which allows the
1882 outermost delimiter brackets to be specified separately (see below).
1883
1884 Omitting the first argument (input text) means process C<$_> instead.
1885 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1886 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1887 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1888 value of the second argument is to be used for the outermost delimiters.
1889
1890 Once the prefix an dthe outermost opening delimiter bracket have been
1891 recognized, code blocks are extracted by stepping through the input text and
1892 trying the following alternatives in sequence:
1893
1894 =over 4
1895
1896 =item 1.
1897
1898 Try and match a closing delimiter bracket. If the bracket was the same
1899 species as the last opening bracket, return the substring to that
1900 point. If the bracket was mismatched, return an error.
1901
1902 =item 2.
1903
1904 Try to match a quote or quotelike operator. If found, call
1905 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1906 the error it returned. Otherwise go back to step 1.
1907
1908 =item 3.
1909
1910 Try to match an opening delimiter bracket. If found, call
1911 C<extract_codeblock> recursively to eat the embedded block. If the
1912 recursive call fails, return an error. Otherwise, go back to step 1.
1913
1914 =item 4.
1915
1916 Unconditionally match a bareword or any other single character, and
1917 then go back to step 1.
1918
1919 =back
1920
1921
1922 Examples:
1923
1924         # Find a while loop in the text
1925
1926                 if ($text =~ s/.*?while\s*\{/{/)
1927                 {
1928                         $loop = "while " . extract_codeblock($text);
1929                 }
1930
1931         # Remove the first round-bracketed list (which may include
1932         # round- or curly-bracketed code blocks or quotelike operators)
1933
1934                 extract_codeblock $text, "(){}", '[^(]*';
1935
1936
1937 The ability to specify a different outermost delimiter bracket is useful
1938 in some circumstances. For example, in the Parse::RecDescent module,
1939 parser actions which are to be performed only on a successful parse
1940 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1941
1942         sentence: subject verb object
1943                         <defer: {$::theVerb = $item{verb}} >
1944
1945 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1946 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1947
1948 A deferred action like this:
1949
1950                         <defer: {if ($count>10) {$count--}} >
1951
1952 will be incorrectly parsed as:
1953
1954                         <defer: {if ($count>
1955
1956 because the "less than" operator is interpreted as a closing delimiter.
1957
1958 But, by extracting the directive using
1959 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1960 the '>' character is only treated as a delimited at the outermost
1961 level of the code block, so the directive is parsed correctly.
1962
1963 =head2 C<extract_multiple>
1964
1965 The C<extract_multiple> subroutine takes a string to be processed and a
1966 list of extractors (subroutines or regular expressions) to apply to that string.
1967
1968 In an array context C<extract_multiple> returns an array of substrings
1969 of the original string, as extracted by the specified extractors.
1970 In a scalar context, C<extract_multiple> returns the first
1971 substring successfully extracted from the original string. In both
1972 scalar and void contexts the original string has the first successfully
1973 extracted substring removed from it. In all contexts
1974 C<extract_multiple> starts at the current C<pos> of the string, and
1975 sets that C<pos> appropriately after it matches.
1976
1977 Hence, the aim of of a call to C<extract_multiple> in a list context
1978 is to split the processed string into as many non-overlapping fields as
1979 possible, by repeatedly applying each of the specified extractors
1980 to the remainder of the string. Thus C<extract_multiple> is
1981 a generalized form of Perl's C<split> subroutine.
1982
1983 The subroutine takes up to four optional arguments:
1984
1985 =over 4
1986
1987 =item 1.
1988
1989 A string to be processed (C<$_> if the string is omitted or C<undef>)
1990
1991 =item 2.
1992
1993 A reference to a list of subroutine references and/or qr// objects and/or
1994 literal strings and/or hash references, specifying the extractors
1995 to be used to split the string. If this argument is omitted (or
1996 C<undef>) the list:
1997
1998         [
1999                 sub { extract_variable($_[0], '') },
2000                 sub { extract_quotelike($_[0],'') },
2001                 sub { extract_codeblock($_[0],'{}','') },
2002         ]
2003
2004 is used.
2005
2006
2007 =item 3.
2008
2009 An number specifying the maximum number of fields to return. If this
2010 argument is omitted (or C<undef>), split continues as long as possible.
2011
2012 If the third argument is I<N>, then extraction continues until I<N> fields
2013 have been successfully extracted, or until the string has been completely
2014 processed.
2015
2016 Note that in scalar and void contexts the value of this argument is
2017 automatically reset to 1 (under C<-w>, a warning is issued if the argument
2018 has to be reset).
2019
2020 =item 4.
2021
2022 A value indicating whether unmatched substrings (see below) within the
2023 text should be skipped or returned as fields. If the value is true,
2024 such substrings are skipped. Otherwise, they are returned.
2025
2026 =back
2027
2028 The extraction process works by applying each extractor in
2029 sequence to the text string.
2030
2031 If the extractor is a subroutine it is called in a list context and is
2032 expected to return a list of a single element, namely the extracted
2033 text. It may optionally also return two further arguments: a string
2034 representing the text left after extraction (like $' for a pattern
2035 match), and a string representing any prefix skipped before the
2036 extraction (like $` in a pattern match). Note that this is designed
2037 to facilitate the use of other Text::Balanced subroutines with
2038 C<extract_multiple>. Note too that the value returned by an extractor
2039 subroutine need not bear any relationship to the corresponding substring
2040 of the original text (see examples below).
2041
2042 If the extractor is a precompiled regular expression or a string,
2043 it is matched against the text in a scalar context with a leading
2044 '\G' and the gc modifiers enabled. The extracted value is either
2045 $1 if that variable is defined after the match, or else the
2046 complete match (i.e. $&).
2047
2048 If the extractor is a hash reference, it must contain exactly one element.
2049 The value of that element is one of the
2050 above extractor types (subroutine reference, regular expression, or string).
2051 The key of that element is the name of a class into which the successful
2052 return value of the extractor will be blessed.
2053
2054 If an extractor returns a defined value, that value is immediately
2055 treated as the next extracted field and pushed onto the list of fields.
2056 If the extractor was specified in a hash reference, the field is also
2057 blessed into the appropriate class,
2058
2059 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2060 assumed to have failed to extract.
2061 If none of the extractor subroutines succeeds, then one
2062 character is extracted from the start of the text and the extraction
2063 subroutines reapplied. Characters which are thus removed are accumulated and
2064 eventually become the next field (unless the fourth argument is true, in which
2065 case they are discarded).
2066
2067 For example, the following extracts substrings that are valid Perl variables:
2068
2069         @fields = extract_multiple($text,
2070                                    [ sub { extract_variable($_[0]) } ],
2071                                    undef, 1);
2072
2073 This example separates a text into fields which are quote delimited,
2074 curly bracketed, and anything else. The delimited and bracketed
2075 parts are also blessed to identify them (the "anything else" is unblessed):
2076
2077         @fields = extract_multiple($text,
2078                    [
2079                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
2080                         { Brack => sub { extract_bracketed($_[0],'{}') } },
2081                    ]);
2082
2083 This call extracts the next single substring that is a valid Perl quotelike
2084 operator (and removes it from $text):
2085
2086         $quotelike = extract_multiple($text,
2087                                       [
2088                                         sub { extract_quotelike($_[0]) },
2089                                       ], undef, 1);
2090
2091 Finally, here is yet another way to do comma-separated value parsing:
2092
2093         @fields = extract_multiple($csv_text,
2094                                   [
2095                                         sub { extract_delimited($_[0],q{'"}) },
2096                                         qr/([^,]+)(.*)/,
2097                                   ],
2098                                   undef,1);
2099
2100 The list in the second argument means:
2101 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2102 The undef third argument means:
2103 I<"...as many times as possible...">,
2104 and the true value in the fourth argument means
2105 I<"...discarding anything else that appears (i.e. the commas)">.
2106
2107 If you wanted the commas preserved as separate fields (i.e. like split
2108 does if your split pattern has capturing parentheses), you would
2109 just make the last parameter undefined (or remove it).
2110
2111
2112 =head2 C<gen_delimited_pat>
2113
2114 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2115    > builds a Friedl-style optimized regex that matches a string delimited
2116 by any one of the characters in the single argument. For example:
2117
2118         gen_delimited_pat(q{'"})
2119
2120 returns the regex:
2121
2122         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2123
2124 Note that the specified delimiters are automatically quotemeta'd.
2125
2126 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2127 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2128 (which might contain quoted strings):
2129
2130         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2131
2132         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2133
2134
2135 C<gen_delimited_pat> may also be called with an optional second argument,
2136 which specifies the "escape" character(s) to be used for each delimiter.
2137 For example to match a Pascal-style string (where ' is the delimiter
2138 and '' is a literal ' within the string):
2139
2140         gen_delimited_pat(q{'},q{'});
2141
2142 Different escape characters can be specified for different delimiters.
2143 For example, to specify that '/' is the escape for single quotes
2144 and '%' is the escape for double quotes:
2145
2146         gen_delimited_pat(q{'"},q{/%});
2147
2148 If more delimiters than escape chars are specified, the last escape char
2149 is used for the remaining delimiters.
2150 If no escape char is specified for a given specified delimiter, '\' is used.
2151
2152 Note that
2153 C<gen_delimited_pat> was previously called
2154 C<delimited_pat>. That name may still be used, but is now deprecated.
2155
2156
2157 =head1 DIAGNOSTICS
2158
2159 In a list context, all the functions return C<(undef,$original_text)>
2160 on failure. In a scalar context, failure is indicated by returning C<undef>
2161 (in this case the input text is not modified in any way).
2162
2163 In addition, on failure in I<any> context, the C<$@> variable is set.
2164 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2165 below.
2166 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2167 which the error was detected (although not necessarily where it occurred!)
2168 Printing C<$@> directly produces the error message, with the offset appended.
2169 On success, the C<$@> variable is guaranteed to be C<undef>.
2170
2171 The available diagnostics are:
2172
2173 =over 4
2174
2175 =item  C<Did not find a suitable bracket: "%s">
2176
2177 The delimiter provided to C<extract_bracketed> was not one of
2178 C<'()[]E<lt>E<gt>{}'>.
2179
2180 =item  C<Did not find prefix: /%s/>
2181
2182 A non-optional prefix was specified but wasn't found at the start of the text.
2183
2184 =item  C<Did not find opening bracket after prefix: "%s">
2185
2186 C<extract_bracketed> or C<extract_codeblock> was expecting a
2187 particular kind of bracket at the start of the text, and didn't find it.
2188
2189 =item  C<No quotelike operator found after prefix: "%s">
2190
2191 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2192 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2193 it was extracting.
2194
2195 =item  C<Unmatched closing bracket: "%c">
2196
2197 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2198 a closing bracket where none was expected.
2199
2200 =item  C<Unmatched opening bracket(s): "%s">
2201
2202 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2203 out of characters in the text before closing one or more levels of nested
2204 brackets.
2205
2206 =item C<Unmatched embedded quote (%s)>
2207
2208 C<extract_bracketed> attempted to match an embedded quoted substring, but
2209 failed to find a closing quote to match it.
2210
2211 =item C<Did not find closing delimiter to match '%s'>
2212
2213 C<extract_quotelike> was unable to find a closing delimiter to match the
2214 one that opened the quote-like operation.
2215
2216 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2217
2218 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2219 a valid bracket delimiter, but it was the wrong species. This usually
2220 indicates a nesting error, but may indicate incorrect quoting or escaping.
2221
2222 =item  C<No block delimiter found after quotelike "%s">
2223
2224 C<extract_quotelike> or C<extract_codeblock> found one of the
2225 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2226 without a suitable block after it.
2227
2228 =item C<Did not find leading dereferencer>
2229
2230 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2231 a variable, but didn't find any of them.
2232
2233 =item C<Bad identifier after dereferencer>
2234
2235 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2236 character was not followed by a legal Perl identifier.
2237
2238 =item C<Did not find expected opening bracket at %s>
2239
2240 C<extract_codeblock> failed to find any of the outermost opening brackets
2241 that were specified.
2242
2243 =item C<Improperly nested codeblock at %s>
2244
2245 A nested code block was found that started with a delimiter that was specified
2246 as being only to be used as an outermost bracket.
2247
2248 =item  C<Missing second block for quotelike "%s">
2249
2250 C<extract_codeblock> or C<extract_quotelike> found one of the
2251 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2252
2253 =item C<No match found for opening bracket>
2254
2255 C<extract_codeblock> failed to find a closing bracket to match the outermost
2256 opening bracket.
2257
2258 =item C<Did not find opening tag: /%s/>
2259
2260 C<extract_tagged> did not find a suitable opening tag (after any specified
2261 prefix was removed).
2262
2263 =item C<Unable to construct closing tag to match: /%s/>
2264
2265 C<extract_tagged> matched the specified opening tag and tried to
2266 modify the matched text to produce a matching closing tag (because
2267 none was specified). It failed to generate the closing tag, almost
2268 certainly because the opening tag did not start with a
2269 bracket of some kind.
2270
2271 =item C<Found invalid nested tag: %s>
2272
2273 C<extract_tagged> found a nested tag that appeared in the "reject" list
2274 (and the failure mode was not "MAX" or "PARA").
2275
2276 =item C<Found unbalanced nested tag: %s>
2277
2278 C<extract_tagged> found a nested opening tag that was not matched by a
2279 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2280
2281 =item C<Did not find closing tag>
2282
2283 C<extract_tagged> reached the end of the text without finding a closing tag
2284 to match the original opening tag (and the failure mode was not
2285 "MAX" or "PARA").
2286
2287
2288
2289
2290 =back
2291
2292
2293 =head1 AUTHOR
2294
2295 Damian Conway (damian@conway.org)
2296
2297
2298 =head1 BUGS AND IRRITATIONS
2299
2300 There are undoubtedly serious bugs lurking somewhere in this code, if
2301 only because parts of it give the impression of understanding a great deal
2302 more about Perl than they really do.
2303
2304 Bug reports and other feedback are most welcome.
2305
2306
2307 =head1 COPYRIGHT
2308
2309  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2310  This module is free software. It may be used, redistributed
2311      and/or modified under the same terms as Perl itself.