lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  11
  12 $VERSION = '1.97';
  13 @ISA            = qw ( Exporter );
  14
  15 %EXPORT_TAGS    = ( ALL => [ qw(
  16                                 &extract_delimited
  17                                 &extract_bracketed
  18                                 &extract_quotelike
  19                                 &extract_codeblock
  20                                 &extract_variable
  21                                 &extract_tagged
  22                                 &extract_multiple
  23
  24                                 &gen_delimited_pat
  25                                 &gen_extract_tagged
  26
  27                                 &delimited_pat
  28                                ) ] );
  29
  30 Exporter::export_ok_tags('ALL');
  31
  32 # PROTOTYPES
  33
  34 sub _match_bracketed($$$$$$);
  35 sub _match_variable($$);
  36 sub _match_codeblock($$$$$$$);
  37 sub _match_quotelike($$$$);
  38
  39 sub carp {
  40   require Carp; goto &Carp::carp;
  41 }
  42
  43 sub croak {
  44   require Carp; goto &Carp::croak;
  45 }
  46
  47 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  48
  49 sub _failmsg {
  50         my ($message, $pos) = @_;
  51         $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
  52 }
  53
  54 sub _fail
  55 {
  56         my ($wantarray, $textref, $message, $pos) = @_;
  57         _failmsg $message, $pos if $message;
  58         return (undef,$$textref,undef) if $wantarray;
  59         return undef;
  60 }
  61
  62 sub _succeed
  63 {
  64         $@ = undef;
  65         my ($wantarray,$textref) = splice @_, 0, 2;
  66         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  67         my ($startlen, $oppos) = $_[5,6];
  68         my $remainderpos = $_[2];
  69         if ($wantarray)
  70         {
  71                 my @res;
  72                 while (my ($from, $len) = splice @_, 0, 2)
  73                 {
  74                         push @res, substr($$textref,$from,$len);
  75                 }
  76                 if ($extralen) {        # CORRECT FILLET
  77                         my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
  78                         $res[1] = "$extra$res[1]";
  79                         eval { substr($$textref,$remainderpos,0) = $extra;
  80                                substr($$textref,$extrapos,$extralen,"\n")} ;
  81                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  82                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  83                 }
  84                 else {
  85                         pos($$textref) = $remainderpos;             # RESET \G
  86                 }
  87                 return @res;
  88         }
  89         else
  90         {
  91                 my $match = substr($$textref,$_[0],$_[1]);
  92                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  93                 my $extra = $extralen
  94                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
  95                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  96                 pos($$textref) = $_[4];                         # RESET \G
  97                 return $match;
  98         }
  99 }
 100
 101 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
 102
 103 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
 104 {
 105         my ($dels, $escs) = @_;
 106         return "" unless $dels =~ /\S/;
 107         $escs = '\\' unless $escs;
 108         $escs .= substr($escs,-1) x (length($dels)-length($escs));
 109         my @pat = ();
 110         my $i;
 111         for ($i=0; $i<length $dels; $i++)
 112         {
 113                 my $del = quotemeta substr($dels,$i,1);
 114                 my $esc = quotemeta substr($escs,$i,1);
 115                 if ($del eq $esc)
 116                 {
 117                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 118                 }
 119                 else
 120                 {
 121                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 122                 }
 123         }
 124         my $pat = join '|', @pat;
 125         return "(?:$pat)";
 126 }
 127
 128 *delimited_pat = \&gen_delimited_pat;
 129
 130
 131 # THE EXTRACTION FUNCTIONS
 132
 133 sub extract_delimited (;$$$$)
 134 {
 135         my $textref = defined $_[0] ? \$_[0] : \$_;
 136         my $wantarray = wantarray;
 137         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 138         my $pre  = defined $_[2] ? $_[2] : '\s*';
 139         my $esc  = defined $_[3] ? $_[3] : qq{\\};
 140         my $pat = gen_delimited_pat($del, $esc);
 141         my $startpos = pos $$textref || 0;
 142         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 143                 unless $$textref =~ m/\G($pre)($pat)/gc;
 144         my $prelen = length($1);
 145         my $matchpos = $startpos+$prelen;
 146         my $endpos = pos $$textref;
 147         return _succeed $wantarray, $textref,
 148                         $matchpos, $endpos-$matchpos,           # MATCH
 149                         $endpos,   length($$textref)-$endpos,   # REMAINDER
 150                         $startpos, $prelen;                     # PREFIX
 151 }
 152
 153 sub extract_bracketed (;$$$)
 154 {
 155         my $textref = defined $_[0] ? \$_[0] : \$_;
 156         my $ldel = defined $_[1] ? $_[1] : '{([<';
 157         my $pre  = defined $_[2] ? $_[2] : '\s*';
 158         my $wantarray = wantarray;
 159         my $qdel = "";
 160         my $quotelike;
 161         $ldel =~ s/'//g and $qdel .= q{'};
 162         $ldel =~ s/"//g and $qdel .= q{"};
 163         $ldel =~ s/`//g and $qdel .= q{`};
 164         $ldel =~ s/q//g and $quotelike = 1;
 165         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 166         my $rdel = $ldel;
 167         unless ($rdel =~ tr/[({</])}>/)
 168         {
 169                 return _fail $wantarray, $textref,
 170                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 171                              0;
 172         }
 173         my $posbug = pos;
 174         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 175         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 176         pos = $posbug;
 177
 178         my $startpos = pos $$textref || 0;
 179         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 180
 181         return _fail ($wantarray, $textref) unless @match;
 182
 183         return _succeed ( $wantarray, $textref,
 184                           $match[2], $match[5]+2,       # MATCH
 185                           @match[8,9],                  # REMAINDER
 186                           @match[0,1],                  # PREFIX
 187                         );
 188 }
 189
 190 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 191 {
 192         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
 193         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 194         unless ($$textref =~ m/\G$pre/gc)
 195         {
 196                 _failmsg "Did not find prefix: /$pre/", $startpos;
 197                 return;
 198         }
 199
 200         $ldelpos = pos $$textref;
 201
 202         unless ($$textref =~ m/\G($ldel)/gc)
 203         {
 204                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 205                          pos $$textref;
 206                 pos $$textref = $startpos;
 207                 return;
 208         }
 209
 210         my @nesting = ( $1 );
 211         my $textlen = length $$textref;
 212         while (pos $$textref < $textlen)
 213         {
 214                 next if $$textref =~ m/\G\\./gcs;
 215
 216                 if ($$textref =~ m/\G($ldel)/gc)
 217                 {
 218                         push @nesting, $1;
 219                 }
 220                 elsif ($$textref =~ m/\G($rdel)/gc)
 221                 {
 222                         my ($found, $brackettype) = ($1, $1);
 223                         if ($#nesting < 0)
 224                         {
 225                                 _failmsg "Unmatched closing bracket: \"$found\"",
 226                                          pos $$textref;
 227                                 pos $$textref = $startpos;
 228                                 return;
 229                         }
 230                         my $expected = pop(@nesting);
 231                         $expected =~ tr/({[</)}]>/;
 232                         if ($expected ne $brackettype)
 233                         {
 234                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 235                                          pos $$textref;
 236                                 pos $$textref = $startpos;
 237                                 return;
 238                         }
 239                         last if $#nesting < 0;
 240                 }
 241                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 242                 {
 243                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 244                         _failmsg "Unmatched embedded quote ($1)",
 245                                  pos $$textref;
 246                         pos $$textref = $startpos;
 247                         return;
 248                 }
 249                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
 250                 {
 251                         next;
 252                 }
 253
 254                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 255         }
 256         if ($#nesting>=0)
 257         {
 258                 _failmsg "Unmatched opening bracket(s): "
 259                                 . join("..",@nesting)."..",
 260                          pos $$textref;
 261                 pos $$textref = $startpos;
 262                 return;
 263         }
 264
 265         $endpos = pos $$textref;
 266
 267         return (
 268                 $startpos,  $ldelpos-$startpos,         # PREFIX
 269                 $ldelpos,   1,                          # OPENING BRACKET
 270                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
 271                 $endpos-1,  1,                          # CLOSING BRACKET
 272                 $endpos,    length($$textref)-$endpos,  # REMAINDER
 273                );
 274 }
 275
 276 sub _revbracket($)
 277 {
 278         my $brack = reverse $_[0];
 279         $brack =~ tr/[({</])}>/;
 280         return $brack;
 281 }
 282
 283 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 284
 285 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 286 {
 287         my $textref = defined $_[0] ? \$_[0] : \$_;
 288         my $ldel    = $_[1];
 289         my $rdel    = $_[2];
 290         my $pre     = defined $_[3] ? $_[3] : '\s*';
 291         my %options = defined $_[4] ? %{$_[4]} : ();
 292         my $omode   = defined $options{fail} ? $options{fail} : '';
 293         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 294                     : defined($options{reject})        ? $options{reject}
 295                     :                                    ''
 296                     ;
 297         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 298                     : defined($options{ignore})        ? $options{ignore}
 299                     :                                    ''
 300                     ;
 301
 302         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 303         $@ = undef;
 304
 305         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 306
 307         return _fail(wantarray, $textref) unless @match;
 308         return _succeed wantarray, $textref,
 309                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 310                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 311 }
 312
 313 sub _match_tagged       # ($$$$$$$)
 314 {
 315         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 316         my $rdelspec;
 317
 318         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 319
 320         unless ($$textref =~ m/\G($pre)/gc)
 321         {
 322                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 323                 goto failed;
 324         }
 325
 326         $opentagpos = pos($$textref);
 327
 328         unless ($$textref =~ m/\G$ldel/gc)
 329         {
 330                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 331                 goto failed;
 332         }
 333
 334         $textpos = pos($$textref);
 335
 336         if (!defined $rdel)
 337         {
 338                 $rdelspec = $&;
 339                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
 340                 {
 341                         _failmsg "Unable to construct closing tag to match: $rdel",
 342                                  pos $$textref;
 343                         goto failed;
 344                 }
 345         }
 346         else
 347         {
 348                 $rdelspec = eval "qq{$rdel}" || do {
 349                         my $del;
 350                         for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
 351                                 { next if $rdel =~ /\Q$_/; $del = $_; last }
 352                         unless ($del) {
 353                                 croak ("Can't interpolate right delimiter $rdel")
 354                         }
 355                         eval "qq$del$rdel$del";
 356                 };
 357         }
 358
 359         while (pos($$textref) < length($$textref))
 360         {
 361                 next if $$textref =~ m/\G\\./gc;
 362
 363                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 364                 {
 365                         $parapos = pos($$textref) - length($1)
 366                                 unless defined $parapos;
 367                 }
 368                 elsif ($$textref =~ m/\G($rdelspec)/gc )
 369                 {
 370                         $closetagpos = pos($$textref)-length($1);
 371                         goto matched;
 372                 }
 373                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 374                 {
 375                         next;
 376                 }
 377                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 378                 {
 379                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
 380                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 381                         _failmsg "Found invalid nested tag: $1", pos $$textref;
 382                         goto failed;
 383                 }
 384                 elsif ($$textref =~ m/\G($ldel)/gc)
 385                 {
 386                         my $tag = $1;
 387                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
 388                         unless (_match_tagged(@_))      # MATCH NESTED TAG
 389                         {
 390                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
 391                                 _failmsg "Found unbalanced nested tag: $tag",
 392                                          pos $$textref;
 393                                 goto failed;
 394                         }
 395                 }
 396                 else { $$textref =~ m/./gcs }
 397         }
 398
 399 short:
 400         $closetagpos = pos($$textref);
 401         goto matched if $omode eq 'MAX';
 402         goto failed unless $omode eq 'PARA';
 403
 404         if (defined $parapos) { pos($$textref) = $parapos }
 405         else                  { $parapos = pos($$textref) }
 406
 407         return (
 408                 $startpos,    $opentagpos-$startpos,            # PREFIX
 409                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 410                 $textpos,     $parapos-$textpos,                # TEXT
 411                 $parapos,     0,                                # NO CLOSING TAG
 412                 $parapos,     length($$textref)-$parapos,       # REMAINDER
 413                );
 414
 415 matched:
 416         $endpos = pos($$textref);
 417         return (
 418                 $startpos,    $opentagpos-$startpos,            # PREFIX
 419                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 420                 $textpos,     $closetagpos-$textpos,            # TEXT
 421                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
 422                 $endpos,      length($$textref)-$endpos,        # REMAINDER
 423                );
 424
 425 failed:
 426         _failmsg "Did not find closing tag", pos $$textref unless $@;
 427         pos($$textref) = $startpos;
 428         return;
 429 }
 430
 431 sub extract_variable (;$$)
 432 {
 433         my $textref = defined $_[0] ? \$_[0] : \$_;
 434         return ("","","") unless defined $$textref;
 435         my $pre  = defined $_[1] ? $_[1] : '\s*';
 436
 437         my @match = _match_variable($textref,$pre);
 438
 439         return _fail wantarray, $textref unless @match;
 440
 441         return _succeed wantarray, $textref,
 442                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
 443 }
 444
 445 sub _match_variable($$)
 446 {
 447 #  $#
 448 #  $^
 449 #  $$
 450         my ($textref, $pre) = @_;
 451         my $startpos = pos($$textref) = pos($$textref)||0;
 452         unless ($$textref =~ m/\G($pre)/gc)
 453         {
 454                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 455                 return;
 456         }
 457         my $varpos = pos($$textref);
 458         unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
 459         {
 460             unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
 461             {
 462                 _failmsg "Did not find leading dereferencer", pos $$textref;
 463                 pos $$textref = $startpos;
 464                 return;
 465             }
 466             my $deref = $1;
 467
 468             unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 469                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
 470                 or $deref eq '$#' or $deref eq '$$' )
 471             {
 472                 _failmsg "Bad identifier after dereferencer", pos $$textref;
 473                 pos $$textref = $startpos;
 474                 return;
 475             }
 476         }
 477
 478         while (1)
 479         {
 480                 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
 481                 next if _match_codeblock($textref,
 482                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 483                                          qr/[({[]/, qr/[)}\]]/,
 484                                          qr/[({[]/, qr/[)}\]]/, 0);
 485                 next if _match_codeblock($textref,
 486                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
 487                                          qr/[{[]/, qr/[}\]]/, 0);
 488                 next if _match_variable($textref,'\s*->\s*');
 489                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 490                 last;
 491         }
 492
 493         my $endpos = pos($$textref);
 494         return ($startpos, $varpos-$startpos,
 495                 $varpos,   $endpos-$varpos,
 496                 $endpos,   length($$textref)-$endpos
 497                 );
 498 }
 499
 500 sub extract_codeblock (;$$$$$)
 501 {
 502         my $textref = defined $_[0] ? \$_[0] : \$_;
 503         my $wantarray = wantarray;
 504         my $ldel_inner = defined $_[1] ? $_[1] : '{';
 505         my $pre        = defined $_[2] ? $_[2] : '\s*';
 506         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
 507         my $rd         = $_[4];
 508         my $rdel_inner = $ldel_inner;
 509         my $rdel_outer = $ldel_outer;
 510         my $posbug = pos;
 511         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
 512         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
 513         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
 514         {
 515                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
 516         }
 517         pos = $posbug;
 518
 519         my @match = _match_codeblock($textref, $pre,
 520                                      $ldel_outer, $rdel_outer,
 521                                      $ldel_inner, $rdel_inner,
 522                                      $rd);
 523         return _fail($wantarray, $textref) unless @match;
 524         return _succeed($wantarray, $textref,
 525                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
 526                        );
 527
 528 }
 529
 530 sub _match_codeblock($$$$$$$)
 531 {
 532         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 533         my $startpos = pos($$textref) = pos($$textref) || 0;
 534         unless ($$textref =~ m/\G($pre)/gc)
 535         {
 536                 _failmsg qq{Did not match prefix /$pre/ at"} .
 537                             substr($$textref,pos($$textref),20) .
 538                             q{..."},
 539                          pos $$textref;
 540                 return;
 541         }
 542         my $codepos = pos($$textref);
 543         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
 544         {
 545                 _failmsg qq{Did not find expected opening bracket at "} .
 546                              substr($$textref,pos($$textref),20) .
 547                              q{..."},
 548                          pos $$textref;
 549                 pos $$textref = $startpos;
 550                 return;
 551         }
 552         my $closing = $1;
 553            $closing =~ tr/([<{/)]>}/;
 554         my $matched;
 555         my $patvalid = 1;
 556         while (pos($$textref) < length($$textref))
 557         {
 558                 $matched = '';
 559                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 560                 {
 561                         $patvalid = 0;
 562                         next;
 563                 }
 564
 565                 if ($$textref =~ m/\G\s*#.*/gc)
 566                 {
 567                         next;
 568                 }
 569
 570                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 571                 {
 572                         unless ($matched = ($closing && $1 eq $closing) )
 573                         {
 574                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
 575                                 _failmsg q{Mismatched closing bracket at "} .
 576                                              substr($$textref,pos($$textref),20) .
 577                                              qq{...". Expected '$closing'},
 578                                          pos $$textref;
 579                         }
 580                         last;
 581                 }
 582
 583                 if (_match_variable($textref,'\s*') ||
 584                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 585                 {
 586                         $patvalid = 0;
 587                         next;
 588                 }
 589
 590
 591                 # NEED TO COVER MANY MORE CASES HERE!!!
 592                 # NB 'case' is included here, because in Switch.pm,
 593                 # it's followed by a term, not an op
 594
 595                 if ($$textref =~ m#\G\s*(?!$ldel_inner)
 596                                         ( [-+*x/%^&|.]=?
 597                                         | [!=]~
 598                                         | =(?!>)
 599                                         | (\*\*|&&|\|\||<<|>>)=?
 600                                         | case|split|grep|map|return
 601                                         | [([]
 602                                         )#gcx)
 603                 {
 604                         $patvalid = 1;
 605                         next;
 606                 }
 607
 608                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 609                 {
 610                         $patvalid = 1;
 611                         next;
 612                 }
 613
 614                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
 615                 {
 616                         _failmsg q{Improperly nested codeblock at "} .
 617                                      substr($$textref,pos($$textref),20) .
 618                                      q{..."},
 619                                  pos $$textref;
 620                         last;
 621                 }
 622
 623                 $patvalid = 0;
 624                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 625         }
 626         continue { $@ = undef }
 627
 628         unless ($matched)
 629         {
 630                 _failmsg 'No match found for opening bracket', pos $$textref
 631                         unless $@;
 632                 return;
 633         }
 634
 635         my $endpos = pos($$textref);
 636         return ( $startpos, $codepos-$startpos,
 637                  $codepos, $endpos-$codepos,
 638                  $endpos,  length($$textref)-$endpos,
 639                );
 640 }
 641
 642
 643 my %mods   = (
 644                 'none'  => '[cgimsox]*',
 645                 'm'     => '[cgimsox]*',
 646                 's'     => '[cegimsox]*',
 647                 'tr'    => '[cds]*',
 648                 'y'     => '[cds]*',
 649                 'qq'    => '',
 650                 'qx'    => '',
 651                 'qw'    => '',
 652                 'qr'    => '[imsx]*',
 653                 'q'     => '',
 654              );
 655
 656 sub extract_quotelike (;$$)
 657 {
 658         my $textref = $_[0] ? \$_[0] : \$_;
 659         my $wantarray = wantarray;
 660         my $pre  = defined $_[1] ? $_[1] : '\s*';
 661
 662         my @match = _match_quotelike($textref,$pre,1,0);
 663         return _fail($wantarray, $textref) unless @match;
 664         return _succeed($wantarray, $textref,
 665                         $match[2], $match[18]-$match[2],        # MATCH
 666                         @match[18,19],                          # REMAINDER
 667                         @match[0,1],                            # PREFIX
 668                         @match[2..17],                          # THE BITS
 669                         @match[20,21],                          # ANY FILLET?
 670                        );
 671 };
 672
 673 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 674 {
 675         my ($textref, $pre, $rawmatch, $qmark) = @_;
 676
 677         my ($textlen,$startpos,
 678             $oppos,
 679             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 680             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 681             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 682
 683         unless ($$textref =~ m/\G($pre)/gc)
 684         {
 685                 _failmsg qq{Did not find prefix /$pre/ at "} .
 686                              substr($$textref, pos($$textref), 20) .
 687                              q{..."},
 688                          pos $$textref;
 689                 return;
 690         }
 691         $oppos = pos($$textref);
 692
 693         my $initial = substr($$textref,$oppos,1);
 694
 695         if ($initial && $initial =~ m|^[\"\'\`]|
 696                      || $rawmatch && $initial =~ m|^/|
 697                      || $qmark && $initial =~ m|^\?|)
 698         {
 699                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 700                 {
 701                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 702                                      substr($$textref, $oppos, 20) .
 703                                      q{..."},
 704                                  pos $$textref;
 705                         pos $$textref = $startpos;
 706                         return;
 707                 }
 708                 $modpos= pos($$textref);
 709                 $rd1pos = $modpos-1;
 710
 711                 if ($initial eq '/' || $initial eq '?')
 712                 {
 713                         $$textref =~ m/\G$mods{none}/gc
 714                 }
 715
 716                 my $endpos = pos($$textref);
 717                 return (
 718                         $startpos,      $oppos-$startpos,       # PREFIX
 719                         $oppos,         0,                      # NO OPERATOR
 720                         $oppos,         1,                      # LEFT DEL
 721                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 722                         $rd1pos,        1,                      # RIGHT DEL
 723                         $modpos,        0,                      # NO 2ND LDEL
 724                         $modpos,        0,                      # NO 2ND STR
 725                         $modpos,        0,                      # NO 2ND RDEL
 726                         $modpos,        $endpos-$modpos,        # MODIFIERS
 727                         $endpos,        $textlen-$endpos,       # REMAINDER
 728                        );
 729         }
 730
 731         unless ($$textref =~
 732     m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<(?=\s*["'A-Za-z_]))}gc)
 733         {
 734                 _failmsg q{No quotelike operator found after prefix at "} .
 735                              substr($$textref, pos($$textref), 20) .
 736                              q{..."},
 737                          pos $$textref;
 738                 pos $$textref = $startpos;
 739                 return;
 740         }
 741
 742         my $op = $1;
 743         $preld1pos = pos($$textref);
 744         if ($op eq '<<') {
 745                 $ld1pos = pos($$textref);
 746                 my $label;
 747                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 748                         $label = $1;
 749                 }
 750                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 751                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 752                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 753                                      }gcsx) {
 754                         $label = $+;
 755                 }
 756                 else {
 757                         $label = "";
 758                 }
 759                 my $extrapos = pos($$textref);
 760                 $$textref =~ m{.*\n}gc;
 761                 $str1pos = pos($$textref)--;
 762                 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
 763                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 764                                      substr($$textref, $startpos, 20) .
 765                                      q{..."},
 766                                  pos $$textref;
 767                         pos $$textref = $startpos;
 768                         return;
 769                 }
 770                 $rd1pos = pos($$textref);
 771                 $$textref =~ m{\Q$label\E\n}gc;
 772                 $ld2pos = pos($$textref);
 773                 return (
 774                         $startpos,      $oppos-$startpos,       # PREFIX
 775                         $oppos,         length($op),            # OPERATOR
 776                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 777                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 778                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 779                         $ld2pos,        0,                      # NO 2ND LDEL
 780                         $ld2pos,        0,                      # NO 2ND STR
 781                         $ld2pos,        0,                      # NO 2ND RDEL
 782                         $ld2pos,        0,                      # NO MODIFIERS
 783                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 784                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 785                        );
 786         }
 787
 788         $$textref =~ m/\G\s*/gc;
 789         $ld1pos = pos($$textref);
 790         $str1pos = $ld1pos+1;
 791
 792         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 793         {
 794                 _failmsg "No block delimiter found after quotelike $op",
 795                          pos $$textref;
 796                 pos $$textref = $startpos;
 797                 return;
 798         }
 799         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 800         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 801         if ($ldel1 =~ /[[(<{]/)
 802         {
 803                 $rdel1 =~ tr/[({</])}>/;
 804                 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
 805                 || do { pos $$textref = $startpos; return };
 806         }
 807         else
 808         {
 809                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 810                 || do { pos $$textref = $startpos; return };
 811         }
 812         $ld2pos = $rd1pos = pos($$textref)-1;
 813
 814         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 815         if ($second_arg)
 816         {
 817                 my ($ldel2, $rdel2);
 818                 if ($ldel1 =~ /[[(<{]/)
 819                 {
 820                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 821                         {
 822                                 _failmsg "Missing second block for quotelike $op",
 823                                          pos $$textref;
 824                                 pos $$textref = $startpos;
 825                                 return;
 826                         }
 827                         $ldel2 = $rdel2 = "\Q$1";
 828                         $rdel2 =~ tr/[({</])}>/;
 829                 }
 830                 else
 831                 {
 832                         $ldel2 = $rdel2 = $ldel1;
 833                 }
 834                 $str2pos = $ld2pos+1;
 835
 836                 if ($ldel2 =~ /[[(<{]/)
 837                 {
 838                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 839                         defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
 840                         || do { pos $$textref = $startpos; return };
 841                 }
 842                 else
 843                 {
 844                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 845                         || do { pos $$textref = $startpos; return };
 846                 }
 847                 $rd2pos = pos($$textref)-1;
 848         }
 849         else
 850         {
 851                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 852         }
 853
 854         $modpos = pos $$textref;
 855
 856         $$textref =~ m/\G($mods{$op})/gc;
 857         my $endpos = pos $$textref;
 858
 859         return (
 860                 $startpos,      $oppos-$startpos,       # PREFIX
 861                 $oppos,         length($op),            # OPERATOR
 862                 $ld1pos,        1,                      # LEFT DEL
 863                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 864                 $rd1pos,        1,                      # RIGHT DEL
 865                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 866                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 867                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 868                 $modpos,        $endpos-$modpos,        # MODIFIERS
 869                 $endpos,        $textlen-$endpos,       # REMAINDER
 870                );
 871 }
 872
 873 my $def_func =
 874 [
 875         sub { extract_variable($_[0], '') },
 876         sub { extract_quotelike($_[0],'') },
 877         sub { extract_codeblock($_[0],'{}','') },
 878 ];
 879
 880 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 881 {
 882         my $textref = defined($_[0]) ? \$_[0] : \$_;
 883         my $posbug = pos;
 884         my ($lastpos, $firstpos);
 885         my @fields = ();
 886
 887         #for ($$textref)
 888         {
 889                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 890                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 891                 my $igunk = $_[3];
 892
 893                 pos $$textref ||= 0;
 894
 895                 unless (wantarray)
 896                 {
 897                         carp ("extract_multiple reset maximal count to 1 in scalar context")
 898                                 if $^W && defined($_[2]) && $max > 1;
 899                         $max = 1
 900                 }
 901
 902                 my $unkpos;
 903                 my $func;
 904                 my $class;
 905
 906                 my @class;
 907                 foreach $func ( @func )
 908                 {
 909                         if (ref($func) eq 'HASH')
 910                         {
 911                                 push @class, (keys %$func)[0];
 912                                 $func = (values %$func)[0];
 913                         }
 914                         else
 915                         {
 916                                 push @class, undef;
 917                         }
 918                 }
 919
 920                 FIELD: while (pos($$textref) < length($$textref))
 921                 {
 922                         my ($field, $rem);
 923                         my @bits;
 924                         foreach my $i ( 0..$#func )
 925                         {
 926                                 my $pref;
 927                                 $func = $func[$i];
 928                                 $class = $class[$i];
 929                                 $lastpos = pos $$textref;
 930                                 if (ref($func) eq 'CODE')
 931                                         { ($field,$rem,$pref) = @bits = $func->($$textref) }
 932                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 933                                         { @bits = $field = $func->extract($$textref) }
 934                                 elsif( $$textref =~ m/\G$func/gc )
 935                                         { @bits = $field = defined($1) ? $1 : $& }
 936                                 $pref ||= "";
 937                                 if (defined($field) && length($field))
 938                                 {
 939                                         if (!$igunk) {
 940                                                 $unkpos = $lastpos
 941                                                         if length($pref) && !defined($unkpos);
 942                                                 if (defined $unkpos)
 943                                                 {
 944                                                         push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 945                                                         $firstpos = $unkpos unless defined $firstpos;
 946                                                         undef $unkpos;
 947                                                         last FIELD if @fields == $max;
 948                                                 }
 949                                         }
 950                                         push @fields, $class
 951                                                 ? bless (\$field, $class)
 952                                                 : $field;
 953                                         $firstpos = $lastpos unless defined $firstpos;
 954                                         $lastpos = pos $$textref;
 955                                         last FIELD if @fields == $max;
 956                                         next FIELD;
 957                                 }
 958                         }
 959                         if ($$textref =~ /\G(.)/gcs)
 960                         {
 961                                 $unkpos = pos($$textref)-1
 962                                         unless $igunk || defined $unkpos;
 963                         }
 964                 }
 965
 966                 if (defined $unkpos)
 967                 {
 968                         push @fields, substr($$textref, $unkpos);
 969                         $firstpos = $unkpos unless defined $firstpos;
 970                         $lastpos = length $$textref;
 971                 }
 972                 last;
 973         }
 974
 975         pos $$textref = $lastpos;
 976         return @fields if wantarray;
 977
 978         $firstpos ||= 0;
 979         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 980                pos $$textref = $firstpos };
 981         return $fields[0];
 982 }
 983
 984
 985 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 986 {
 987         my $ldel    = $_[0];
 988         my $rdel    = $_[1];
 989         my $pre     = defined $_[2] ? $_[2] : '\s*';
 990         my %options = defined $_[3] ? %{$_[3]} : ();
 991         my $omode   = defined $options{fail} ? $options{fail} : '';
 992         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 993                     : defined($options{reject})        ? $options{reject}
 994                     :                                    ''
 995                     ;
 996         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 997                     : defined($options{ignore})        ? $options{ignore}
 998                     :                                    ''
 999                     ;
1000
1001         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
1002
1003         my $posbug = pos;
1004         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1005         pos = $posbug;
1006
1007         my $closure = sub
1008         {
1009                 my $textref = defined $_[0] ? \$_[0] : \$_;
1010                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1011
1012                 return _fail(wantarray, $textref) unless @match;
1013                 return _succeed wantarray, $textref,
1014                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
1015                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
1016         };
1017
1018         bless $closure, 'Text::Balanced::Extractor';
1019 }
1020
1021 package Text::Balanced::Extractor;
1022
1023 sub extract($$) # ($self, $text)
1024 {
1025         &{$_[0]}($_[1]);
1026 }
1027
1028 package Text::Balanced::ErrorMsg;
1029
1030 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1031
1032 1;
1033
1034 __END__
1035
1036 =head1 NAME
1037
1038 Text::Balanced - Extract delimited text sequences from strings.
1039
1040
1041 =head1 SYNOPSIS
1042
1043  use Text::Balanced qw (
1044                         extract_delimited
1045                         extract_bracketed
1046                         extract_quotelike
1047                         extract_codeblock
1048                         extract_variable
1049                         extract_tagged
1050                         extract_multiple
1051
1052                         gen_delimited_pat
1053                         gen_extract_tagged
1054                        );
1055
1056  # Extract the initial substring of $text that is delimited by
1057  # two (unescaped) instances of the first character in $delim.
1058
1059         ($extracted, $remainder) = extract_delimited($text,$delim);
1060
1061
1062  # Extract the initial substring of $text that is bracketed
1063  # with a delimiter(s) specified by $delim (where the string
1064  # in $delim contains one or more of '(){}[]<>').
1065
1066         ($extracted, $remainder) = extract_bracketed($text,$delim);
1067
1068
1069  # Extract the initial substring of $text that is bounded by
1070  # an XML tag.
1071
1072         ($extracted, $remainder) = extract_tagged($text);
1073
1074
1075  # Extract the initial substring of $text that is bounded by
1076  # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1077
1078         ($extracted, $remainder) =
1079                 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1080
1081
1082  # Extract the initial substring of $text that represents a
1083  # Perl "quote or quote-like operation"
1084
1085         ($extracted, $remainder) = extract_quotelike($text);
1086
1087
1088  # Extract the initial substring of $text that represents a block
1089  # of Perl code, bracketed by any of character(s) specified by $delim
1090  # (where the string $delim contains one or more of '(){}[]<>').
1091
1092         ($extracted, $remainder) = extract_codeblock($text,$delim);
1093
1094
1095  # Extract the initial substrings of $text that would be extracted by
1096  # one or more sequential applications of the specified functions
1097  # or regular expressions
1098
1099         @extracted = extract_multiple($text,
1100                                       [ \&extract_bracketed,
1101                                         \&extract_quotelike,
1102                                         \&some_other_extractor_sub,
1103                                         qr/[xyz]*/,
1104                                         'literal',
1105                                       ]);
1106
1107 # Create a string representing an optimized pattern (a la Friedl)
1108 # that matches a substring delimited by any of the specified characters
1109 # (in this case: any type of quote or a slash)
1110
1111         $patstring = gen_delimited_pat(q{'"`/});
1112
1113
1114 # Generate a reference to an anonymous sub that is just like extract_tagged
1115 # but pre-compiled and optimized for a specific pair of tags, and consequently
1116 # much faster (i.e. 3 times faster). It uses qr// for better performance on
1117 # repeated calls, so it only works under Perl 5.005 or later.
1118
1119         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1120
1121         ($extracted, $remainder) = $extract_head->($text);
1122
1123
1124 =head1 DESCRIPTION
1125
1126 The various C<extract_...> subroutines may be used to
1127 extract a delimited substring, possibly after skipping a
1128 specified prefix string. By default, that prefix is
1129 optional whitespace (C</\s*/>), but you can change it to whatever
1130 you wish (see below).
1131
1132 The substring to be extracted must appear at the
1133 current C<pos> location of the string's variable
1134 (or at index zero, if no C<pos> position is defined).
1135 In other words, the C<extract_...> subroutines I<don't>
1136 extract the first occurrence of a substring anywhere
1137 in a string (like an unanchored regex would). Rather,
1138 they extract an occurrence of the substring appearing
1139 immediately at the current matching position in the
1140 string (like a C<\G>-anchored regex would).
1141
1142
1143
1144 =head2 General behaviour in list contexts
1145
1146 In a list context, all the subroutines return a list, the first three
1147 elements of which are always:
1148
1149 =over 4
1150
1151 =item [0]
1152
1153 The extracted string, including the specified delimiters.
1154 If the extraction fails C<undef> is returned.
1155
1156 =item [1]
1157
1158 The remainder of the input string (i.e. the characters after the
1159 extracted string). On failure, the entire string is returned.
1160
1161 =item [2]
1162
1163 The skipped prefix (i.e. the characters before the extracted string).
1164 On failure, C<undef> is returned.
1165
1166 =back
1167
1168 Note that in a list context, the contents of the original input text (the first
1169 argument) are not modified in any way.
1170
1171 However, if the input text was passed in a variable, that variable's
1172 C<pos> value is updated to point at the first character after the
1173 extracted text. That means that in a list context the various
1174 subroutines can be used much like regular expressions. For example:
1175
1176         while ( $next = (extract_quotelike($text))[0] )
1177         {
1178                 # process next quote-like (in $next)
1179         }
1180
1181
1182 =head2 General behaviour in scalar and void contexts
1183
1184 In a scalar context, the extracted string is returned, having first been
1185 removed from the input text. Thus, the following code also processes
1186 each quote-like operation, but actually removes them from $text:
1187
1188         while ( $next = extract_quotelike($text) )
1189         {
1190                 # process next quote-like (in $next)
1191         }
1192
1193 Note that if the input text is a read-only string (i.e. a literal),
1194 no attempt is made to remove the extracted text.
1195
1196 In a void context the behaviour of the extraction subroutines is
1197 exactly the same as in a scalar context, except (of course) that the
1198 extracted substring is not returned.
1199
1200 =head2 A note about prefixes
1201
1202 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1203 This can bite you if you're expecting a prefix specification like
1204 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1205 pattern will only succeed if the <H1> tag is on the current line, since
1206 . normally doesn't match newlines.
1207
1208 To overcome this limitation, you need to turn on /s matching within
1209 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1210
1211
1212 =head2 C<extract_delimited>
1213
1214 The C<extract_delimited> function formalizes the common idiom
1215 of extracting a single-character-delimited substring from the start of
1216 a string. For example, to extract a single-quote delimited string, the
1217 following code is typically used:
1218
1219         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1220         $extracted = $1;
1221
1222 but with C<extract_delimited> it can be simplified to:
1223
1224         ($extracted,$remainder) = extract_delimited($text, "'");
1225
1226 C<extract_delimited> takes up to four scalars (the input text, the
1227 delimiters, a prefix pattern to be skipped, and any escape characters)
1228 and extracts the initial substring of the text that
1229 is appropriately delimited. If the delimiter string has multiple
1230 characters, the first one encountered in the text is taken to delimit
1231 the substring.
1232 The third argument specifies a prefix pattern that is to be skipped
1233 (but must be present!) before the substring is extracted.
1234 The final argument specifies the escape character to be used for each
1235 delimiter.
1236
1237 All arguments are optional. If the escape characters are not specified,
1238 every delimiter is escaped with a backslash (C<\>).
1239 If the prefix is not specified, the
1240 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1241 is also not specified, the set C</["'`]/> is used. If the text to be processed
1242 is not specified either, C<$_> is used.
1243
1244 In list context, C<extract_delimited> returns a array of three
1245 elements, the extracted substring (I<including the surrounding
1246 delimiters>), the remainder of the text, and the skipped prefix (if
1247 any). If a suitable delimited substring is not found, the first
1248 element of the array is the empty string, the second is the complete
1249 original text, and the prefix returned in the third element is an
1250 empty string.
1251
1252 In a scalar context, just the extracted substring is returned. In
1253 a void context, the extracted substring (and any prefix) are simply
1254 removed from the beginning of the first argument.
1255
1256 Examples:
1257
1258         # Remove a single-quoted substring from the very beginning of $text:
1259
1260                 $substring = extract_delimited($text, "'", '');
1261
1262         # Remove a single-quoted Pascalish substring (i.e. one in which
1263         # doubling the quote character escapes it) from the very
1264         # beginning of $text:
1265
1266                 $substring = extract_delimited($text, "'", '', "'");
1267
1268         # Extract a single- or double- quoted substring from the
1269         # beginning of $text, optionally after some whitespace
1270         # (note the list context to protect $text from modification):
1271
1272                 ($substring) = extract_delimited $text, q{"'};
1273
1274
1275         # Delete the substring delimited by the first '/' in $text:
1276
1277                 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1278
1279 Note that this last example is I<not> the same as deleting the first
1280 quote-like pattern. For instance, if C<$text> contained the string:
1281
1282         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1283
1284 then after the deletion it would contain:
1285
1286         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1287
1288 not:
1289
1290         "if ('./cmd' =~ ms) { $cmd = $1; }"
1291
1292
1293 See L<"extract_quotelike"> for a (partial) solution to this problem.
1294
1295
1296 =head2 C<extract_bracketed>
1297
1298 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1299 up to three optional scalar arguments: a string to extract from, a delimiter
1300 specifier, and a prefix pattern. As before, a missing prefix defaults to
1301 optional whitespace and a missing text defaults to C<$_>. However, a missing
1302 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1303
1304 C<extract_bracketed> extracts a balanced-bracket-delimited
1305 substring (using any one (or more) of the user-specified delimiter
1306 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1307 respect quoted unbalanced brackets (see below).
1308
1309 A "delimiter bracket" is a bracket in list of delimiters passed as
1310 C<extract_bracketed>'s second argument. Delimiter brackets are
1311 specified by giving either the left or right (or both!) versions
1312 of the required bracket(s). Note that the order in which
1313 two or more delimiter brackets are specified is not significant.
1314
1315 A "balanced-bracket-delimited substring" is a substring bounded by
1316 matched brackets, such that any other (left or right) delimiter
1317 bracket I<within> the substring is also matched by an opposite
1318 (right or left) delimiter bracket I<at the same level of nesting>. Any
1319 type of bracket not in the delimiter list is treated as an ordinary
1320 character.
1321
1322 In other words, each type of bracket specified as a delimiter must be
1323 balanced and correctly nested within the substring, and any other kind of
1324 ("non-delimiter") bracket in the substring is ignored.
1325
1326 For example, given the string:
1327
1328         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1329
1330 then a call to C<extract_bracketed> in a list context:
1331
1332         @result = extract_bracketed( $text, '{}' );
1333
1334 would return:
1335
1336         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1337
1338 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1339 (In a scalar context just the first element of the array would be returned. In
1340 a void context, C<$text> would be replaced by an empty string.)
1341
1342 Likewise the call in:
1343
1344         @result = extract_bracketed( $text, '{[' );
1345
1346 would return the same result, since all sets of both types of specified
1347 delimiter brackets are correctly nested and balanced.
1348
1349 However, the call in:
1350
1351         @result = extract_bracketed( $text, '{([<' );
1352
1353 would fail, returning:
1354
1355         ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1356
1357 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1358 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1359 return an empty string. In a void context, C<$text> would be unchanged.)
1360
1361 Note that the embedded single-quotes in the string don't help in this
1362 case, since they have not been specified as acceptable delimiters and are
1363 therefore treated as non-delimiter characters (and ignored).
1364
1365 However, if a particular species of quote character is included in the
1366 delimiter specification, then that type of quote will be correctly handled.
1367 for example, if C<$text> is:
1368
1369         $text = '<A HREF=">>>>">link</A>';
1370
1371 then
1372
1373         @result = extract_bracketed( $text, '<">' );
1374
1375 returns:
1376
1377         ( '<A HREF=">>>>">', 'link</A>', "" )
1378
1379 as expected. Without the specification of C<"> as an embedded quoter:
1380
1381         @result = extract_bracketed( $text, '<>' );
1382
1383 the result would be:
1384
1385         ( '<A HREF=">', '>>>">link</A>', "" )
1386
1387 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1388 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1389 letter 'q' as a delimiter. Hence:
1390
1391         @result = extract_bracketed( $text, '<q>' );
1392
1393 would correctly match something like this:
1394
1395         $text = '<leftop: conj /and/ conj>';
1396
1397 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1398
1399
1400 =head2 C<extract_variable>
1401
1402 C<extract_variable> extracts any valid Perl variable or
1403 variable-involved expression, including scalars, arrays, hashes, array
1404 accesses, hash look-ups, method calls through objects, subroutine calls
1405 through subroutine references, etc.
1406
1407 The subroutine takes up to two optional arguments:
1408
1409 =over 4
1410
1411 =item 1.
1412
1413 A string to be processed (C<$_> if the string is omitted or C<undef>)
1414
1415 =item 2.
1416
1417 A string specifying a pattern to be matched as a prefix (which is to be
1418 skipped). If omitted, optional whitespace is skipped.
1419
1420 =back
1421
1422 On success in a list context, an array of 3 elements is returned. The
1423 elements are:
1424
1425 =over 4
1426
1427 =item [0]
1428
1429 the extracted variable, or variablish expression
1430
1431 =item [1]
1432
1433 the remainder of the input text,
1434
1435 =item [2]
1436
1437 the prefix substring (if any),
1438
1439 =back
1440
1441 On failure, all of these values (except the remaining text) are C<undef>.
1442
1443 In a scalar context, C<extract_variable> returns just the complete
1444 substring that matched a variablish expression. C<undef> is returned on
1445 failure. In addition, the original input text has the returned substring
1446 (and any prefix) removed from it.
1447
1448 In a void context, the input text just has the matched substring (and
1449 any specified prefix) removed.
1450
1451
1452 =head2 C<extract_tagged>
1453
1454 C<extract_tagged> extracts and segments text between (balanced)
1455 specified tags.
1456
1457 The subroutine takes up to five optional arguments:
1458
1459 =over 4
1460
1461 =item 1.
1462
1463 A string to be processed (C<$_> if the string is omitted or C<undef>)
1464
1465 =item 2.
1466
1467 A string specifying a pattern to be matched as the opening tag.
1468 If the pattern string is omitted (or C<undef>) then a pattern
1469 that matches any standard XML tag is used.
1470
1471 =item 3.
1472
1473 A string specifying a pattern to be matched at the closing tag.
1474 If the pattern string is omitted (or C<undef>) then the closing
1475 tag is constructed by inserting a C</> after any leading bracket
1476 characters in the actual opening tag that was matched (I<not> the pattern
1477 that matched the tag). For example, if the opening tag pattern
1478 is specified as C<'{{\w+}}'> and actually matched the opening tag
1479 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1480
1481 =item 4.
1482
1483 A string specifying a pattern to be matched as a prefix (which is to be
1484 skipped). If omitted, optional whitespace is skipped.
1485
1486 =item 5.
1487
1488 A hash reference containing various parsing options (see below)
1489
1490 =back
1491
1492 The various options that can be specified are:
1493
1494 =over 4
1495
1496 =item C<reject =E<gt> $listref>
1497
1498 The list reference contains one or more strings specifying patterns
1499 that must I<not> appear within the tagged text.
1500
1501 For example, to extract
1502 an HTML link (which should not contain nested links) use:
1503
1504         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1505
1506 =item C<ignore =E<gt> $listref>
1507
1508 The list reference contains one or more strings specifying patterns
1509 that are I<not> be be treated as nested tags within the tagged text
1510 (even if they would match the start tag pattern).
1511
1512 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1513
1514         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1515
1516 (also see L<"gen_delimited_pat"> below).
1517
1518
1519 =item C<fail =E<gt> $str>
1520
1521 The C<fail> option indicates the action to be taken if a matching end
1522 tag is not encountered (i.e. before the end of the string or some
1523 C<reject> pattern matches). By default, a failure to match a closing
1524 tag causes C<extract_tagged> to immediately fail.
1525
1526 However, if the string value associated with <reject> is "MAX", then
1527 C<extract_tagged> returns the complete text up to the point of failure.
1528 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1529 after the tag (up to the first line that is either empty or contains
1530 only whitespace characters).
1531 If the string is "", the the default behaviour (i.e. failure) is reinstated.
1532
1533 For example, suppose the start tag "/para" introduces a paragraph, which then
1534 continues until the next "/endpara" tag or until another "/para" tag is
1535 encountered:
1536
1537         $text = "/para line 1\n\nline 3\n/para line 4";
1538
1539         extract_tagged($text, '/para', '/endpara', undef,
1540                                 {reject => '/para', fail => MAX );
1541
1542         # EXTRACTED: "/para line 1\n\nline 3\n"
1543
1544 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1545 tag refers only to the immediately following paragraph:
1546
1547         $text = "/para line 1\n\nline 3\n/para line 4";
1548
1549         extract_tagged($text, '/para', '/endpara', undef,
1550                         {reject => '/para', fail => MAX );
1551
1552         # EXTRACTED: "/para line 1\n"
1553
1554 Note that the specified C<fail> behaviour applies to nested tags as well.
1555
1556 =back
1557
1558 On success in a list context, an array of 6 elements is returned. The elements are:
1559
1560 =over 4
1561
1562 =item [0]
1563
1564 the extracted tagged substring (including the outermost tags),
1565
1566 =item [1]
1567
1568 the remainder of the input text,
1569
1570 =item [2]
1571
1572 the prefix substring (if any),
1573
1574 =item [3]
1575
1576 the opening tag
1577
1578 =item [4]
1579
1580 the text between the opening and closing tags
1581
1582 =item [5]
1583
1584 the closing tag (or "" if no closing tag was found)
1585
1586 =back
1587
1588 On failure, all of these values (except the remaining text) are C<undef>.
1589
1590 In a scalar context, C<extract_tagged> returns just the complete
1591 substring that matched a tagged text (including the start and end
1592 tags). C<undef> is returned on failure. In addition, the original input
1593 text has the returned substring (and any prefix) removed from it.
1594
1595 In a void context, the input text just has the matched substring (and
1596 any specified prefix) removed.
1597
1598
1599 =head2 C<gen_extract_tagged>
1600
1601 (Note: This subroutine is only available under Perl5.005)
1602
1603 C<gen_extract_tagged> generates a new anonymous subroutine which
1604 extracts text between (balanced) specified tags. In other words,
1605 it generates a function identical in function to C<extract_tagged>.
1606
1607 The difference between C<extract_tagged> and the anonymous
1608 subroutines generated by
1609 C<gen_extract_tagged>, is that those generated subroutines:
1610
1611 =over 4
1612
1613 =item *
1614
1615 do not have to reparse tag specification or parsing options every time
1616 they are called (whereas C<extract_tagged> has to effectively rebuild
1617 its tag parser on every call);
1618
1619 =item *
1620
1621 make use of the new qr// construct to pre-compile the regexes they use
1622 (whereas C<extract_tagged> uses standard string variable interpolation
1623 to create tag-matching patterns).
1624
1625 =back
1626
1627 The subroutine takes up to four optional arguments (the same set as
1628 C<extract_tagged> except for the string to be processed). It returns
1629 a reference to a subroutine which in turn takes a single argument (the text to
1630 be extracted from).
1631
1632 In other words, the implementation of C<extract_tagged> is exactly
1633 equivalent to:
1634
1635         sub extract_tagged
1636         {
1637                 my $text = shift;
1638                 $extractor = gen_extract_tagged(@_);
1639                 return $extractor->($text);
1640         }
1641
1642 (although C<extract_tagged> is not currently implemented that way, in order
1643 to preserve pre-5.005 compatibility).
1644
1645 Using C<gen_extract_tagged> to create extraction functions for specific tags
1646 is a good idea if those functions are going to be called more than once, since
1647 their performance is typically twice as good as the more general-purpose
1648 C<extract_tagged>.
1649
1650
1651 =head2 C<extract_quotelike>
1652
1653 C<extract_quotelike> attempts to recognize, extract, and segment any
1654 one of the various Perl quotes and quotelike operators (see
1655 L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1656 delimiters (for the quotelike operators), and trailing modifiers are
1657 all caught. For example, in:
1658
1659         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1660
1661         extract_quotelike '  "You said, \"Use sed\"."  '
1662
1663         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1664
1665         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1666
1667 the full Perl quotelike operations are all extracted correctly.
1668
1669 Note too that, when using the /x modifier on a regex, any comment
1670 containing the current pattern delimiter will cause the regex to be
1671 immediately terminated. In other words:
1672
1673         'm /
1674                 (?i)            # CASE INSENSITIVE
1675                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1676                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1677            /x'
1678
1679 will be extracted as if it were:
1680
1681         'm /
1682                 (?i)            # CASE INSENSITIVE
1683                 [a-z_]          # LEADING ALPHABETIC/'
1684
1685 This behaviour is identical to that of the actual compiler.
1686
1687 C<extract_quotelike> takes two arguments: the text to be processed and
1688 a prefix to be matched at the very beginning of the text. If no prefix
1689 is specified, optional whitespace is the default. If no text is given,
1690 C<$_> is used.
1691
1692 In a list context, an array of 11 elements is returned. The elements are:
1693
1694 =over 4
1695
1696 =item [0]
1697
1698 the extracted quotelike substring (including trailing modifiers),
1699
1700 =item [1]
1701
1702 the remainder of the input text,
1703
1704 =item [2]
1705
1706 the prefix substring (if any),
1707
1708 =item [3]
1709
1710 the name of the quotelike operator (if any),
1711
1712 =item [4]
1713
1714 the left delimiter of the first block of the operation,
1715
1716 =item [5]
1717
1718 the text of the first block of the operation
1719 (that is, the contents of
1720 a quote, the regex of a match or substitution or the target list of a
1721 translation),
1722
1723 =item [6]
1724
1725 the right delimiter of the first block of the operation,
1726
1727 =item [7]
1728
1729 the left delimiter of the second block of the operation
1730 (that is, if it is a C<s>, C<tr>, or C<y>),
1731
1732 =item [8]
1733
1734 the text of the second block of the operation
1735 (that is, the replacement of a substitution or the translation list
1736 of a translation),
1737
1738 =item [9]
1739
1740 the right delimiter of the second block of the operation (if any),
1741
1742 =item [10]
1743
1744 the trailing modifiers on the operation (if any).
1745
1746 =back
1747
1748 For each of the fields marked "(if any)" the default value on success is
1749 an empty string.
1750 On failure, all of these values (except the remaining text) are C<undef>.
1751
1752
1753 In a scalar context, C<extract_quotelike> returns just the complete substring
1754 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1755 void context, the input text has the same substring (and any specified
1756 prefix) removed.
1757
1758 Examples:
1759
1760         # Remove the first quotelike literal that appears in text
1761
1762                 $quotelike = extract_quotelike($text,'.*?');
1763
1764         # Replace one or more leading whitespace-separated quotelike
1765         # literals in $_ with "<QLL>"
1766
1767                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1768
1769
1770         # Isolate the search pattern in a quotelike operation from $text
1771
1772                 ($op,$pat) = (extract_quotelike $text)[3,5];
1773                 if ($op =~ /[ms]/)
1774                 {
1775                         print "search pattern: $pat\n";
1776                 }
1777                 else
1778                 {
1779                         print "$op is not a pattern matching operation\n";
1780                 }
1781
1782
1783 =head2 C<extract_quotelike> and "here documents"
1784
1785 C<extract_quotelike> can successfully extract "here documents" from an input
1786 string, but with an important caveat in list contexts.
1787
1788 Unlike other types of quote-like literals, a here document is rarely
1789 a contiguous substring. For example, a typical piece of code using
1790 here document might look like this:
1791
1792         <<'EOMSG' || die;
1793         This is the message.
1794         EOMSG
1795         exit;
1796
1797 Given this as an input string in a scalar context, C<extract_quotelike>
1798 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1799 leaving the string " || die;\nexit;" in the original variable. In other words,
1800 the two separate pieces of the here document are successfully extracted and
1801 concatenated.
1802
1803 In a list context, C<extract_quotelike> would return the list
1804
1805 =over 4
1806
1807 =item [0]
1808
1809 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1810 including fore and aft delimiters),
1811
1812 =item [1]
1813
1814 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1815
1816 =item [2]
1817
1818 "" (i.e. the prefix substring -- trivial in this case),
1819
1820 =item [3]
1821
1822 "<<" (i.e. the "name" of the quotelike operator)
1823
1824 =item [4]
1825
1826 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1827
1828 =item [5]
1829
1830 "This is the message.\n" (i.e. the text of the here document),
1831
1832 =item [6]
1833
1834 "EOMSG" (i.e. the right delimiter of the here document),
1835
1836 =item [7..10]
1837
1838 "" (a here document has no second left delimiter, second text, second right
1839 delimiter, or trailing modifiers).
1840
1841 =back
1842
1843 However, the matching position of the input variable would be set to
1844 "exit;" (i.e. I<after> the closing delimiter of the here document),
1845 which would cause the earlier " || die;\nexit;" to be skipped in any
1846 sequence of code fragment extractions.
1847
1848 To avoid this problem, when it encounters a here document whilst
1849 extracting from a modifiable string, C<extract_quotelike> silently
1850 rearranges the string to an equivalent piece of Perl:
1851
1852         <<'EOMSG'
1853         This is the message.
1854         EOMSG
1855         || die;
1856         exit;
1857
1858 in which the here document I<is> contiguous. It still leaves the
1859 matching position after the here document, but now the rest of the line
1860 on which the here document starts is not skipped.
1861
1862 To prevent <extract_quotelike> from mucking about with the input in this way
1863 (this is the only case where a list-context C<extract_quotelike> does so),
1864 you can pass the input variable as an interpolated literal:
1865
1866         $quotelike = extract_quotelike("$var");
1867
1868
1869 =head2 C<extract_codeblock>
1870
1871 C<extract_codeblock> attempts to recognize and extract a balanced
1872 bracket delimited substring that may contain unbalanced brackets
1873 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1874 is like a combination of C<"extract_bracketed"> and
1875 C<"extract_quotelike">.
1876
1877 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1878 a text to process, a set of delimiter brackets to look for, and a prefix to
1879 match first. It also takes an optional fourth parameter, which allows the
1880 outermost delimiter brackets to be specified separately (see below).
1881
1882 Omitting the first argument (input text) means process C<$_> instead.
1883 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1884 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1885 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1886 value of the second argument is to be used for the outermost delimiters.
1887
1888 Once the prefix an dthe outermost opening delimiter bracket have been
1889 recognized, code blocks are extracted by stepping through the input text and
1890 trying the following alternatives in sequence:
1891
1892 =over 4
1893
1894 =item 1.
1895
1896 Try and match a closing delimiter bracket. If the bracket was the same
1897 species as the last opening bracket, return the substring to that
1898 point. If the bracket was mismatched, return an error.
1899
1900 =item 2.
1901
1902 Try to match a quote or quotelike operator. If found, call
1903 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1904 the error it returned. Otherwise go back to step 1.
1905
1906 =item 3.
1907
1908 Try to match an opening delimiter bracket. If found, call
1909 C<extract_codeblock> recursively to eat the embedded block. If the
1910 recursive call fails, return an error. Otherwise, go back to step 1.
1911
1912 =item 4.
1913
1914 Unconditionally match a bareword or any other single character, and
1915 then go back to step 1.
1916
1917 =back
1918
1919
1920 Examples:
1921
1922         # Find a while loop in the text
1923
1924                 if ($text =~ s/.*?while\s*\{/{/)
1925                 {
1926                         $loop = "while " . extract_codeblock($text);
1927                 }
1928
1929         # Remove the first round-bracketed list (which may include
1930         # round- or curly-bracketed code blocks or quotelike operators)
1931
1932                 extract_codeblock $text, "(){}", '[^(]*';
1933
1934
1935 The ability to specify a different outermost delimiter bracket is useful
1936 in some circumstances. For example, in the Parse::RecDescent module,
1937 parser actions which are to be performed only on a successful parse
1938 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1939
1940         sentence: subject verb object
1941                         <defer: {$::theVerb = $item{verb}} >
1942
1943 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1944 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1945
1946 A deferred action like this:
1947
1948                         <defer: {if ($count>10) {$count--}} >
1949
1950 will be incorrectly parsed as:
1951
1952                         <defer: {if ($count>
1953
1954 because the "less than" operator is interpreted as a closing delimiter.
1955
1956 But, by extracting the directive using
1957 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1958 the '>' character is only treated as a delimited at the outermost
1959 level of the code block, so the directive is parsed correctly.
1960
1961 =head2 C<extract_multiple>
1962
1963 The C<extract_multiple> subroutine takes a string to be processed and a
1964 list of extractors (subroutines or regular expressions) to apply to that string.
1965
1966 In an array context C<extract_multiple> returns an array of substrings
1967 of the original string, as extracted by the specified extractors.
1968 In a scalar context, C<extract_multiple> returns the first
1969 substring successfully extracted from the original string. In both
1970 scalar and void contexts the original string has the first successfully
1971 extracted substring removed from it. In all contexts
1972 C<extract_multiple> starts at the current C<pos> of the string, and
1973 sets that C<pos> appropriately after it matches.
1974
1975 Hence, the aim of of a call to C<extract_multiple> in a list context
1976 is to split the processed string into as many non-overlapping fields as
1977 possible, by repeatedly applying each of the specified extractors
1978 to the remainder of the string. Thus C<extract_multiple> is
1979 a generalized form of Perl's C<split> subroutine.
1980
1981 The subroutine takes up to four optional arguments:
1982
1983 =over 4
1984
1985 =item 1.
1986
1987 A string to be processed (C<$_> if the string is omitted or C<undef>)
1988
1989 =item 2.
1990
1991 A reference to a list of subroutine references and/or qr// objects and/or
1992 literal strings and/or hash references, specifying the extractors
1993 to be used to split the string. If this argument is omitted (or
1994 C<undef>) the list:
1995
1996         [
1997                 sub { extract_variable($_[0], '') },
1998                 sub { extract_quotelike($_[0],'') },
1999                 sub { extract_codeblock($_[0],'{}','') },
2000         ]
2001
2002 is used.
2003
2004
2005 =item 3.
2006
2007 An number specifying the maximum number of fields to return. If this
2008 argument is omitted (or C<undef>), split continues as long as possible.
2009
2010 If the third argument is I<N>, then extraction continues until I<N> fields
2011 have been successfully extracted, or until the string has been completely
2012 processed.
2013
2014 Note that in scalar and void contexts the value of this argument is
2015 automatically reset to 1 (under C<-w>, a warning is issued if the argument
2016 has to be reset).
2017
2018 =item 4.
2019
2020 A value indicating whether unmatched substrings (see below) within the
2021 text should be skipped or returned as fields. If the value is true,
2022 such substrings are skipped. Otherwise, they are returned.
2023
2024 =back
2025
2026 The extraction process works by applying each extractor in
2027 sequence to the text string.
2028
2029 If the extractor is a subroutine it is called in a list context and is
2030 expected to return a list of a single element, namely the extracted
2031 text. It may optionally also return two further arguments: a string
2032 representing the text left after extraction (like $' for a pattern
2033 match), and a string representing any prefix skipped before the
2034 extraction (like $` in a pattern match). Note that this is designed
2035 to facilitate the use of other Text::Balanced subroutines with
2036 C<extract_multiple>. Note too that the value returned by an extractor
2037 subroutine need not bear any relationship to the corresponding substring
2038 of the original text (see examples below).
2039
2040 If the extractor is a precompiled regular expression or a string,
2041 it is matched against the text in a scalar context with a leading
2042 '\G' and the gc modifiers enabled. The extracted value is either
2043 $1 if that variable is defined after the match, or else the
2044 complete match (i.e. $&).
2045
2046 If the extractor is a hash reference, it must contain exactly one element.
2047 The value of that element is one of the
2048 above extractor types (subroutine reference, regular expression, or string).
2049 The key of that element is the name of a class into which the successful
2050 return value of the extractor will be blessed.
2051
2052 If an extractor returns a defined value, that value is immediately
2053 treated as the next extracted field and pushed onto the list of fields.
2054 If the extractor was specified in a hash reference, the field is also
2055 blessed into the appropriate class,
2056
2057 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2058 assumed to have failed to extract.
2059 If none of the extractor subroutines succeeds, then one
2060 character is extracted from the start of the text and the extraction
2061 subroutines reapplied. Characters which are thus removed are accumulated and
2062 eventually become the next field (unless the fourth argument is true, in which
2063 case they are discarded).
2064
2065 For example, the following extracts substrings that are valid Perl variables:
2066
2067         @fields = extract_multiple($text,
2068                                    [ sub { extract_variable($_[0]) } ],
2069                                    undef, 1);
2070
2071 This example separates a text into fields which are quote delimited,
2072 curly bracketed, and anything else. The delimited and bracketed
2073 parts are also blessed to identify them (the "anything else" is unblessed):
2074
2075         @fields = extract_multiple($text,
2076                    [
2077                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
2078                         { Brack => sub { extract_bracketed($_[0],'{}') } },
2079                    ]);
2080
2081 This call extracts the next single substring that is a valid Perl quotelike
2082 operator (and removes it from $text):
2083
2084         $quotelike = extract_multiple($text,
2085                                       [
2086                                         sub { extract_quotelike($_[0]) },
2087                                       ], undef, 1);
2088
2089 Finally, here is yet another way to do comma-separated value parsing:
2090
2091         @fields = extract_multiple($csv_text,
2092                                   [
2093                                         sub { extract_delimited($_[0],q{'"}) },
2094                                         qr/([^,]+)(.*)/,
2095                                   ],
2096                                   undef,1);
2097
2098 The list in the second argument means:
2099 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2100 The undef third argument means:
2101 I<"...as many times as possible...">,
2102 and the true value in the fourth argument means
2103 I<"...discarding anything else that appears (i.e. the commas)">.
2104
2105 If you wanted the commas preserved as separate fields (i.e. like split
2106 does if your split pattern has capturing parentheses), you would
2107 just make the last parameter undefined (or remove it).
2108
2109
2110 =head2 C<gen_delimited_pat>
2111
2112 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2113    > builds a Friedl-style optimized regex that matches a string delimited
2114 by any one of the characters in the single argument. For example:
2115
2116         gen_delimited_pat(q{'"})
2117
2118 returns the regex:
2119
2120         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2121
2122 Note that the specified delimiters are automatically quotemeta'd.
2123
2124 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2125 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2126 (which might contain quoted strings):
2127
2128         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2129
2130         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2131
2132
2133 C<gen_delimited_pat> may also be called with an optional second argument,
2134 which specifies the "escape" character(s) to be used for each delimiter.
2135 For example to match a Pascal-style string (where ' is the delimiter
2136 and '' is a literal ' within the string):
2137
2138         gen_delimited_pat(q{'},q{'});
2139
2140 Different escape characters can be specified for different delimiters.
2141 For example, to specify that '/' is the escape for single quotes
2142 and '%' is the escape for double quotes:
2143
2144         gen_delimited_pat(q{'"},q{/%});
2145
2146 If more delimiters than escape chars are specified, the last escape char
2147 is used for the remaining delimiters.
2148 If no escape char is specified for a given specified delimiter, '\' is used.
2149
2150 =head2 C<delimited_pat>
2151
2152 Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2153 That name may still be used, but is now deprecated.
2154
2155
2156 =head1 DIAGNOSTICS
2157
2158 In a list context, all the functions return C<(undef,$original_text)>
2159 on failure. In a scalar context, failure is indicated by returning C<undef>
2160 (in this case the input text is not modified in any way).
2161
2162 In addition, on failure in I<any> context, the C<$@> variable is set.
2163 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2164 below.
2165 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2166 which the error was detected (although not necessarily where it occurred!)
2167 Printing C<$@> directly produces the error message, with the offset appended.
2168 On success, the C<$@> variable is guaranteed to be C<undef>.
2169
2170 The available diagnostics are:
2171
2172 =over 4
2173
2174 =item  C<Did not find a suitable bracket: "%s">
2175
2176 The delimiter provided to C<extract_bracketed> was not one of
2177 C<'()[]E<lt>E<gt>{}'>.
2178
2179 =item  C<Did not find prefix: /%s/>
2180
2181 A non-optional prefix was specified but wasn't found at the start of the text.
2182
2183 =item  C<Did not find opening bracket after prefix: "%s">
2184
2185 C<extract_bracketed> or C<extract_codeblock> was expecting a
2186 particular kind of bracket at the start of the text, and didn't find it.
2187
2188 =item  C<No quotelike operator found after prefix: "%s">
2189
2190 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2191 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2192 it was extracting.
2193
2194 =item  C<Unmatched closing bracket: "%c">
2195
2196 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2197 a closing bracket where none was expected.
2198
2199 =item  C<Unmatched opening bracket(s): "%s">
2200
2201 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2202 out of characters in the text before closing one or more levels of nested
2203 brackets.
2204
2205 =item C<Unmatched embedded quote (%s)>
2206
2207 C<extract_bracketed> attempted to match an embedded quoted substring, but
2208 failed to find a closing quote to match it.
2209
2210 =item C<Did not find closing delimiter to match '%s'>
2211
2212 C<extract_quotelike> was unable to find a closing delimiter to match the
2213 one that opened the quote-like operation.
2214
2215 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2216
2217 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2218 a valid bracket delimiter, but it was the wrong species. This usually
2219 indicates a nesting error, but may indicate incorrect quoting or escaping.
2220
2221 =item  C<No block delimiter found after quotelike "%s">
2222
2223 C<extract_quotelike> or C<extract_codeblock> found one of the
2224 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2225 without a suitable block after it.
2226
2227 =item C<Did not find leading dereferencer>
2228
2229 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2230 a variable, but didn't find any of them.
2231
2232 =item C<Bad identifier after dereferencer>
2233
2234 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2235 character was not followed by a legal Perl identifier.
2236
2237 =item C<Did not find expected opening bracket at %s>
2238
2239 C<extract_codeblock> failed to find any of the outermost opening brackets
2240 that were specified.
2241
2242 =item C<Improperly nested codeblock at %s>
2243
2244 A nested code block was found that started with a delimiter that was specified
2245 as being only to be used as an outermost bracket.
2246
2247 =item  C<Missing second block for quotelike "%s">
2248
2249 C<extract_codeblock> or C<extract_quotelike> found one of the
2250 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2251
2252 =item C<No match found for opening bracket>
2253
2254 C<extract_codeblock> failed to find a closing bracket to match the outermost
2255 opening bracket.
2256
2257 =item C<Did not find opening tag: /%s/>
2258
2259 C<extract_tagged> did not find a suitable opening tag (after any specified
2260 prefix was removed).
2261
2262 =item C<Unable to construct closing tag to match: /%s/>
2263
2264 C<extract_tagged> matched the specified opening tag and tried to
2265 modify the matched text to produce a matching closing tag (because
2266 none was specified). It failed to generate the closing tag, almost
2267 certainly because the opening tag did not start with a
2268 bracket of some kind.
2269
2270 =item C<Found invalid nested tag: %s>
2271
2272 C<extract_tagged> found a nested tag that appeared in the "reject" list
2273 (and the failure mode was not "MAX" or "PARA").
2274
2275 =item C<Found unbalanced nested tag: %s>
2276
2277 C<extract_tagged> found a nested opening tag that was not matched by a
2278 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2279
2280 =item C<Did not find closing tag>
2281
2282 C<extract_tagged> reached the end of the text without finding a closing tag
2283 to match the original opening tag (and the failure mode was not
2284 "MAX" or "PARA").
2285
2286
2287
2288
2289 =back
2290
2291
2292 =head1 AUTHOR
2293
2294 Damian Conway (damian@conway.org)
2295
2296
2297 =head1 BUGS AND IRRITATIONS
2298
2299 There are undoubtedly serious bugs lurking somewhere in this code, if
2300 only because parts of it give the impression of understanding a great deal
2301 more about Perl than they really do.
2302
2303 Bug reports and other feedback are most welcome.
2304
2305
2306 =head1 COPYRIGHT
2307
2308  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2309  This module is free software. It may be used, redistributed
2310      and/or modified under the same terms as Perl itself.