lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use SelfLoader;
  11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  12
  # Module version string.
  13 $VERSION = '1.86';
  # Inherit from Exporter so that callers can import the functions below.
  14 @ISA            = qw ( Exporter );
  15
  # All public functions are grouped under a single export tag, 'ALL'.
  # (No comments may be placed inside the qw() list: they would become words.)
  16 %EXPORT_TAGS    = ( ALL => [ qw(
  17                                 &extract_delimited
  18                                 &extract_bracketed
  19                                 &extract_quotelike
  20                                 &extract_codeblock
  21                                 &extract_variable
  22                                 &extract_tagged
  23                                 &extract_multiple
  24
  25                                 &gen_delimited_pat
  26                                 &gen_extract_tagged
  27
  28                                 &delimited_pat
  29                                ) ] );
  30
  # Register the names of the 'ALL' tag as exportable on request
  # (per Exporter's export_ok_tags: added to @EXPORT_OK, not exported by default).
  31 Exporter::export_ok_tags('ALL');
  32
  33 # PROTOTYPES
  34
  # Forward declarations of the prototyped private matchers, so that their
  # prototypes are known to code that appears before the definitions.
  35 sub _match_bracketed($$$$$$);
  36 sub _match_variable($$);
  37 sub _match_codeblock($$$$$$$);
  38 sub _match_quotelike($$$$);
  39
  40 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  41
  # Record a failure in $@.
  #   $_[0]  human-readable error message
  #   $_[1]  offset in the text at which the failure was detected
  # $@ becomes a hash-based object blessed into Text::Balanced::ErrorMsg with
  # the keys 'error' and 'pos'; that object is also the return value.
  sub _failmsg {
      my %details = ( error => $_[0], pos => $_[1] );
      return $@ = bless \%details, "Text::Balanced::ErrorMsg";
  }
  46
  # Produce the failure return value for an extract_* function.
  #   $wantarray  the caller's wantarray value
  #   $textref    reference to the text that was being examined
  #   $message    optional error message; when true it is recorded via _failmsg
  #   $pos        optional offset passed along to _failmsg
  # With a true $wantarray the result is ("", <unchanged text>, "");
  # otherwise it is undef.
  sub _fail
  {
      my ($wantarray, $textref, $message, $pos) = @_;
      if ($message) {
          _failmsg($message, $pos);
      }
      return $wantarray ? ("", $$textref, "") : undef;
  }
  54
  # Build the success return value for an extract_* function and update the text.
  #
  # Arguments: $wantarray, $textref, then (offset, length) pairs. As the callers
  # in this file pass them, the pairs are MATCH, REMAINDER, PREFIX, followed by
  # any number of extra pairs. When more than 18 values remain after the first
  # two arguments are removed, the final pair is taken to be (extrapos, extralen),
  # describing a "fillet": text lying between a here-doc introducer and its body.
  #
  # Clears $@. With a true $wantarray, returns one substring per pair and sets
  # pos() of the text to the start of the remainder. Otherwise returns just the
  # matched substring, attempts to delete prefix and match from the text, and
  # sets pos() to the start of the former prefix.
  55 sub _succeed
  56 {
  57         $@ = undef;
  58         my ($wantarray,$textref) = splice @_, 0, 2;
  59         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
             # After the splices: $_[0,1] = match, $_[2,3] = remainder, $_[4,5] = prefix.
  60         my ($startlen) = $_[5];
  61         my $remainderpos = $_[2];
  62         if ($wantarray)
  63         {
  64                 my @res;
                     # Consume the remaining arguments two at a time as (offset, length).
  65                 while (my ($from, $len) = splice @_, 0, 2)
  66                 {
  67                         push @res, substr($$textref,$from,$len);
  68                 }
  69                 if ($extralen) {        # CORRECT FILLET
                             # Cut the fillet out of the match (leaving "\n" in its place)
                             # and put it at the front of the remainder instead.
  70                         my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
  71                         $res[1] = "$extra$res[1]";
                             # The same rearrangement is attempted on the text itself; the
                             # eval lets the attempt fail without aborting (NOTE(review):
                             # presumably for unmodifiable text -- confirm).
  72                         eval { substr($$textref,$remainderpos,0) = $extra;
  73                                substr($$textref,$extrapos,$extralen,"\n")} ;
  74                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  75                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  76                 }
  77                 else {
  78                         pos($$textref) = $remainderpos;             # RESET \G
  79                 }
  80                 return @res;
  81         }
  82         else
  83         {
  84                 my $match = substr($$textref,$_[0],$_[1]);
                     # Drop any fillet from the returned match ...
  85                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
                     # ... and keep it (plus a newline) as the replacement text below.
  86                 my $extra = $extralen
  87                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
  88                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  89                 pos($$textref) = $_[4];                         # RESET \G
  90                 return $match;
  91         }
  92 }
  93
  94 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
  95
  # Build the source text of a regex matching a simple delimited string.
  #   $delimiters  string whose characters are each an acceptable delimiter
  #   $escapes     optional string of escape characters, paired by position with
  #                the delimiters; defaults to a backslash, and its last
  #                character is repeated to cover any unpaired delimiters
  # Returns "" when $delimiters contains no non-whitespace character, otherwise
  # a "(?:...)" group holding one alternative per delimiter. A delimiter that is
  # its own escape character is escaped by doubling it.
  sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
  {
      my ($delimiters, $escapes) = @_;
      return "" unless $delimiters =~ /\S/;

      $escapes = '\\' if !$escapes;
      my $shortfall = length($delimiters) - length($escapes);
      $escapes .= substr($escapes, -1) x $shortfall;

      my @alternatives;
      for my $index (0 .. length($delimiters) - 1)
      {
          my $dq = quotemeta substr($delimiters, $index, 1);
          my $eq = quotemeta substr($escapes,    $index, 1);
          push @alternatives,
              $dq eq $eq
                  ? "$dq(?:[^$dq]*(?:(?:$dq$dq)[^$dq]*)*)$dq"
                  : "$dq(?:[^$eq$dq]*(?:$eq.[^$eq$dq]*)*)$dq";
      }

      return '(?:' . join('|', @alternatives) . ')';
  }

  # Alternative name for gen_delimited_pat.
  *delimited_pat = \&gen_delimited_pat;
 122
 123
 124 # THE EXTRACTION FUNCTIONS
 125
 # Extract a simple delimited string from the start of a text.
 #   $_[0]  text to examine (defaults to $_); scanning starts at its pos()
 #   $_[1]  delimiter characters (defaults to ' " and `)
 #   $_[2]  prefix pattern to skip first (defaults to '\s*')
 #   $_[3]  escape characters (defaults to a backslash)
 # Returns whatever _succeed or _fail produce for the caller's context.
 sub extract_delimited (;$$$$)
 {
     my $wantarray = wantarray;
     my $textref   = defined $_[0] ? \$_[0] : \$_;

     my ($del, $pre, $esc) = (qq{\'\"\`}, '\s*', qq{\\});
     $del = $_[1] if defined $_[1];
     $pre = $_[2] if defined $_[2];
     $esc = $_[3] if defined $_[3];

     my $pat      = gen_delimited_pat($del, $esc);
     my $startpos = pos($$textref) || 0;

     if ($$textref =~ m/\G($pre)($pat)/gc)
     {
         my $prelen   = length($1);
         my $matchpos = $startpos + $prelen;
         my $endpos   = pos($$textref);
         my $restlen  = length($$textref) - $endpos;
         return _succeed($wantarray, $textref,
                         $matchpos, $endpos - $matchpos,     # MATCH
                         $endpos,   $restlen,                # REMAINDER
                         $startpos, $prelen);                # PREFIX
     }

     return _fail($wantarray, $textref, "Not a delimited pattern", 0);
 }
 145
 # Extract a balanced, bracket-delimited substring from the start of a text.
 #   $_[0]  text to examine (defaults to $_)
 #   $_[1]  delimiter specification (defaults to '{([<'). Quote characters
 #          (' " `) in it name quotes whose contents are skipped, and a 'q'
 #          enables skipping of quotelike constructs; everything that is not
 #          a bracket is then discarded.
 #   $_[2]  prefix pattern to skip first (defaults to '\s*')
 # Returns whatever _succeed or _fail produce for the caller's context.
 sub extract_bracketed (;$$$)
 {
     my $wantarray = wantarray;
     my $textref   = defined $_[0] ? \$_[0] : \$_;
     my $ldel      = defined $_[1] ? $_[1] : '{([<';
     my $pre       = defined $_[2] ? $_[2] : '\s*';

     # Pull the quote characters and the 'q' flag out of the specification.
     my $qdel = "";
     for my $quote (q{'}, q{"}, q{`})
     {
         $qdel .= $quote if $ldel =~ s/\Q$quote\E//g;
     }
     my $quotelike;
     $quotelike = 1 if $ldel =~ s/q//g;

     # Reduce the specification to opening brackets, then derive the closers.
     $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
     my $rdel = $ldel;
     if (!($rdel =~ tr/[({</])}>/))
     {
         return _fail($wantarray, $textref,
                      "Did not find a suitable bracket in delimiter: \"$_[1]\"",
                      0);
     }

     # Turn both sets into alternations, preserving pos() of $_ across the work.
     my $saved_pos = pos;
     $ldel = join('|', map { quotemeta $_ } split('', $ldel));
     $rdel = join('|', map { quotemeta $_ } split('', $rdel));
     pos = $saved_pos;

     my @match = _match_bracketed($textref, $pre, $ldel, $qdel, $quotelike, $rdel);
     if (!@match)
     {
         return _fail($wantarray, $textref);
     }

     return _succeed($wantarray, $textref,
                     $match[2], $match[5]+2,       # MATCH
                     @match[8,9],                  # REMAINDER
                     @match[0,1],                  # PREFIX
                    );
 }
 182
 # Scan a balanced bracketed sequence starting at pos() of the text.
 #   $textref    reference to the text
 #   $pre        prefix pattern that must match first
 #   $ldel       pattern matching an opening bracket
 #   $qdel       string of quote characters whose quoted contents are skipped
 #   $quotelike  when true, quotelike constructs are skipped via _match_quotelike
 #   $rdel       pattern matching a closing bracket
 # On success returns ten values, as (offset, length) pairs for PREFIX, OPENING
 # BRACKET, CONTENTS, CLOSING BRACKET and REMAINDER. On failure records the
 # reason with _failmsg and returns nothing; every failure after the prefix
 # check also rewinds pos() to where scanning started.
 183 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 184 {
 185         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
             # Turn an undefined pos() into 0 and remember it as $startpos;
             # $ldelpos and $endpos start out undefined.
 186         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 187         unless ($$textref =~ m/\G$pre/gc)
 188         {
 189                 _failmsg "Did not find prefix: /$pre/", $startpos;
 190                 return;
 191         }
 192
 193         $ldelpos = pos $$textref;
 194
 195         unless ($$textref =~ m/\G($ldel)/gc)
 196         {
 197                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 198                          pos $$textref;
 199                 pos $$textref = $startpos;
 200                 return;
 201         }
 202
             # Stack of currently open brackets, innermost last.
 203         my @nesting = ( $1 );
 204         my $textlen = length $$textref;
 205         while (pos $$textref < $textlen)
 206         {
                     # A backslash-escaped character never counts as a bracket or quote.
 207                 next if $$textref =~ m/\G\\./gcs;
 208
 209                 if ($$textref =~ m/\G($ldel)/gc)
 210                 {
 211                         push @nesting, $1;
 212                 }
 213                 elsif ($$textref =~ m/\G($rdel)/gc)
 214                 {
 215                         my ($found, $brackettype) = ($1, $1);
 216                         if ($#nesting < 0)
 217                         {
 218                                 _failmsg "Unmatched closing bracket: \"$found\"",
 219                                          pos $$textref;
 220                                 pos $$textref = $startpos;
 221                                 return;
 222                         }
                             # The closer must be the counterpart of the innermost opener.
 223                         my $expected = pop(@nesting);
 224                         $expected =~ tr/({[</)}]>/;
 225                         if ($expected ne $brackettype)
 226                         {
 227                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 228                                          pos $$textref;
 229                                 pos $$textref = $startpos;
 230                                 return;
 231                         }
                             # Outermost bracket closed: the sequence is complete.
 232                         last if $#nesting < 0;
 233                 }
 234                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 235                 {
                             # Skip to the matching quote, honouring backslash escapes.
 236                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 237                         _failmsg "Unmatched embedded quote ($1)",
 238                                  pos $$textref;
 239                         pos $$textref = $startpos;
 240                         return;
 241                 }
 242                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
 243                 {
 244                         next;
 245                 }
 246
                     # Anything else: step over a run of alphanumerics or one character.
 247                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 248         }
 249         if ($#nesting>=0)
 250         {
 251                 _failmsg "Unmatched opening bracket(s): "
 252                                 . join("..",@nesting)."..",
 253                          pos $$textref;
 254                 pos $$textref = $startpos;
 255                 return;
 256         }
 257
 258         $endpos = pos $$textref;
 259
 260         return (
 261                 $startpos,  $ldelpos-$startpos,         # PREFIX
 262                 $ldelpos,   1,                          # OPENING BRACKET
 263                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
 264                 $endpos-1,  1,                          # CLOSING BRACKET
 265                 $endpos,    length($$textref)-$endpos,  # REMAINDER
 266                );
 267 }
 268
 # Return the mirror image of a bracket string: the characters in reverse
 # order, with each opening bracket [ ( { < replaced by its closing partner.
 sub revbracket($)
 {
     (my $mirrored = reverse $_[0]) =~ tr/[({</])}>/;
     return $mirrored;
 }
 275
 # Regex source for a tag name: a letter, underscore or colon, followed by any
 # number of letters, digits, underscores, colons, dots or hyphens.
 # _match_tagged interpolates it when deriving a closing tag from an opening tag.
 276 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 277
 # Extract a tagged region (opening tag, text, closing tag) from a text.
 #   $_[0]  text to examine (defaults to $_)
 #   $_[1]  opening tag pattern; when undefined, a pattern for "<word ...>" is used
 #   $_[2]  closing tag; when undefined, _match_tagged derives it from the opening tag
 #   $_[3]  prefix pattern to skip first (defaults to '\s*')
 #   $_[4]  optional hash reference with the keys
 #            fail    failure mode, passed on to _match_tagged
 #            reject  pattern, or array reference of patterns joined with '|'
 #            ignore  pattern, or array reference of patterns joined with '|'
 # Clears $@ before matching. Returns whatever _succeed or _fail produce for
 # the caller's context.
 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 {
     my $textref = defined $_[0] ? \$_[0] : \$_;
     my $ldel    = $_[1];
     my $rdel    = $_[2];
     my $pre     = '\s*';
     $pre = $_[3] if defined $_[3];

     my %options = defined $_[4] ? %{$_[4]} : ();
     my $omode   = defined $options{fail} ? $options{fail} : '';

     # Normalise 'reject' and 'ignore' to a single pattern string each.
     my @alternations;
     for my $key (qw(reject ignore))
     {
         my $spec = $options{$key};
         push @alternations,
               ref($spec) eq 'ARRAY' ? join('|', @{$spec})
             : defined($spec)        ? $spec
             :                         ''
             ;
     }
     my ($bad, $ignore) = @alternations;

     unless (defined $ldel)
     {
         $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>';
     }
     $@ = undef;

     my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
     if (!@match)
     {
         return _fail(wantarray, $textref);
     }

     # The match spans opening tag, text and closing tag.
     my $matchlen = $match[3]+$match[5]+$match[7];
     return _succeed(wantarray, $textref,
                     $match[2], $matchlen,           # MATCH
                     @match[8..9,0..1,2..7]);        # REM, PRE, BITS
 }
 305
 # Scan a tagged region starting at pos() of the text.
 #   $textref  reference to the text
 #   $pre      prefix pattern that must match first
 #   $ldel     pattern matching an opening tag
 #   $rdel     closing tag; when undefined it is derived from the matched
 #             opening tag (leading brackets + "/" + tag name + mirrored brackets)
 #   $omode    failure mode: 'MAX' accepts everything scanned so far as the
 #             text, 'PARA' accepts the text up to the first blank-line
 #             paragraph break (or up to the stopping point if none was seen),
 #             anything else makes the match fail
 #   $bad      pattern for constructs that may not appear inside (may be '')
 #   $ignore   pattern for constructs to skip over (may be '')
 # On success returns ten values, as (offset, length) pairs for PREFIX, OPENING
 # TAG, TEXT, CLOSING TAG and REMAINDER. On failure records the reason with
 # _failmsg (unless $@ is already set), rewinds pos() and returns nothing.
 sub _match_tagged       # ($$$$$$$)
 {
         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
         my $rdelspec;

         # Turn an undefined pos() into 0 and remember it as $startpos; the
         # other position variables start out undefined.
         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );

         unless ($$textref =~ m/\G($pre)/gc)
         {
                 _failmsg("Did not find prefix: /$pre/", pos $$textref);
                 goto failed;
         }

         $opentagpos = pos($$textref);

         unless ($$textref =~ m/\G$ldel/gc)
         {
                 _failmsg("Did not find opening tag: /$ldel/", pos $$textref);
                 goto failed;
         }

         $textpos = pos($$textref);

         if (!defined $rdel)
         {
                 # Derive the closing tag from the text of the opening tag.
                 $rdelspec = $&;
                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
                 {
                         # $rdel is undefined in this branch, so report the opening
                         # tag (still held, unmodified, in $rdelspec) instead.
                         _failmsg("Unable to construct closing tag to match: $rdelspec",
                                  pos $$textref);
                         goto failed;
                 }
         }
         else
         {
                 # NOTE(review): string eval of the caller-supplied closing tag;
                 # it must never be fed untrusted input.
                 $rdelspec = eval "qq{$rdel}";
         }

         while (pos($$textref) < length($$textref))
         {
                 # Step over backslash-escaped characters.
                 next if $$textref =~ m/\G\\./gc;

                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
                 {
                         # Remember where the first paragraph break begins.
                         $parapos = pos($$textref) - length($1)
                                 unless defined $parapos;
                 }
                 elsif ($$textref =~ m/\G($rdelspec)/gc )
                 {
                         $closetagpos = pos($$textref)-length($1);
                         goto matched;
                 }
                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
                 {
                         next;
                 }
                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
                 {
                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
                         _failmsg("Found invalid nested tag: $1", pos $$textref);
                         goto failed;
                 }
                 elsif ($$textref =~ m/\G($ldel)/gc)
                 {
                         my $tag = $1;
                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
                         unless (_match_tagged(@_))      # MATCH NESTED TAG
                         {
                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
                                 _failmsg("Found unbalanced nested tag: $tag",
                                          pos $$textref);
                                 goto failed;
                         }
                 }
                 else { $$textref =~ m/./gcs }
         }

 short:
         # Reached when the text ran out, or when a rejected or unbalanced
         # construct stopped the scan in 'PARA' or 'MAX' mode.
         $closetagpos = pos($$textref);
         goto matched if $omode eq 'MAX';
         goto failed unless $omode eq 'PARA';

         if (defined $parapos) { pos($$textref) = $parapos }
         else                  { $parapos = pos($$textref) }

         return (
                 $startpos,    $opentagpos-$startpos,            # PREFIX
                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
                 $textpos,     $parapos-$textpos,                # TEXT
                 $parapos,     0,                                # NO CLOSING TAG
                 $parapos,     length($$textref)-$parapos,       # REMAINDER
                );

 matched:
         $endpos = pos($$textref);
         return (
                 $startpos,    $opentagpos-$startpos,            # PREFIX
                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
                 $textpos,     $closetagpos-$textpos,            # TEXT
                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
                 $endpos,      length($$textref)-$endpos,        # REMAINDER
                );

 failed:
         _failmsg("Did not find closing tag", pos $$textref) unless $@;
         pos($$textref) = $startpos;
         return;
 }
 415
 # Extract a Perl variable expression from the start of a text.
 #   $_[0]  text to examine (defaults to $_)
 #   $_[1]  prefix pattern to skip first (defaults to '\s*')
 # Returns ("","","") when the text is undefined; otherwise whatever _succeed
 # or _fail produce for the caller's context.
 sub extract_variable (;$$)
 {
     my $textref = defined $_[0] ? \$_[0] : \$_;
     if (!defined $$textref)
     {
         return ("","","");
     }

     my $pre = '\s*';
     $pre = $_[1] if defined $_[1];

     my @match = _match_variable($textref, $pre);

     return @match
         ? _succeed(wantarray, $textref,
                    @match[2..3,4..5,0..1])      # MATCH, REMAINDER, PREFIX
         : _fail(wantarray, $textref);
 }
 429
 # Scan a Perl variable expression starting at pos() of the text.
 #   $textref  reference to the text
 #   $pre      prefix pattern that must match first
 # On success returns six values, as (offset, length) pairs for PREFIX,
 # VARIABLE and REMAINDER. On failure records the reason with _failmsg and
 # returns nothing; failures after the prefix check also rewind pos().
 430 sub _match_variable($$)
 431 {
 432         my ($textref, $pre) = @_;
             # Turn an undefined pos() into 0 and remember the starting offset.
 433         my $startpos = pos($$textref) = pos($$textref)||0;
 434         unless ($$textref =~ m/\G($pre)/gc)
 435         {
 436                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 437                 return;
 438         }
 439         my $varpos = pos($$textref);
             # One or more leading sigils: $ or $#, *, @, %, or \&.
 440         unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
 441         {
 442                 _failmsg "Did not find leading dereferencer", pos $$textref;
 443                 pos $$textref = $startpos;
 444                 return;
 445         }
 446
             # Then either an identifier, optionally package-qualified with :: or ',
             # or a {...} block.
 447         unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 448                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0))
 449         {
 450                 _failmsg "Bad identifier after dereferencer", pos $$textref;
 451                 pos $$textref = $startpos;
 452                 return;
 453         }
 454
             # Absorb trailing pieces until none of the four forms below matches:
             #   "->", an optional name, then a (...), {...} or [...] block;
             #   a bare {...} or [...] subscript;
             #   "->" followed by another variable;
             #   "->" and a word that is not followed by an opening bracket.
 455         while (1)
 456         {
 457                 next if _match_codeblock($textref,
 458                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 459                                          qr/[({[]/, qr/[)}\]]/,
 460                                          qr/[({[]/, qr/[)}\]]/, 0);
 461                 next if _match_codeblock($textref,
 462                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
 463                                          qr/[{[]/, qr/[}\]]/, 0);
 464                 next if _match_variable($textref,'\s*->\s*');
 465                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 466                 last;
 467         }
 468
 469         my $endpos = pos($$textref);
 470         return ($startpos, $varpos-$startpos,
 471                 $varpos,   $endpos-$varpos,
 472                 $endpos,   length($$textref)-$endpos
 473                 );
 474 }
 475
 # Extract a bracketed block of Perl-like code from the start of a text.
 #   $_[0]  text to examine (defaults to $_)
 #   $_[1]  bracket characters delimiting nested blocks (defaults to '{')
 #   $_[2]  prefix pattern to skip first (defaults to '\s*')
 #   $_[3]  bracket characters delimiting the outermost block
 #          (defaults to the value used for nested blocks)
 #   $_[4]  flag passed unchanged to _match_codeblock
 # Returns whatever _succeed or _fail produce for the caller's context.
 sub extract_codeblock (;$$$$$)
 {
     my $textref    = defined $_[0] ? \$_[0] : \$_;
     my $wantarray  = wantarray;
     my $ldel_inner = defined $_[1] ? $_[1] : '{';
     my $pre        = defined $_[2] ? $_[2] : '\s*';
     my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
     my $rd         = $_[4];
     my ($rdel_inner, $rdel_outer) = ($ldel_inner, $ldel_outer);

     # pos() of $_ is saved here and restored once the patterns are built.
     my $saved_pos = pos;

     # Reduce each specification to its opening (or closing) brackets only.
     for my $opener ($ldel_inner, $ldel_outer)
     {
         $opener =~ tr/[]()<>{}\0-\377/[[((<<{{/ds;
     }
     for my $closer ($rdel_inner, $rdel_outer)
     {
         $closer =~ tr/[]()<>{}\0-\377/]]))>>}}/ds;
     }

     # Turn every bracket set into a capturing alternation.
     for my $set ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
     {
         my @quoted = map { quotemeta $_ } split('', $set);
         $set = '(' . join('|', @quoted) . ')';
     }
     pos = $saved_pos;

     my @match = _match_codeblock($textref, $pre,
                                  $ldel_outer, $rdel_outer,
                                  $ldel_inner, $rdel_inner,
                                  $rd);
     if (!@match)
     {
         return _fail($wantarray, $textref);
     }
     return _succeed($wantarray, $textref,
                     @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
                    );
 }
 505
 # Scan a bracketed block of Perl-like code starting at pos() of the text.
 #   $textref                   reference to the text
 #   $pre                       prefix pattern that must match first
 #   $ldel_outer, $rdel_outer   patterns for the outermost block's brackets
 #   $ldel_inner, $rdel_inner   patterns for the brackets of nested blocks
 #   $rd                        when true, the literal sequences "(?)", "(s?)"
 #                              and "(s)" are stepped over as single units
 # On success returns six values, as (offset, length) pairs for PREFIX, CODE
 # BLOCK and REMAINDER. On failure returns nothing, with the reason recorded
 # through _failmsg. Only the missing-opening-bracket failure rewinds pos().
 506 sub _match_codeblock($$$$$$$)
 507 {
 508         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 509         my $startpos = pos($$textref) = pos($$textref) || 0;
 510         unless ($$textref =~ m/\G($pre)/gc)
 511         {
 512                 _failmsg qq{Did not match prefix /$pre/ at"} .
 513                             substr($$textref,pos($$textref),20) .
 514                             q{..."},
 515                          pos $$textref;
 516                 return;
 517         }
 518         my $codepos = pos($$textref);
 519         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
 520         {
 521                 _failmsg qq{Did not find expected opening bracket at "} .
 522                              substr($$textref,pos($$textref),20) .
 523                              q{..."},
 524                          pos $$textref;
 525                 pos $$textref = $startpos;
 526                 return;
 527         }
             # The bracket expected to close the block is the opener's counterpart.
 528         my $closing = $1;
 529            $closing =~ tr/([<{/)]>}/;
 530         my $matched;
             # $patvalid is handed to _match_quotelike below as both its raw-match
             # and question-mark flags: it is set after an operator or a nested
             # block and cleared after a variable, a quotelike or any other token.
 531         my $patvalid = 1;
 532         while (pos($$textref) < length($$textref))
 533         {
 534                 $matched = '';
 535                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 536                 {
 537                         $patvalid = 0;
 538                         next;
 539                 }
 540
                     # Step over a '#' comment up to the end of its line.
 541                 if ($$textref =~ m/\G\s*#.*/gc)
 542                 {
 543                         next;
 544                 }
 545
 546                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 547                 {
                             # Success only if this closer is the one expected.
 548                         unless ($matched = ($closing && $1 eq $closing) )
 549                         {
 550                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
 551                                 _failmsg q{Mismatched closing bracket at "} .
 552                                              substr($$textref,pos($$textref),20) .
 553                                              qq{...". Expected '$closing'},
 554                                          pos $$textref;
 555                         }
 556                         last;
 557                 }
 558
 559                 if (_match_variable($textref,'\s*') ||
 560                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 561                 {
 562                         $patvalid = 0;
 563                         next;
 564                 }
 565
 566
 567                 # NEED TO COVER MANY MORE CASES HERE!!!
 568                 if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
 569                                         | [!=]~
 570                                         | =(?!>)
 571                                         | (\*\*|&&|\|\||<<|>>)=?
 572                                         | split|grep|map|return
 573                                         )#gcx)
 574                 {
 575                         $patvalid = 1;
 576                         next;
 577                 }
 578
                     # A nested block is scanned recursively, with the inner
                     # delimiters used for both of its delimiter roles.
 579                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 580                 {
 581                         $patvalid = 1;
 582                         next;
 583                 }
 584
 585                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
 586                 {
 587                         _failmsg q{Improperly nested codeblock at "} .
 588                                      substr($$textref,pos($$textref),20) .
 589                                      q{..."},
 590                                  pos $$textref;
 591                         last;
 592                 }
 593
                     # Anything else: consume a word, a two-character arrow-like
                     # token, or a single character.
 594                 $patvalid = 0;
 595                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 596         }
             # Runs after each iteration that does not leave via 'last', discarding
             # errors recorded by the sub-matchers tried above.
 597         continue { $@ = undef }
 598
 599         unless ($matched)
 600         {
 601                 _failmsg 'No match found for opening bracket', pos $$textref
 602                         unless $@;
 603                 return;
 604         }
 605
 606         my $endpos = pos($$textref);
 607         return ( $startpos, $codepos-$startpos,
 608                  $codepos, $endpos-$codepos,
 609                  $endpos,  length($$textref)-$endpos,
 610                );
 611 }
 612
 613
 # For each quotelike operator, the regex source matching the modifier letters
 # that may follow it ('' where none are accepted). The 'none' entry is used by
 # _match_quotelike for patterns written without an operator (/.../ or ?...?).
 614 my %mods   = (
 615                 'none'  => '[cgimsox]*',
 616                 'm'     => '[cgimsox]*',
 617                 's'     => '[cegimsox]*',
 618                 'tr'    => '[cds]*',
 619                 'y'     => '[cds]*',
 620                 'qq'    => '',
 621                 'qx'    => '',
 622                 'qw'    => '',
 623                 'qr'    => '[imsx]*',
 624                 'q'     => '',
 625              );
 626
 # Extract a Perl quote or quotelike operation from the start of a text.
 #   $_[0]  text to examine; $_ is used only when this argument is undefined,
 #          so a text of "" or "0" is examined as given (the same rule the
 #          other extract_* functions apply)
 #   $_[1]  prefix pattern to skip first (defaults to '\s*')
 # Returns whatever _succeed or _fail produce for the caller's context. On
 # success _succeed is given the match, remainder and prefix, then the eight
 # component pairs reported by _match_quotelike, then any here-doc fillet pair.
 sub extract_quotelike (;$$)
 {
     my $textref   = defined $_[0] ? \$_[0] : \$_;
     my $wantarray = wantarray;
     my $pre       = defined $_[1] ? $_[1] : '\s*';

     # Raw /.../ patterns are allowed here; ?...? patterns are not.
     my @match = _match_quotelike($textref, $pre, 1, 0);
     return _fail($wantarray, $textref) unless @match;
     return _succeed($wantarray, $textref,
                     $match[2], $match[18]-$match[2],        # MATCH
                     @match[18,19],                          # REMAINDER
                     @match[0,1],                            # PREFIX
                     @match[2..17],                          # THE BITS
                     @match[20,21],                          # ANY FILLET?
                    );
 }
 643
 644 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 645 {
 646         my ($textref, $pre, $rawmatch, $qmark) = @_;
 647
 648         my ($textlen,$startpos,
 649             $oppos,
 650             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 651             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 652             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 653
 654         unless ($$textref =~ m/\G($pre)/gc)
 655         {
 656                 _failmsg qq{Did not find prefix /$pre/ at "} .
 657                              substr($$textref, pos($$textref), 20) .
 658                              q{..."},
 659                          pos $$textref;
 660                 return;
 661         }
 662         $oppos = pos($$textref);
 663
 664         my $initial = substr($$textref,$oppos,1);
 665
 666         if ($initial && $initial =~ m|^[\"\'\`]|
 667                      || $rawmatch && $initial =~ m|^/|
 668                      || $qmark && $initial =~ m|^\?|)
 669         {
 670                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 671                 {
 672                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 673                                      substr($$textref, $oppos, 20) .
 674                                      q{..."},
 675                                  pos $$textref;
 676                         pos $$textref = $startpos;
 677                         return;
 678                 }
 679                 $modpos= pos($$textref);
 680                 $rd1pos = $modpos-1;
 681
 682                 if ($initial eq '/' || $initial eq '?')
 683                 {
 684                         $$textref =~ m/\G$mods{none}/gc
 685                 }
 686
 687                 my $endpos = pos($$textref);
 688                 return (
 689                         $startpos,      $oppos-$startpos,       # PREFIX
 690                         $oppos,         0,                      # NO OPERATOR
 691                         $oppos,         1,                      # LEFT DEL
 692                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 693                         $rd1pos,        1,                      # RIGHT DEL
 694                         $modpos,        0,                      # NO 2ND LDEL
 695                         $modpos,        0,                      # NO 2ND STR
 696                         $modpos,        0,                      # NO 2ND RDEL
 697                         $modpos,        $endpos-$modpos,        # MODIFIERS
 698                         $endpos,        $textlen-$endpos,       # REMAINDER
 699                        );
 700         }
 701
 702         unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
 703         {
 704                 _failmsg q{No quotelike operator found after prefix at "} .
 705                              substr($$textref, pos($$textref), 20) .
 706                              q{..."},
 707                          pos $$textref;
 708                 pos $$textref = $startpos;
 709                 return;
 710         }
 711
 712         my $op = $1;
 713         $preld1pos = pos($$textref);
 714         if ($op eq '<<') {
 715                 $ld1pos = pos($$textref);
 716                 my $label;
 717                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 718                         $label = $1;
 719                 }
 720                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 721                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 722                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 723                                      }gcsx) {
 724                         $label = $+;
 725                 }
 726                 else {
 727                         $label = "";
 728                 }
 729                 my $extrapos = pos($$textref);
 730                 $$textref =~ m{.*\n}gc;
 731                 $str1pos = pos($$textref);
 732                 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
 733                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 734                                      substr($$textref, $startpos, 20) .
 735                                      q{..."},
 736                                  pos $$textref;
 737                         pos $$textref = $startpos;
 738                         return;
 739                 }
 740                 $rd1pos = pos($$textref);
 741                 $$textref =~ m{$label\n}gc;
 742                 $ld2pos = pos($$textref);
 743                 return (
 744                         $startpos,      $oppos-$startpos,       # PREFIX
 745                         $oppos,         length($op),            # OPERATOR
 746                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 747                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 748                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 749                         $ld2pos,        0,                      # NO 2ND LDEL
 750                         $ld2pos,        0,                      # NO 2ND STR
 751                         $ld2pos,        0,                      # NO 2ND RDEL
 752                         $ld2pos,        0,                      # NO MODIFIERS
 753                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 754                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 755                        );
 756         }
 757
 758         $$textref =~ m/\G\s*/gc;
 759         $ld1pos = pos($$textref);
 760         $str1pos = $ld1pos+1;
 761
 762         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 763         {
 764                 _failmsg "No block delimiter found after quotelike $op",
 765                          pos $$textref;
 766                 pos $$textref = $startpos;
 767                 return;
 768         }
 769         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 770         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 771         if ($ldel1 =~ /[[(<{]/)
 772         {
 773                 $rdel1 =~ tr/[({</])}>/;
 774                 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
 775                 || do { pos $$textref = $startpos; return };
 776         }
 777         else
 778         {
 779                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 780                 || do { pos $$textref = $startpos; return };
 781         }
 782         $ld2pos = $rd1pos = pos($$textref)-1;
 783
 784         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 785         if ($second_arg)
 786         {
 787                 my ($ldel2, $rdel2);
 788                 if ($ldel1 =~ /[[(<{]/)
 789                 {
 790                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 791                         {
 792                                 _failmsg "Missing second block for quotelike $op",
 793                                          pos $$textref;
 794                                 pos $$textref = $startpos;
 795                                 return;
 796                         }
 797                         $ldel2 = $rdel2 = "\Q$1";
 798                         $rdel2 =~ tr/[({</])}>/;
 799                 }
 800                 else
 801                 {
 802                         $ldel2 = $rdel2 = $ldel1;
 803                 }
 804                 $str2pos = $ld2pos+1;
 805
 806                 if ($ldel2 =~ /[[(<{]/)
 807                 {
 808                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 809                         _match_bracketed($textref,"",$ldel2,"","",$rdel2)
 810                         || do { pos $$textref = $startpos; return };
 811                 }
 812                 else
 813                 {
 814                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 815                         || do { pos $$textref = $startpos; return };
 816                 }
 817                 $rd2pos = pos($$textref)-1;
 818         }
 819         else
 820         {
 821                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 822         }
 823
 824         $modpos = pos $$textref;
 825
 826         $$textref =~ m/\G($mods{$op})/gc;
 827         my $endpos = pos $$textref;
 828
 829         return (
 830                 $startpos,      $oppos-$startpos,       # PREFIX
 831                 $oppos,         length($op),            # OPERATOR
 832                 $ld1pos,        1,                      # LEFT DEL
 833                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 834                 $rd1pos,        1,                      # RIGHT DEL
 835                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 836                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 837                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 838                 $modpos,        $endpos-$modpos,        # MODIFIERS
 839                 $endpos,        $textlen-$endpos,       # REMAINDER
 840                );
 841 }
 842
 843 my $def_func =
 844 [
 845         sub { extract_variable($_[0], '') },
 846         sub { extract_quotelike($_[0],'') },
 847         sub { extract_codeblock($_[0],'{}','') },
 848 ];
 849
 850 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 851 {
 852         my $textref = defined($_[0]) ? \$_[0] : \$_;
 853         my $posbug = pos;
 854         my ($lastpos, $firstpos);
 855         my @fields = ();
 856
 857         for ($$textref)
 858         {
 859                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 860                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 861                 my $igunk = $_[3];
 862
 863                 pos ||= 0;
 864
 865                 unless (wantarray)
 866                 {
 867                         use Carp;
 868                         carp "extract_multiple reset maximal count to 1 in scalar context"
 869                                 if $^W && defined($_[2]) && $max > 1;
 870                         $max = 1
 871                 }
 872
 873                 my $unkpos;
 874                 my $func;
 875                 my $class;
 876
 877                 my @class;
 878                 foreach $func ( @func )
 879                 {
 880                         if (ref($func) eq 'HASH')
 881                         {
 882                                 push @class, (keys %$func)[0];
 883                                 $func = (values %$func)[0];
 884                         }
 885                         else
 886                         {
 887                                 push @class, undef;
 888                         }
 889                 }
 890
 891                 FIELD: while (pos() < length())
 892                 {
 893                         my $field;
 894                         foreach my $i ( 0..$#func )
 895                         {
 896                                 $func = $func[$i];
 897                                 $class = $class[$i];
 898                                 $lastpos = pos;
 899                                 if (ref($func) eq 'CODE')
 900                                         { ($field) = $func->($_) }
 901                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 902                                         { $field = $func->extract($_) }
 903                                 elsif( m/\G$func/gc )
 904                                         { $field = defined($1) ? $1 : $& }
 905
 906                                 if (defined($field) && length($field))
 907                                 {
 908                                         if (defined($unkpos) && !$igunk)
 909                                         {
 910                                                 push @fields, substr($_, $unkpos, $lastpos-$unkpos);
 911                                                 $firstpos = $unkpos unless defined $firstpos;
 912                                                 undef $unkpos;
 913                                                 last FIELD if @fields == $max;
 914                                         }
 915                                         push @fields, $class
 916                                                 ? bless(\$field, $class)
 917                                                 : $field;
 918                                         $firstpos = $lastpos unless defined $firstpos;
 919                                         $lastpos = pos;
 920                                         last FIELD if @fields == $max;
 921                                         next FIELD;
 922                                 }
 923                         }
 924                         if (/\G(.)/gcs)
 925                         {
 926                                 $unkpos = pos()-1
 927                                         unless $igunk || defined $unkpos;
 928                         }
 929                 }
 930
 931                 if (defined $unkpos)
 932                 {
 933                         push @fields, substr($_, $unkpos);
 934                         $firstpos = $unkpos unless defined $firstpos;
 935                         $lastpos = length;
 936                 }
 937                 last;
 938         }
 939
 940         pos $$textref = $lastpos;
 941         return @fields if wantarray;
 942
 943         $firstpos ||= 0;
 944         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 945                pos $$textref = $firstpos };
 946         return $fields[0];
 947 }
 948
 949
 950 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 951 {
 952         my $ldel    = $_[0];
 953         my $rdel    = $_[1];
 954         my $pre     = defined $_[2] ? $_[2] : '\s*';
 955         my %options = defined $_[3] ? %{$_[3]} : ();
 956         my $omode   = defined $options{fail} ? $options{fail} : '';
 957         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 958                     : defined($options{reject})        ? $options{reject}
 959                     :                                    ''
 960                     ;
 961         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 962                     : defined($options{ignore})        ? $options{ignore}
 963                     :                                    ''
 964                     ;
 965
 966         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 967
 968         my $posbug = pos;
 969         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
 970         pos = $posbug;
 971
 972         my $closure = sub
 973         {
 974                 my $textref = defined $_[0] ? \$_[0] : \$_;
 975                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 976
 977                 return _fail(wantarray, $textref) unless @match;
 978                 return _succeed wantarray, $textref,
 979                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 980                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 981         };
 982
 983         bless $closure, 'Text::Balanced::Extractor';
 984 }
 985
 986 package Text::Balanced::Extractor;
 987
 988 sub extract($$) # ($self, $text)
 989 {
 990         &{$_[0]}($_[1]);
 991 }
 992
 993 package Text::Balanced::ErrorMsg;
 994
 995 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
 996
 997 1;
 998
 999 __END__
1000
1001 =head1 NAME
1002
1003 Text::Balanced - Extract delimited text sequences from strings.
1004
1005
1006 =head1 SYNOPSIS
1007
1008  use Text::Balanced qw (
1009                         extract_delimited
1010                         extract_bracketed
1011                         extract_quotelike
1012                         extract_codeblock
1013                         extract_variable
1014                         extract_tagged
1015                         extract_multiple
1016
1017                         gen_delimited_pat
1018                         gen_extract_tagged
1019                        );
1020
1021  # Extract the initial substring of $text that is delimited by
1022  # two (unescaped) instances of the first character in $delim.
1023
1024         ($extracted, $remainder) = extract_delimited($text,$delim);
1025
1026
1027  # Extract the initial substring of $text that is bracketed
1028  # with a delimiter(s) specified by $delim (where the string
1029  # in $delim contains one or more of '(){}[]<>').
1030
1031         ($extracted, $remainder) = extract_bracketed($text,$delim);
1032
1033
1034  # Extract the initial substring of $text that is bounded by
1035  # an HTML/XML tag.
1036
1037         ($extracted, $remainder) = extract_tagged($text);
1038
1039
1040  # Extract the initial substring of $text that is bounded by
1041  # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1042
1043         ($extracted, $remainder) =
1044                 extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]});
1045
1046
1047  # Extract the initial substring of $text that represents a
1048  # Perl "quote or quote-like operation"
1049
1050         ($extracted, $remainder) = extract_quotelike($text);
1051
1052
1053  # Extract the initial substring of $text that represents a block
1054  # of Perl code, bracketed by any of character(s) specified by $delim
1055  # (where the string $delim contains one or more of '(){}[]<>').
1056
1057         ($extracted, $remainder) = extract_codeblock($text,$delim);
1058
1059
1060  # Extract the initial substrings of $text that would be extracted by
1061  # one or more sequential applications of the specified functions
1062  # or regular expressions
1063
1064         @extracted = extract_multiple($text,
1065                                       [ \&extract_bracketed,
1066                                         \&extract_quotelike,
1067                                         \&some_other_extractor_sub,
1068                                         qr/[xyz]*/,
1069                                         'literal',
1070                                       ]);
1071
1072  # Create a string representing an optimized pattern (a la Friedl)
1073  # that matches a substring delimited by any of the specified characters
1074  # (in this case: any type of quote or a slash)
1075
1076         $patstring = gen_delimited_pat(q{'"`/});
1077
1078
1079  # Generate a reference to an anonymous sub that is just like extract_tagged
1080  # but pre-compiled and optimized for a specific pair of tags, and consequently
1081  # much faster (i.e. 3 times faster). It uses qr// for better performance on
1082  # repeated calls, so it only works under Perl 5.005 or later.
1083
1084         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1085
1086         ($extracted, $remainder) = $extract_head->($text);
1087
1088
1089 =head1 DESCRIPTION
1090
1091 The various C<extract_...> subroutines may be used to extract a
1092 delimited string (possibly after skipping a specified prefix string).
1093 The search for the string always begins at the current C<pos>
1094 location of the string's variable (or at index zero, if no C<pos>
1095 position is defined).
1096
1097 =head2 General behaviour in list contexts
1098
1099 In a list context, all the subroutines return a list, the first three
1100 elements of which are always:
1101
1102 =over 4
1103
1104 =item [0]
1105
1106 The extracted string, including the specified delimiters.
1107 If the extraction fails an empty string is returned.
1108
1109 =item [1]
1110
1111 The remainder of the input string (i.e. the characters after the
1112 extracted string). On failure, the entire string is returned.
1113
1114 =item [2]
1115
1116 The skipped prefix (i.e. the characters before the extracted string).
1117 On failure, the empty string is returned.
1118
1119 =back
1120
1121 Note that in a list context, the contents of the original input text (the first
1122 argument) are not modified in any way.
1123
1124 However, if the input text was passed in a variable, that variable's
1125 C<pos> value is updated to point at the first character after the
1126 extracted text. That means that in a list context the various
1127 subroutines can be used much like regular expressions. For example:
1128
1129         while ( $next = (extract_quotelike($text))[0] )
1130         {
1131                 # process next quote-like (in $next)
1132         }
1133
1134
1135 =head2 General behaviour in scalar and void contexts
1136
1137 In a scalar context, the extracted string is returned, having first been
1138 removed from the input text. Thus, the following code also processes
1139 each quote-like operation, but actually removes them from $text:
1140
1141         while ( $next = extract_quotelike($text) )
1142         {
1143                 # process next quote-like (in $next)
1144         }
1145
1146 Note that if the input text is a read-only string (i.e. a literal),
1147 no attempt is made to remove the extracted text.
1148
1149 In a void context the behaviour of the extraction subroutines is
1150 exactly the same as in a scalar context, except (of course) that the
1151 extracted substring is not returned.
1152
1153 =head2 A note about prefixes
1154
1155 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1156 This can bite you if you're expecting a prefix specification like
1157 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1158 pattern will only succeed if the <H1> tag is on the current line, since
1159 . normally doesn't match newlines.
1160
1161 To overcome this limitation, you need to turn on /s matching within
1162 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1163
1164
1165 =head2 C<extract_delimited>
1166
1167 The C<extract_delimited> function formalizes the common idiom
1168 of extracting a single-character-delimited substring from the start of
1169 a string. For example, to extract a single-quote delimited string, the
1170 following code is typically used:
1171
1172         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1173         $extracted = $1;
1174
1175 but with C<extract_delimited> it can be simplified to:
1176
1177         ($extracted,$remainder) = extract_delimited($text, "'");
1178
1179 C<extract_delimited> takes up to four scalars (the input text, the
1180 delimiters, a prefix pattern to be skipped, and any escape characters)
1181 and extracts the initial substring of the text that
1182 is appropriately delimited. If the delimiter string has multiple
1183 characters, the first one encountered in the text is taken to delimit
1184 the substring.
1185 The third argument specifies a prefix pattern that is to be skipped
1186 (but must be present!) before the substring is extracted.
1187 The final argument specifies the escape character to be used for each
1188 delimiter.
1189
1190 All arguments are optional. If the escape characters are not specified,
1191 every delimiter is escaped with a backslash (C<\>).
1192 If the prefix is not specified, the
1193 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1194 is also not specified, the set C</["'`]/> is used. If the text to be processed
1195 is not specified either, C<$_> is used.
1196
1197 In list context, C<extract_delimited> returns an array of three
1198 elements, the extracted substring (I<including the surrounding
1199 delimiters>), the remainder of the text, and the skipped prefix (if
1200 any). If a suitable delimited substring is not found, the first
1201 element of the array is the empty string, the second is the complete
1202 original text, and the prefix returned in the third element is an
1203 empty string.
1204
1205 In a scalar context, just the extracted substring is returned. In
1206 a void context, the extracted substring (and any prefix) are simply
1207 removed from the beginning of the first argument.
1208
1209 Examples:
1210
1211         # Remove a single-quoted substring from the very beginning of $text:
1212
1213                 $substring = extract_delimited($text, "'", '');
1214
1215         # Remove a single-quoted Pascalish substring (i.e. one in which
1216         # doubling the quote character escapes it) from the very
1217         # beginning of $text:
1218
1219                 $substring = extract_delimited($text, "'", '', "'");
1220
1221         # Extract a single- or double- quoted substring from the
1222         # beginning of $text, optionally after some whitespace
1223         # (note the list context to protect $text from modification):
1224
1225                 ($substring) = extract_delimited $text, q{"'};
1226
1227
1228         # Delete the substring delimited by the first '/' in $text:
1229
1230                 $text = join '', (extract_delimited($text,'/','[^/]*'))[2,1];
1231
1232 Note that this last example is I<not> the same as deleting the first
1233 quote-like pattern. For instance, if C<$text> contained the string:
1234
1235         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1236
1237 then after the deletion it would contain:
1238
1239         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1240
1241 not:
1242
1243         "if ('./cmd' =~ ms) { $cmd = $1; }"
1244
1245
1246 See L<"extract_quotelike"> for a (partial) solution to this problem.
1247
1248
1249 =head2 C<extract_bracketed>
1250
1251 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1252 up to three optional scalar arguments: a string to extract from, a delimiter
1253 specifier, and a prefix pattern. As before, a missing prefix defaults to
1254 optional whitespace and a missing text defaults to C<$_>. However, a missing
1255 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1256
1257 C<extract_bracketed> extracts a balanced-bracket-delimited
1258 substring (using any one (or more) of the user-specified delimiter
1259 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1260 respect quoted unbalanced brackets (see below).
1261
1262 A "delimiter bracket" is a bracket in the list of delimiters passed as
1263 C<extract_bracketed>'s second argument. Delimiter brackets are
1264 specified by giving either the left or right (or both!) versions
1265 of the required bracket(s). Note that the order in which
1266 two or more delimiter brackets are specified is not significant.
1267
1268 A "balanced-bracket-delimited substring" is a substring bounded by
1269 matched brackets, such that any other (left or right) delimiter
1270 bracket I<within> the substring is also matched by an opposite
1271 (right or left) delimiter bracket I<at the same level of nesting>. Any
1272 type of bracket not in the delimiter list is treated as an ordinary
1273 character.
1274
1275 In other words, each type of bracket specified as a delimiter must be
1276 balanced and correctly nested within the substring, and any other kind of
1277 ("non-delimiter") bracket in the substring is ignored.
1278
1279 For example, given the string:
1280
1281         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1282
1283 then a call to C<extract_bracketed> in a list context:
1284
1285         @result = extract_bracketed( $text, '{}' );
1286
1287 would return:
1288
1289         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1290
1291 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1292 (In a scalar context just the first element of the array would be returned. In
1293 a void context, C<$text> would be replaced by an empty string.)
1294
1295 Likewise the call in:
1296
1297         @result = extract_bracketed( $text, '{[' );
1298
1299 would return the same result, since all sets of both types of specified
1300 delimiter brackets are correctly nested and balanced.
1301
1302 However, the call in:
1303
1304         @result = extract_bracketed( $text, '{([<' );
1305
1306 would fail, returning:
1307
1308         ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1309
1310 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1311 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1312 return an empty string. In a void context, C<$text> would be unchanged.)
1313
1314 Note that the embedded single-quotes in the string don't help in this
1315 case, since they have not been specified as acceptable delimiters and are
1316 therefore treated as non-delimiter characters (and ignored).
1317
1318 However, if a particular species of quote character is included in the
1319 delimiter specification, then that type of quote will be correctly handled.
1320 For example, if C<$text> is:
1321
1322         $text = '<A HREF=">>>>">link</A>';
1323
1324 then
1325
1326         @result = extract_bracketed( $text, '<">' );
1327
1328 returns:
1329
1330         ( '<A HREF=">>>>">', 'link</A>', "" )
1331
1332 as expected. Without the specification of C<"> as an embedded quoter:
1333
1334         @result = extract_bracketed( $text, '<>' );
1335
1336 the result would be:
1337
1338         ( '<A HREF=">', '>>>">link</A>', "" )
1339
1340 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1341 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1342 letter 'q' as a delimiter. Hence:
1343
1344         @result = extract_bracketed( $text, '<q>' );
1345
1346 would correctly match something like this:
1347
1348         $text = '<leftop: conj /and/ conj>';
1349
1350 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1351
1352
1353 =head2 C<extract_tagged>
1354
1355 C<extract_tagged> extracts and segments text between (balanced)
1356 specified tags.
1357
1358 The subroutine takes up to five optional arguments:
1359
1360 =over 4
1361
1362 =item 1.
1363
1364 A string to be processed (C<$_> if the string is omitted or C<undef>)
1365
1366 =item 2.
1367
1368 A string specifying a pattern to be matched as the opening tag.
1369 If the pattern string is omitted (or C<undef>) then a pattern
1370 that matches any standard HTML/XML tag is used.
1371
1372 =item 3.
1373
1374 A string specifying a pattern to be matched at the closing tag.
1375 If the pattern string is omitted (or C<undef>) then the closing
1376 tag is constructed by inserting a C</> after any leading bracket
1377 characters in the actual opening tag that was matched (I<not> the pattern
1378 that matched the tag). For example, if the opening tag pattern
1379 is specified as C<'{{\w+}}'> and actually matched the opening tag
1380 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1381
1382 =item 4.
1383
1384 A string specifying a pattern to be matched as a prefix (which is to be
1385 skipped). If omitted, optional whitespace is skipped.
1386
1387 =item 5.
1388
1389 A hash reference containing various parsing options (see below)
1390
1391 =back
1392
1393 The various options that can be specified are:
1394
1395 =over 4
1396
1397 =item C<reject =E<gt> $listref>
1398
1399 The list reference contains one or more strings specifying patterns
1400 that must I<not> appear within the tagged text.
1401
1402 For example, to extract
1403 an HTML link (which should not contain nested links) use:
1404
1405         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1406
1407 =item C<ignore =E<gt> $listref>
1408
1409 The list reference contains one or more strings specifying patterns
1410 that are I<not> to be treated as nested tags within the tagged text
1411 (even if they would match the start tag pattern).
1412
1413 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1414
1415         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1416
1417 (also see L<"gen_delimited_pat"> below).
1418
1419
1420 =item C<fail =E<gt> $str>
1421
1422 The C<fail> option indicates the action to be taken if a matching end
1423 tag is not encountered (i.e. before the end of the string or some
1424 C<reject> pattern matches). By default, a failure to match a closing
1425 tag causes C<extract_tagged> to immediately fail.
1426
1427 However, if the string value associated with C<fail> is "MAX", then
1428 C<extract_tagged> returns the complete text up to the point of failure.
1429 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1430 after the tag (up to the first line that is either empty or contains
1431 only whitespace characters).
1432 If the string is "", the default behaviour (i.e. failure) is reinstated.
1433
1434 For example, suppose the start tag "/para" introduces a paragraph, which then
1435 continues until the next "/endpara" tag or until another "/para" tag is
1436 encountered:
1437
1438         $text = "/para line 1\n\nline 3\n/para line 4";
1439
1440         extract_tagged($text, '/para', '/endpara', undef,
1441                                 {reject => '/para', fail => 'MAX'} );
1442
1443         # EXTRACTED: "/para line 1\n\nline 3\n"
1444
1445 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1446 tag refers only to the immediately following paragraph:
1447
1448         $text = "/para line 1\n\nline 3\n/para line 4";
1449
1450         extract_tagged($text, '/para', '/endpara', undef,
1451                         {reject => '/para', fail => 'PARA'} );
1452
1453         # EXTRACTED: "/para line 1\n"
1454
1455 Note that the specified C<fail> behaviour applies to nested tags as well.
1456
1457 =back
1458
1459 On success in a list context, an array of 6 elements is returned. The elements are:
1460
1461 =over 4
1462
1463 =item [0]
1464
1465 the extracted tagged substring (including the outermost tags),
1466
1467 =item [1]
1468
1469 the remainder of the input text,
1470
1471 =item [2]
1472
1473 the prefix substring (if any),
1474
1475 =item [3]
1476
1477 the opening tag
1478
1479 =item [4]
1480
1481 the text between the opening and closing tags
1482
1483 =item [5]
1484
1485 the closing tag (or "" if no closing tag was found)
1486
1487 =back
1488
1489 On failure, all of these values (except the remaining text) are C<undef>.
1490
1491 In a scalar context, C<extract_tagged> returns just the complete
1492 substring that matched a tagged text (including the start and end
1493 tags). C<undef> is returned on failure. In addition, the original input
1494 text has the returned substring (and any prefix) removed from it.
1495
1496 In a void context, the input text just has the matched substring (and
1497 any specified prefix) removed.
1498
1499
1500 =head2 C<gen_extract_tagged>
1501
1502 (Note: This subroutine is only available under Perl 5.005 or later)
1503
1504 C<gen_extract_tagged> generates a new anonymous subroutine which
1505 extracts text between (balanced) specified tags. In other words,
1506 it generates a function identical in function to C<extract_tagged>.
1507
1508 The difference between C<extract_tagged> and the anonymous
1509 subroutines generated by
1510 C<gen_extract_tagged>, is that those generated subroutines:
1511
1512 =over 4
1513
1514 =item *
1515
1516 do not have to reparse tag specification or parsing options every time
1517 they are called (whereas C<extract_tagged> has to effectively rebuild
1518 its tag parser on every call);
1519
1520 =item *
1521
1522 make use of the new qr// construct to pre-compile the regexes they use
1523 (whereas C<extract_tagged> uses standard string variable interpolation
1524 to create tag-matching patterns).
1525
1526 =back
1527
1528 The subroutine takes up to four optional arguments (the same set as
1529 C<extract_tagged> except for the string to be processed). It returns
1530 a reference to a subroutine which in turn takes a single argument (the text to
1531 be extracted from).
1532
1533 In other words, the implementation of C<extract_tagged> is exactly
1534 equivalent to:
1535
1536         sub extract_tagged
1537         {
1538                 my $text = shift;
1539                 $extractor = gen_extract_tagged(@_);
1540                 return $extractor->($text);
1541         }
1542
1543 (although C<extract_tagged> is not currently implemented that way, in order
1544 to preserve pre-5.005 compatibility).
1545
1546 Using C<gen_extract_tagged> to create extraction functions for specific tags
1547 is a good idea if those functions are going to be called more than once, since
1548 their performance is typically twice as good as the more general-purpose
1549 C<extract_tagged>.
1550
1551
1552 =head2 C<extract_quotelike>
1553
1554 C<extract_quotelike> attempts to recognize, extract, and segment any
1555 one of the various Perl quotes and quotelike operators (see
1556 L<perlop(3)>). Nested backslashed delimiters, embedded balanced bracket
1557 delimiters (for the quotelike operators), and trailing modifiers are
1558 all caught. For example, in:
1559
1560         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1561
1562         extract_quotelike '  "You said, \"Use sed\"."  '
1563
1564         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1565
1566         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1567
1568 the full Perl quotelike operations are all extracted correctly.
1569
1570 Note too that, when using the /x modifier on a regex, any comment
1571 containing the current pattern delimiter will cause the regex to be
1572 immediately terminated. In other words:
1573
1574         'm /
1575                 (?i)            # CASE INSENSITIVE
1576                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1577                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1578            /x'
1579
1580 will be extracted as if it were:
1581
1582         'm /
1583                 (?i)            # CASE INSENSITIVE
1584                 [a-z_]          # LEADING ALPHABETIC/'
1585
1586 This behaviour is identical to that of the actual compiler.
1587
1588 C<extract_quotelike> takes two arguments: the text to be processed and
1589 a prefix to be matched at the very beginning of the text. If no prefix
1590 is specified, optional whitespace is the default. If no text is given,
1591 C<$_> is used.
1592
1593 In a list context, an array of 11 elements is returned. The elements are:
1594
1595 =over 4
1596
1597 =item [0]
1598
1599 the extracted quotelike substring (including trailing modifiers),
1600
1601 =item [1]
1602
1603 the remainder of the input text,
1604
1605 =item [2]
1606
1607 the prefix substring (if any),
1608
1609 =item [3]
1610
1611 the name of the quotelike operator (if any),
1612
1613 =item [4]
1614
1615 the left delimiter of the first block of the operation,
1616
1617 =item [5]
1618
1619 the text of the first block of the operation
1620 (that is, the contents of
1621 a quote, the regex of a match or substitution or the target list of a
1622 translation),
1623
1624 =item [6]
1625
1626 the right delimiter of the first block of the operation,
1627
1628 =item [7]
1629
1630 the left delimiter of the second block of the operation
1631 (that is, if it is a C<s>, C<tr>, or C<y>),
1632
1633 =item [8]
1634
1635 the text of the second block of the operation
1636 (that is, the replacement of a substitution or the translation list
1637 of a translation),
1638
1639 =item [9]
1640
1641 the right delimiter of the second block of the operation (if any),
1642
1643 =item [10]
1644
1645 the trailing modifiers on the operation (if any).
1646
1647 =back
1648
1649 For each of the fields marked "(if any)" the default value on success is
1650 an empty string.
1651 On failure, all of these values (except the remaining text) are C<undef>.
1652
1653
1654 In a scalar context, C<extract_quotelike> returns just the complete substring
1655 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1656 void context, the input text has the same substring (and any specified
1657 prefix) removed.
1658
1659 Examples:
1660
1661         # Remove the first quotelike literal that appears in text
1662
1663                 $quotelike = extract_quotelike($text,'.*?');
1664
1665         # Replace one or more leading whitespace-separated quotelike
1666         # literals in $_ with "<QLL>"
1667
1668                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1669
1670
1671         # Isolate the search pattern in a quotelike operation from $text
1672
1673                 ($op,$pat) = (extract_quotelike $text)[3,5];
1674                 if ($op =~ /[ms]/)
1675                 {
1676                         print "search pattern: $pat\n";
1677                 }
1678                 else
1679                 {
1680                         print "$op is not a pattern matching operation\n";
1681                 }
1682
1683
1684 =head2 C<extract_quotelike> and "here documents"
1685
1686 C<extract_quotelike> can successfully extract "here documents" from an input
1687 string, but with an important caveat in list contexts.
1688
1689 Unlike other types of quote-like literals, a here document is rarely
1690 a contiguous substring. For example, a typical piece of code using
1691 a here document might look like this:
1692
1693         <<'EOMSG' || die;
1694         This is the message.
1695         EOMSG
1696         exit;
1697
1698 Given this as an input string in a scalar context, C<extract_quotelike>
1699 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1700 leaving the string " || die;\nexit;" in the original variable. In other words,
1701 the two separate pieces of the here document are successfully extracted and
1702 concatenated.
1703
1704 In a list context, C<extract_quotelike> would return the list
1705
1706 =over 4
1707
1708 =item [0]
1709
1710 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1711 including fore and aft delimiters),
1712
1713 =item [1]
1714
1715 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1716
1717 =item [2]
1718
1719 "" (i.e. the prefix substring -- trivial in this case),
1720
1721 =item [3]
1722
1723 "<<" (i.e. the "name" of the quotelike operator),
1724
1725 =item [4]
1726
1727 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1728
1729 =item [5]
1730
1731 "This is the message.\n" (i.e. the text of the here document),
1732
1733 =item [6]
1734
1735 "EOMSG" (i.e. the right delimiter of the here document),
1736
1737 =item [7..10]
1738
1739 "" (a here document has no second left delimiter, second text, second right
1740 delimiter, or trailing modifiers).
1741
1742 =back
1743
1744 However, the matching position of the input variable would be set to
1745 "exit;" (i.e. I<after> the closing delimiter of the here document),
1746 which would cause the earlier " || die;\nexit;" to be skipped in any
1747 sequence of code fragment extractions.
1748
1749 To avoid this problem, when it encounters a here document whilst
1750 extracting from a modifiable string, C<extract_quotelike> silently
1751 rearranges the string to an equivalent piece of Perl:
1752
1753         <<'EOMSG'
1754         This is the message.
1755         EOMSG
1756         || die;
1757         exit;
1758
1759 in which the here document I<is> contiguous. It still leaves the
1760 matching position after the here document, but now the rest of the line
1761 on which the here document starts is not skipped.
1762
1763 To prevent C<extract_quotelike> from mucking about with the input in this way
1764 (this is the only case where a list-context C<extract_quotelike> does so),
1765 you can pass the input variable as an interpolated literal:
1766
1767         $quotelike = extract_quotelike("$var");
1768
1769
1770 =head2 C<extract_codeblock>
1771
1772 C<extract_codeblock> attempts to recognize and extract a balanced
1773 bracket delimited substring that may contain unbalanced brackets
1774 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1775 is like a combination of C<"extract_bracketed"> and
1776 C<"extract_quotelike">.
1777
1778 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1779 a text to process, a set of delimiter brackets to look for, and a prefix to
1780 match first. It also takes an optional fourth parameter, which allows the
1781 outermost delimiter brackets to be specified separately (see below).
1782
1783 Omitting the first argument (input text) means process C<$_> instead.
1784 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1785 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1786 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1787 value of the second argument is to be used for the outermost delimiters.
1788
1789 Once the prefix and the outermost opening delimiter bracket have been
1790 recognized, code blocks are extracted by stepping through the input text and
1791 trying the following alternatives in sequence:
1792
1793 =over 4
1794
1795 =item 1.
1796
1797 Try and match a closing delimiter bracket. If the bracket was the same
1798 species as the last opening bracket, return the substring to that
1799 point. If the bracket was mismatched, return an error.
1800
1801 =item 2.
1802
1803 Try to match a quote or quotelike operator. If found, call
1804 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1805 the error it returned. Otherwise go back to step 1.
1806
1807 =item 3.
1808
1809 Try to match an opening delimiter bracket. If found, call
1810 C<extract_codeblock> recursively to eat the embedded block. If the
1811 recursive call fails, return an error. Otherwise, go back to step 1.
1812
1813 =item 4.
1814
1815 Unconditionally match a bareword or any other single character, and
1816 then go back to step 1.
1817
1818 =back
1819
1820
1821 Examples:
1822
1823         # Find a while loop in the text
1824
1825                 if ($text =~ s/.*?while\s*\{/{/)
1826                 {
1827                         $loop = "while " . extract_codeblock($text);
1828                 }
1829
1830         # Remove the first round-bracketed list (which may include
1831         # round- or curly-bracketed code blocks or quotelike operators)
1832
1833                 extract_codeblock $text, "(){}", '[^(]*';
1834
1835
1836 The ability to specify a different outermost delimiter bracket is useful
1837 in some circumstances. For example, in the Parse::RecDescent module,
1838 parser actions which are to be performed only on a successful parse
1839 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1840
1841         sentence: subject verb object
1842                         <defer: {$::theVerb = $item{verb}} >
1843
1844 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1845 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1846
1847 A deferred action like this:
1848
1849                         <defer: {if ($count>10) {$count--}} >
1850
1851 will be incorrectly parsed as:
1852
1853                         <defer: {if ($count>
1854
1855 because the "greater than" operator is interpreted as a closing delimiter.
1856
1857 But, by extracting the directive using
1858 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1859 the '>' character is only treated as a delimiter at the outermost
1860 level of the code block, so the directive is parsed correctly.
1861
1862 =head2 C<extract_multiple>
1863
1864 The C<extract_multiple> subroutine takes a string to be processed and a
1865 list of extractors (subroutines or regular expressions) to apply to that string.
1866
1867 In an array context C<extract_multiple> returns an array of substrings
1868 of the original string, as extracted by the specified extractors.
1869 In a scalar context, C<extract_multiple> returns the first
1870 substring successfully extracted from the original string. In both
1871 scalar and void contexts the original string has the first successfully
1872 extracted substring removed from it. In all contexts
1873 C<extract_multiple> starts at the current C<pos> of the string, and
1874 sets that C<pos> appropriately after it matches.
1875
1876 Hence, the aim of a call to C<extract_multiple> in a list context
1877 is to split the processed string into as many non-overlapping fields as
1878 possible, by repeatedly applying each of the specified extractors
1879 to the remainder of the string. Thus C<extract_multiple> is
1880 a generalized form of Perl's C<split> subroutine.
1881
1882 The subroutine takes up to four optional arguments:
1883
1884 =over 4
1885
1886 =item 1.
1887
1888 A string to be processed (C<$_> if the string is omitted or C<undef>)
1889
1890 =item 2.
1891
1892 A reference to a list of subroutine references and/or qr// objects and/or
1893 literal strings and/or hash references, specifying the extractors
1894 to be used to split the string. If this argument is omitted (or
1895 C<undef>) the list:
1896
1897         [
1898                 sub { extract_variable($_[0], '') },
1899                 sub { extract_quotelike($_[0],'') },
1900                 sub { extract_codeblock($_[0],'{}','') },
1901         ]
1902
1903 is used.
1904
1905
1906 =item 3.
1907
1908 A number specifying the maximum number of fields to return. If this
1909 argument is omitted (or C<undef>), split continues as long as possible.
1910
1911 If the third argument is I<N>, then extraction continues until I<N> fields
1912 have been successfully extracted, or until the string has been completely
1913 processed.
1914
1915 Note that in scalar and void contexts the value of this argument is
1916 automatically reset to 1 (under C<-w>, a warning is issued if the argument
1917 has to be reset).
1918
1919 =item 4.
1920
1921 A value indicating whether unmatched substrings (see below) within the
1922 text should be skipped or returned as fields. If the value is true,
1923 such substrings are skipped. Otherwise, they are returned.
1924
1925 =back
1926
1927 The extraction process works by applying each extractor in
1928 sequence to the text string. If the extractor is a subroutine it
1929 is called in a list
1930 context and is expected to return a list of a single element, namely
1931 the extracted text.
1932 Note that the value returned by an extractor subroutine need not bear any
1933 relationship to the corresponding substring of the original text (see
1934 examples below).
1935
1936 If the extractor is a precompiled regular expression or a string,
1937 it is matched against the text in a scalar context with a leading
1938 '\G' and the gc modifiers enabled. The extracted value is either
1939 $1 if that variable is defined after the match, or else the
1940 complete match (i.e. $&).
1941
1942 If the extractor is a hash reference, it must contain exactly one element.
1943 The value of that element is one of the
1944 above extractor types (subroutine reference, regular expression, or string).
1945 The key of that element is the name of a class into which the successful
1946 return value of the extractor will be blessed.
1947
1948 If an extractor returns a defined value, that value is immediately
1949 treated as the next extracted field and pushed onto the list of fields.
1950 If the extractor was specified in a hash reference, the field is also
1951 blessed into the appropriate class.
1952
1953 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
1954 assumed to have failed to extract.
1955 If none of the extractor subroutines succeeds, then one
1956 character is extracted from the start of the text and the extraction
1957 subroutines reapplied. Characters which are thus removed are accumulated and
1958 eventually become the next field (unless the fourth argument is true, in which
1959 case they are discarded).
1960
1961 For example, the following extracts substrings that are valid Perl variables:
1962
1963         @fields = extract_multiple($text,
1964                                    [ sub { extract_variable($_[0]) } ],
1965                                    undef, 1);
1966
1967 This example separates a text into fields which are quote delimited,
1968 curly bracketed, and anything else. The delimited and bracketed
1969 parts are also blessed to identify them (the "anything else" is unblessed):
1970
1971         @fields = extract_multiple($text,
1972                    [
1973                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
1974                         { Brack => sub { extract_bracketed($_[0],'{}') } },
1975                    ]);
1976
1977 This call extracts the next single substring that is a valid Perl quotelike
1978 operator (and removes it from $text):
1979
1980         $quotelike = extract_multiple($text,
1981                                       [
1982                                         sub { extract_quotelike($_[0]) },
1983                                       ], undef, 1);
1984
1985 Finally, here is yet another way to do comma-separated value parsing:
1986
1987         @fields = extract_multiple($csv_text,
1988                                   [
1989                                         sub { extract_delimited($_[0],q{'"}) },
1990                                         qr/([^,]+)(.*)/,
1991                                   ],
1992                                   undef,1);
1993
1994 The list in the second argument means:
1995 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
1996 The undef third argument means:
1997 I<"...as many times as possible...">,
1998 and the true value in the fourth argument means
1999 I<"...discarding anything else that appears (i.e. the commas)">.
2000
2001 If you wanted the commas preserved as separate fields (i.e. like split
2002 does if your split pattern has capturing parentheses), you would
2003 just make the last parameter undefined (or remove it).
2004
2005
2006 =head2 C<gen_delimited_pat>
2007
2008 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2009 builds a Friedl-style optimized regex that matches a string delimited
2010 by any one of the characters in the single argument. For example:
2011
2012         gen_delimited_pat(q{'"})
2013
2014 returns the regex:
2015
2016         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2017
2018 Note that the specified delimiters are automatically quotemeta'd.
2019
2020 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2021 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2022 (which might contain quoted strings):
2023
2024         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2025
2026         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2027
2028
2029 C<gen_delimited_pat> may also be called with an optional second argument,
2030 which specifies the "escape" character(s) to be used for each delimiter.
2031 For example to match a Pascal-style string (where ' is the delimiter
2032 and '' is a literal ' within the string):
2033
2034         gen_delimited_pat(q{'},q{'});
2035
2036 Different escape characters can be specified for different delimiters.
2037 For example, to specify that '/' is the escape for single quotes
2038 and '%' is the escape for double quotes:
2039
2040         gen_delimited_pat(q{'"},q{/%});
2041
2042 If more delimiters than escape chars are specified, the last escape char
2043 is used for the remaining delimiters.
2044 If no escape char is specified for a given specified delimiter, '\' is used.
2045
2046 Note that
2047 C<gen_delimited_pat> was previously called
2048 C<delimited_pat>. That name may still be used, but is now deprecated.
2049
2050
2051 =head1 DIAGNOSTICS
2052
2053 In a list context, all the functions return C<(undef,$original_text)>
2054 on failure. In a scalar context, failure is indicated by returning C<undef>
2055 (in this case the input text is not modified in any way).
2056
2057 In addition, on failure in I<any> context, the C<$@> variable is set.
2058 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2059 below.
2060 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2061 which the error was detected (although not necessarily where it occurred!).
2062 Printing C<$@> directly produces the error message, with the offset appended.
2063 On success, the C<$@> variable is guaranteed to be C<undef>.
2064
2065 The available diagnostics are:
2066
2067 =over 4
2068
2069 =item  C<Did not find a suitable bracket: "%s">
2070
2071 The delimiter provided to C<extract_bracketed> was not one of
2072 C<'()[]E<lt>E<gt>{}'>.
2073
2074 =item  C<Did not find prefix: /%s/>
2075
2076 A non-optional prefix was specified but wasn't found at the start of the text.
2077
2078 =item  C<Did not find opening bracket after prefix: "%s">
2079
2080 C<extract_bracketed> or C<extract_codeblock> was expecting a
2081 particular kind of bracket at the start of the text, and didn't find it.
2082
2083 =item  C<No quotelike operator found after prefix: "%s">
2084
2085 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2086 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2087 it was extracting.
2088
2089 =item  C<Unmatched closing bracket: "%c">
2090
2091 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2092 a closing bracket where none was expected.
2093
2094 =item  C<Unmatched opening bracket(s): "%s">
2095
2096 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2097 out of characters in the text before closing one or more levels of nested
2098 brackets.
2099
2100 =item C<Unmatched embedded quote (%s)>
2101
2102 C<extract_bracketed> attempted to match an embedded quoted substring, but
2103 failed to find a closing quote to match it.
2104
2105 =item C<Did not find closing delimiter to match '%s'>
2106
2107 C<extract_quotelike> was unable to find a closing delimiter to match the
2108 one that opened the quote-like operation.
2109
2110 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2111
2112 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2113 a valid bracket delimiter, but it was the wrong species. This usually
2114 indicates a nesting error, but may indicate incorrect quoting or escaping.
2115
2116 =item  C<No block delimiter found after quotelike "%s">
2117
2118 C<extract_quotelike> or C<extract_codeblock> found one of the
2119 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2120 without a suitable block after it.
2121
2122 =item C<Did not find leading dereferencer>
2123
2124 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2125 a variable, but didn't find any of them.
2126
2127 =item C<Bad identifier after dereferencer>
2128
2129 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2130 character was not followed by a legal Perl identifier.
2131
2132 =item C<Did not find expected opening bracket at %s>
2133
2134 C<extract_codeblock> failed to find any of the outermost opening brackets
2135 that were specified.
2136
2137 =item C<Improperly nested codeblock at %s>
2138
2139 A nested code block was found that started with a delimiter that was specified
2140 as being only to be used as an outermost bracket.
2141
2142 =item  C<Missing second block for quotelike "%s">
2143
2144 C<extract_codeblock> or C<extract_quotelike> found one of the
2145 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2146
2147 =item C<No match found for opening bracket>
2148
2149 C<extract_codeblock> failed to find a closing bracket to match the outermost
2150 opening bracket.
2151
2152 =item C<Did not find opening tag: /%s/>
2153
2154 C<extract_tagged> did not find a suitable opening tag (after any specified
2155 prefix was removed).
2156
2157 =item C<Unable to construct closing tag to match: /%s/>
2158
2159 C<extract_tagged> matched the specified opening tag and tried to
2160 modify the matched text to produce a matching closing tag (because
2161 none was specified). It failed to generate the closing tag, almost
2162 certainly because the opening tag did not start with a
2163 bracket of some kind.
2164
2165 =item C<Found invalid nested tag: %s>
2166
2167 C<extract_tagged> found a nested tag that appeared in the "reject" list
2168 (and the failure mode was not "MAX" or "PARA").
2169
2170 =item C<Found unbalanced nested tag: %s>
2171
2172 C<extract_tagged> found a nested opening tag that was not matched by a
2173 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2174
2175 =item C<Did not find closing tag>
2176
2177 C<extract_tagged> reached the end of the text without finding a closing tag
2178 to match the original opening tag (and the failure mode was not
2179 "MAX" or "PARA").
2180
2181
2182
2183
2184 =back
2185
2186
2187 =head1 AUTHOR
2188
2189 Damian Conway (damian@conway.org)
2190
2191
2192 =head1 BUGS AND IRRITATIONS
2193
2194 There are undoubtedly serious bugs lurking somewhere in this code, if
2195 only because parts of it give the impression of understanding a great deal
2196 more about Perl than they really do.
2197
2198 Bug reports and other feedback are most welcome.
2199
2200
2201 =head1 COPYRIGHT
2202
2203  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2204  This module is free software. It may be used, redistributed
2205      and/or modified under the same terms as Perl itself.