lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use SelfLoader;
  11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  12
  13 $VERSION = '1.89';
  14 @ISA            = qw ( Exporter );
  15
     # ALL PUBLIC FUNCTIONS ARE GROUPED UNDER THE :ALL TAG; export_ok_tags()
     # BELOW THEN MAKES EACH OF THEM AVAILABLE FOR IMPORT ON REQUEST.
  16 %EXPORT_TAGS    = ( ALL => [ qw(
  17                                 &extract_delimited
  18                                 &extract_bracketed
  19                                 &extract_quotelike
  20                                 &extract_codeblock
  21                                 &extract_variable
  22                                 &extract_tagged
  23                                 &extract_multiple
  24
  25                                 &gen_delimited_pat
  26                                 &gen_extract_tagged
  27
  28                                 &delimited_pat
  29                                ) ] );
  30
  31 Exporter::export_ok_tags('ALL');
  32
  33 ##
  34 ## These shenanigans are to avoid using $& in perl5.6+
     ## On perls older than 5.006 the closure simply returns $&. Otherwise it
     ## rebuilds the text of the last successful match from the @- and @+
     ## offsets, applied to the string passed as its first argument.
  35 ##
  36 my $GetMatchedText = ($] < 5.006) ? eval 'sub { $& } '
  37                                   : eval 'sub {
  38                                            substr($_[0], $-[0], $+[0] - $-[0])
  39                                           }';
  40
  41
  42 # PROTOTYPES
     # FORWARD DECLARATIONS OF THE PROTOTYPED INTERNAL MATCHERS, WHICH ARE
     # CALLED BELOW BEFORE THEIR DEFINITIONS APPEAR.
  43
  44 sub _match_bracketed($$$$$$);
  45 sub _match_variable($$);
  46 sub _match_codeblock($$$$$$$);
  47 sub _match_quotelike($$$$);
  48
  49 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  50
  51 sub _failmsg {
             # RECORD A FAILURE IN $@ AS A Text::Balanced::ErrorMsg OBJECT HOLDING
             # THE MESSAGE (FIRST ARGUMENT) AND THE POSITION (SECOND ARGUMENT).
  52         my %failure = ( error => $_[0], pos => $_[1] );
  53         $@ = bless \%failure, "Text::Balanced::ErrorMsg";
  54 }
  55
  56 sub _fail
  57 {
             # COMMON FAILURE RETURN: OPTIONALLY RECORDS A MESSAGE VIA _failmsg, THEN
             # RETURNS ("", ORIGINAL TEXT, "") WHEN THE CALLER'S wantarray FLAG IS
             # TRUE AND undef OTHERWISE.
  58         my ($list_context, $source, $reason, $where) = @_;
  59         if ($reason) { _failmsg($reason, $where) }
  60         return undef unless $list_context;
  61         return ("", $$source, "");
  62 }
  63
  64 sub _succeed
  65 {
             # BUILD THE RETURN VALUE FOR A SUCCESSFUL EXTRACTION AND RESET pos().
             # ARGUMENTS: THE CALLER'S wantarray FLAG, A REFERENCE TO THE TEXT, THEN
             # (OFFSET, LENGTH) PAIRS. THE CALLERS IN THIS FILE PASS THEM IN THE
             # ORDER MATCH, REMAINDER, PREFIX, OPTIONALLY FOLLOWED BY FURTHER PAIRS.
             # IF MORE THAN 18 VALUES REMAIN AFTER THE FIRST TWO ARGUMENTS, THE LAST
             # TWO ARE TAKEN AS THE OFFSET AND LENGTH OF AN EXTRA "FILLET"
             # (extract_quotelike SUPPLIES THIS PAIR).
             # LIST CONTEXT: RETURNS THE SUBSTRING FOR EVERY PAIR.
             # OTHERWISE: RETURNS ONLY THE MATCH AND ATTEMPTS TO DELETE THE PREFIX
             # AND MATCH FROM THE TEXT ITSELF.
  66         $@ = undef;
  67         my ($wantarray,$textref) = splice @_, 0, 2;
  68         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  69         my ($startlen) = $_[5];                 # PREFIX LENGTH
  70         my $remainderpos = $_[2];               # OFFSET OF THE REMAINDER
  71         if ($wantarray)
  72         {
  73                 my @res;
  74                 while (my ($from, $len) = splice @_, 0, 2)
  75                 {
  76                         push @res, substr($$textref,$from,$len);
  77                 }
  78                 if ($extralen) {        # CORRECT FILLET
                             # CUT THE FILLET OUT OF THE MATCH (LEAVING A NEWLINE)
                             # AND PUT IT AT THE FRONT OF THE REMAINDER
  79                         my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
  80                         $res[1] = "$extra$res[1]";
                             # THE eval SWALLOWS ANY ERROR FROM EDITING THE TEXT
                             # (PRESUMABLY FOR UNMODIFIABLE TEXT - TODO CONFIRM)
  81                         eval { substr($$textref,$remainderpos,0) = $extra;
  82                                substr($$textref,$extrapos,$extralen,"\n")} ;
  83                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  84                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  85                 }
  86                 else {
  87                         pos($$textref) = $remainderpos;             # RESET \G
  88                 }
  89                 return @res;
  90         }
  91         else
  92         {
  93                 my $match = substr($$textref,$_[0],$_[1]);
  94                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  95                 my $extra = $extralen
  96                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
                     # $_[4] IS THE PREFIX OFFSET; $_[1]+$_[5] IS MATCH LENGTH PLUS PREFIX LENGTH
  97                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  98                 pos($$textref) = $_[4];                         # RESET \G
  99                 return $match;
 100         }
 101 }
 102
 103 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
 104
 105 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
 106 {
 107         my ($dels, $escs) = @_;
 108         return "" unless $dels =~ /\S/;
 109         $escs = '\\' unless $escs;
 110         $escs .= substr($escs,-1) x (length($dels)-length($escs));
 111         my @pat = ();
 112         my $i;
 113         for ($i=0; $i<length $dels; $i++)
 114         {
 115                 my $del = quotemeta substr($dels,$i,1);
 116                 my $esc = quotemeta substr($escs,$i,1);
 117                 if ($del eq $esc)
 118                 {
 119                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 120                 }
 121                 else
 122                 {
 123                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 124                 }
 125         }
 126         my $pat = join '|', @pat;
 127         return "(?:$pat)";
 128 }
 129
 130 *delimited_pat = \&gen_delimited_pat;
 131
 132
 133 # THE EXTRACTION FUNCTIONS
 134
 135 sub extract_delimited (;$$$$)
 136 {
 137         my $textref = defined $_[0] ? \$_[0] : \$_;
 138         my $wantarray = wantarray;
 139         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 140         my $pre  = defined $_[2] ? $_[2] : '\s*';
 141         my $esc  = defined $_[3] ? $_[3] : qq{\\};
 142         my $pat = gen_delimited_pat($del, $esc);
 143         my $startpos = pos $$textref || 0;
 144         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 145                 unless $$textref =~ m/\G($pre)($pat)/gc;
 146         my $prelen = length($1);
 147         my $matchpos = $startpos+$prelen;
 148         my $endpos = pos $$textref;
 149         return _succeed $wantarray, $textref,
 150                         $matchpos, $endpos-$matchpos,           # MATCH
 151                         $endpos,   length($$textref)-$endpos,   # REMAINDER
 152                         $startpos, $prelen;                     # PREFIX
 153 }
 154
 155 sub extract_bracketed (;$$$)
 156 {
 157         my $textref = defined $_[0] ? \$_[0] : \$_;
 158         my $ldel = defined $_[1] ? $_[1] : '{([<';
 159         my $pre  = defined $_[2] ? $_[2] : '\s*';
 160         my $wantarray = wantarray;
 161         my $qdel = "";
 162         my $quotelike;
 163         $ldel =~ s/'//g and $qdel .= q{'};
 164         $ldel =~ s/"//g and $qdel .= q{"};
 165         $ldel =~ s/`//g and $qdel .= q{`};
 166         $ldel =~ s/q//g and $quotelike = 1;
 167         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 168         my $rdel = $ldel;
 169         unless ($rdel =~ tr/[({</])}>/)
 170         {
 171                 return _fail $wantarray, $textref,
 172                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 173                              0;
 174         }
 175         my $posbug = pos;
 176         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 177         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 178         pos = $posbug;
 179
 180         my $startpos = pos $$textref || 0;
 181         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 182
 183         return _fail ($wantarray, $textref) unless @match;
 184
 185         return _succeed ( $wantarray, $textref,
 186                           $match[2], $match[5]+2,       # MATCH
 187                           @match[8,9],                  # REMAINDER
 188                           @match[0,1],                  # PREFIX
 189                         );
 190 }
 191
  192 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
  193 {
              # SCAN FROM THE TEXT'S CURRENT pos() FOR THE PREFIX, AN OPENING BRACKET
              # AND THE BALANCED TEXT UP TO ITS MATCHING CLOSING BRACKET.
              # $ldel AND $rdel ARE ALTERNATION PATTERNS FOR OPENING AND CLOSING
              # BRACKETS; $qdel LISTS QUOTE CHARACTERS WHOSE QUOTED CONTENTS ARE
              # SKIPPED; $quotelike ENABLES SKIPPING OF QUOTELIKE OPERATORS.
              # ON SUCCESS RETURNS FIVE (OFFSET, LENGTH) PAIRS: PREFIX, OPENING
              # BRACKET, CONTENTS, CLOSING BRACKET, REMAINDER.
              # ON FAILURE RECORDS A MESSAGE VIA _failmsg AND RETURNS AN EMPTY LIST;
              # EVERY FAILURE PATH AFTER THE PREFIX CHECK RESETS pos() TO $startpos.
  194         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
              # FORCE pos() TO A DEFINED VALUE; ONLY $startpos RECEIVES A VALUE HERE
  195         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
  196         unless ($$textref =~ m/\G$pre/gc)
  197         {
  198                 _failmsg "Did not find prefix: /$pre/", $startpos;
  199                 return;
  200         }
  201
  202         $ldelpos = pos $$textref;
  203
  204         unless ($$textref =~ m/\G($ldel)/gc)
  205         {
  206                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
  207                          pos $$textref;
  208                 pos $$textref = $startpos;
  209                 return;
  210         }
  211
              # STACK OF CURRENTLY OPEN BRACKETS, INNERMOST LAST
  212         my @nesting = ( $1 );
  213         my $textlen = length $$textref;
  214         while (pos $$textref < $textlen)
  215         {
                      # SKIP ANY BACKSLASH-ESCAPED CHARACTER
  216                 next if $$textref =~ m/\G\\./gcs;
  217
  218                 if ($$textref =~ m/\G($ldel)/gc)
  219                 {
  220                         push @nesting, $1;
  221                 }
  222                 elsif ($$textref =~ m/\G($rdel)/gc)
  223                 {
  224                         my ($found, $brackettype) = ($1, $1);
  225                         if ($#nesting < 0)
  226                         {
  227                                 _failmsg "Unmatched closing bracket: \"$found\"",
  228                                          pos $$textref;
  229                                 pos $$textref = $startpos;
  230                                 return;
  231                         }
                              # THE CLOSER MUST BE THE COUNTERPART OF THE MOST RECENT OPENER
  232                         my $expected = pop(@nesting);
  233                         $expected =~ tr/({[</)}]>/;
  234                         if ($expected ne $brackettype)
  235                         {
  236                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
  237                                          pos $$textref;
  238                                 pos $$textref = $startpos;
  239                                 return;
  240                         }
                              # OUTERMOST BRACKET CLOSED: THE MATCH IS COMPLETE
  241                         last if $#nesting < 0;
  242                 }
  243                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
  244                 {
                              # SKIP TO THE MATCHING UNESCAPED CLOSING QUOTE
  245                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
  246                         _failmsg "Unmatched embedded quote ($1)",
  247                                  pos $$textref;
  248                         pos $$textref = $startpos;
  249                         return;
  250                 }
  251                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
  252                 {
  253                         next;
  254                 }
  255
                      # OTHERWISE CONSUME ONE ALPHANUMERIC RUN OR ONE CHARACTER
  256                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
  257         }
  258         if ($#nesting>=0)
  259         {
  260                 _failmsg "Unmatched opening bracket(s): "
  261                                 . join("..",@nesting)."..",
  262                          pos $$textref;
  263                 pos $$textref = $startpos;
  264                 return;
  265         }
  266
  267         $endpos = pos $$textref;
  268
  269         return (
  270                 $startpos,  $ldelpos-$startpos,         # PREFIX
  271                 $ldelpos,   1,                          # OPENING BRACKET
  272                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
  273                 $endpos-1,  1,                          # CLOSING BRACKET
  274                 $endpos,    length($$textref)-$endpos,  # REMAINDER
  275                );
  276 }
 277
 278 sub revbracket($)
 279 {
 280         my $brack = reverse $_[0];
 281         $brack =~ tr/[({</])}>/;
 282         return $brack;
 283 }
 284
  285 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};       # TAG NAME PATTERN; _match_tagged USES IT TO DERIVE A DEFAULT CLOSING TAG
 286
 287 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 288 {
 289         my $textref = defined $_[0] ? \$_[0] : \$_;
 290         my $ldel    = $_[1];
 291         my $rdel    = $_[2];
 292         my $pre     = defined $_[3] ? $_[3] : '\s*';
 293         my %options = defined $_[4] ? %{$_[4]} : ();
 294         my $omode   = defined $options{fail} ? $options{fail} : '';
 295         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 296                     : defined($options{reject})        ? $options{reject}
 297                     :                                    ''
 298                     ;
 299         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 300                     : defined($options{ignore})        ? $options{ignore}
 301                     :                                    ''
 302                     ;
 303
 304         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 305         $@ = undef;
 306
 307         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 308
 309         return _fail(wantarray, $textref) unless @match;
 310         return _succeed wantarray, $textref,
 311                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 312                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 313 }
 314
 315 sub _match_tagged       # ($$$$$$$)
 316 {
 317         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 318         my $rdelspec;
 319
 320         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 321
 322         unless ($$textref =~ m/\G($pre)/gc)
 323         {
 324                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 325                 goto failed;
 326         }
 327
 328         $opentagpos = pos($$textref);
 329
 330         unless ($$textref =~ m/\G$ldel/gc)
 331         {
 332                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 333                 goto failed;
 334         }
 335
 336         $textpos = pos($$textref);
 337
 338         if (!defined $rdel)
 339         {
 340                 $rdelspec = &$GetMatchedText($$textref);
 341
 342                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
 343                 {
 344                         _failmsg "Unable to construct closing tag to match: $rdel",
 345                                  pos $$textref;
 346                         goto failed;
 347                 }
 348         }
 349         else
 350         {
 351                 $rdelspec = eval "qq{$rdel}";
 352         }
 353
 354         while (pos($$textref) < length($$textref))
 355         {
 356                 next if $$textref =~ m/\G\\./gc;
 357
 358                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 359                 {
 360                         $parapos = pos($$textref) - length($1)
 361                                 unless defined $parapos;
 362                 }
 363                 elsif ($$textref =~ m/\G($rdelspec)/gc )
 364                 {
 365                         $closetagpos = pos($$textref)-length($1);
 366                         goto matched;
 367                 }
 368                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 369                 {
 370                         next;
 371                 }
 372                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 373                 {
 374                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
 375                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 376                         _failmsg "Found invalid nested tag: $1", pos $$textref;
 377                         goto failed;
 378                 }
 379                 elsif ($$textref =~ m/\G($ldel)/gc)
 380                 {
 381                         my $tag = $1;
 382                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
 383                         unless (_match_tagged(@_))      # MATCH NESTED TAG
 384                         {
 385                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
 386                                 _failmsg "Found unbalanced nested tag: $tag",
 387                                          pos $$textref;
 388                                 goto failed;
 389                         }
 390                 }
 391                 else { $$textref =~ m/./gcs }
 392         }
 393
 394 short:
 395         $closetagpos = pos($$textref);
 396         goto matched if $omode eq 'MAX';
 397         goto failed unless $omode eq 'PARA';
 398
 399         if (defined $parapos) { pos($$textref) = $parapos }
 400         else                  { $parapos = pos($$textref) }
 401
 402         return (
 403                 $startpos,    $opentagpos-$startpos,            # PREFIX
 404                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 405                 $textpos,     $parapos-$textpos,                # TEXT
 406                 $parapos,     0,                                # NO CLOSING TAG
 407                 $parapos,     length($$textref)-$parapos,       # REMAINDER
 408                );
 409
 410 matched:
 411         $endpos = pos($$textref);
 412         return (
 413                 $startpos,    $opentagpos-$startpos,            # PREFIX
 414                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 415                 $textpos,     $closetagpos-$textpos,            # TEXT
 416                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
 417                 $endpos,      length($$textref)-$endpos,        # REMAINDER
 418                );
 419
 420 failed:
 421         _failmsg "Did not find closing tag", pos $$textref unless $@;
 422         pos($$textref) = $startpos;
 423         return;
 424 }
 425
 426 sub extract_variable (;$$)
 427 {
 428         my $textref = defined $_[0] ? \$_[0] : \$_;
 429         return ("","","") unless defined $$textref;
 430         my $pre  = defined $_[1] ? $_[1] : '\s*';
 431
 432         my @match = _match_variable($textref,$pre);
 433
 434         return _fail wantarray, $textref unless @match;
 435
 436         return _succeed wantarray, $textref,
 437                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
 438 }
 439
  440 sub _match_variable($$)
  441 {
              # MATCH A PERL VARIABLE EXPRESSION AT THE TEXT'S CURRENT pos(), AFTER
              # THE PREFIX PATTERN $pre. ON SUCCESS RETURNS THREE (OFFSET, LENGTH)
              # PAIRS: PREFIX, VARIABLE, REMAINDER. ON FAILURE RECORDS A MESSAGE VIA
              # _failmsg AND RETURNS AN EMPTY LIST.
  442 #  $#
  443 #  $^
  444 #  $$
  445         my ($textref, $pre) = @_;
  446         my $startpos = pos($$textref) = pos($$textref)||0;
  447         unless ($$textref =~ m/\G($pre)/gc)
  448         {
  449                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
  450                 return;
  451         }
  452         my $varpos = pos($$textref);
              # FIRST TRY '$' FOLLOWED BY DIGITS, A PUNCTUATION CHARACTER, OR '^' WITH AN OPTIONAL LETTER
  453         unless ($$textref =~ m{\G\$\s*(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
  454         {
                  # OTHERWISE REQUIRE A RUN OF SIGILS / DEREFERENCERS ...
  455             unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
  456             {
  457                 _failmsg "Did not find leading dereferencer", pos $$textref;
  458                 pos $$textref = $startpos;
  459                 return;
  460             }
  461             my $deref = $1;
  462
                  # ... FOLLOWED BY AN (OPTIONALLY PACKAGE-QUALIFIED) IDENTIFIER OR A
                  # {...} BLOCK; A BARE '$#' OR '$$' IS ALSO ACCEPTED
  463             unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
  464                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
  465                 or $deref eq '$#' or $deref eq '$$' )
  466             {
  467                 _failmsg "Bad identifier after dereferencer", pos $$textref;
  468                 pos $$textref = $startpos;
  469                 return;
  470             }
  471         }
  472
              # CONSUME ANY TRAILING PARTS: '->' FOLLOWED BY A BRACKETED BLOCK
              # (OPTIONALLY AFTER A NAME), A DIRECTLY FOLLOWING {...} OR [...] BLOCK,
              # '->' FOLLOWED BY ANOTHER VARIABLE, OR '->' FOLLOWED BY A WORD THAT IS
              # NOT ITSELF FOLLOWED BY AN OPENING BRACKET
  473         while (1)
  474         {
  475                 next if _match_codeblock($textref,
  476                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
  477                                          qr/[({[]/, qr/[)}\]]/,
  478                                          qr/[({[]/, qr/[)}\]]/, 0);
  479                 next if _match_codeblock($textref,
  480                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
  481                                          qr/[{[]/, qr/[}\]]/, 0);
  482                 next if _match_variable($textref,'\s*->\s*');
  483                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
  484                 last;
  485         }
  486
  487         my $endpos = pos($$textref);
  488         return ($startpos, $varpos-$startpos,
  489                 $varpos,   $endpos-$varpos,
  490                 $endpos,   length($$textref)-$endpos
  491                 );
  492 }
 493
  494 sub extract_codeblock (;$$$$$)
  495 {
              # EXTRACT A BRACKET-DELIMITED BLOCK OF CODE FROM THE TEXT (FIRST
              # ARGUMENT, OR $_). FURTHER ARGUMENTS: INNER DELIMITER SPECIFICATION
              # (DEFAULT '{'), PREFIX PATTERN (DEFAULT '\s*'), OUTER DELIMITER
              # SPECIFICATION (DEFAULT: SAME AS INNER), AND A FLAG PASSED THROUGH
              # UNCHANGED TO _match_codeblock AS ITS $rd ARGUMENT.
              # RESULTS ARE RETURNED THROUGH _succeed / _fail.
  496         my $textref = defined $_[0] ? \$_[0] : \$_;
  497         my $wantarray = wantarray;
  498         my $ldel_inner = defined $_[1] ? $_[1] : '{';
  499         my $pre        = defined $_[2] ? $_[2] : '\s*';
  500         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
  501         my $rd         = $_[4];
  502         my $rdel_inner = $ldel_inner;
  503         my $rdel_outer = $ldel_outer;
              # SAVE pos($_); IT IS RESTORED AFTER THE LOOPS BELOW, WHICH ALIAS $_
  504         my $posbug = pos;
              # REDUCE EACH SPECIFICATION TO ITS OPENING (RESP. CLOSING) BRACKETS:
              # BRACKETS ARE MAPPED, EVERY OTHER CHARACTER IS DELETED, REPEATS SQUEEZED
  505         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
  506         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
              # TURN EACH BRACKET LIST INTO A CAPTURING ALTERNATION OF QUOTED CHARACTERS
  507         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
  508         {
  509                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
  510         }
  511         pos = $posbug;
  512
  513         my @match = _match_codeblock($textref, $pre,
  514                                      $ldel_outer, $rdel_outer,
  515                                      $ldel_inner, $rdel_inner,
  516                                      $rd);
  517         return _fail($wantarray, $textref) unless @match;
  518         return _succeed($wantarray, $textref,
  519                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
  520                        );
  521
  522 }
 523
  524 sub _match_codeblock($$$$$$$)
  525 {
              # MATCH A PREFIX FOLLOWED BY A BLOCK OF CODE ENCLOSED IN THE OUTER
              # DELIMITERS, STARTING AT THE TEXT'S CURRENT pos(). NESTED BLOCKS ARE
              # MATCHED RECURSIVELY USING THE INNER DELIMITERS. VARIABLES AND
              # QUOTELIKE CONSTRUCTS INSIDE THE BLOCK ARE SKIPPED AS UNITS.
              # ON SUCCESS RETURNS THREE (OFFSET, LENGTH) PAIRS: PREFIX, CODE BLOCK,
              # REMAINDER. ON FAILURE RECORDS A MESSAGE AND RETURNS AN EMPTY LIST.
              # NOTE(review): THE FAILURE RETURN AFTER THE MAIN LOOP DOES NOT RESET
              # pos() TO $startpos, UNLIKE THE OPENING-BRACKET FAILURE ABOVE IT -
              # CONFIRM WHETHER CALLERS DEPEND ON THIS.
  526         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
  527         my $startpos = pos($$textref) = pos($$textref) || 0;
  528         unless ($$textref =~ m/\G($pre)/gc)
  529         {
  530                 _failmsg qq{Did not match prefix /$pre/ at"} .
  531                             substr($$textref,pos($$textref),20) .
  532                             q{..."},
  533                          pos $$textref;
  534                 return;
  535         }
  536         my $codepos = pos($$textref);
  537         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
  538         {
  539                 _failmsg qq{Did not find expected opening bracket at "} .
  540                              substr($$textref,pos($$textref),20) .
  541                              q{..."},
  542                          pos $$textref;
  543                 pos $$textref = $startpos;
  544                 return;
  545         }
              # THE CLOSING BRACKET THAT MUST PAIR WITH THE OPENING ONE JUST MATCHED
  546         my $closing = $1;
  547            $closing =~ tr/([<{/)]>}/;
  548         my $matched;
              # TRUE WHILE A RAW /.../ OR ?...? PATTERN MAY START AT THE CURRENT POSITION;
              # PASSED TO _match_quotelike AS ITS $rawmatch AND $qmark ARGUMENTS
  549         my $patvalid = 1;
  550         while (pos($$textref) < length($$textref))
  551         {
  552                 $matched = '';
                      # WHEN $rd IS TRUE, SKIP THE LITERAL SEQUENCES (?) (s?) AND (s)
  553                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
  554                 {
  555                         $patvalid = 0;
  556                         next;
  557                 }
  558
                      # SKIP A COMMENT UP TO THE END OF ITS LINE
  559                 if ($$textref =~ m/\G\s*#.*/gc)
  560                 {
  561                         next;
  562                 }
  563
  564                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
  565                 {
  566                         unless ($matched = ($closing && $1 eq $closing) )
  567                         {
  568                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
  569                                 _failmsg q{Mismatched closing bracket at "} .
  570                                              substr($$textref,pos($$textref),20) .
  571                                              qq{...". Expected '$closing'},
  572                                          pos $$textref;
  573                         }
  574                         last;
  575                 }
  576
  577                 if (_match_variable($textref,'\s*') ||
  578                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
  579                 {
  580                         $patvalid = 0;
  581                         next;
  582                 }
  583
  584
  585                 # NEED TO COVER MANY MORE CASES HERE!!!
                      # AFTER ONE OF THESE OPERATORS OR KEYWORDS A PATTERN MAY FOLLOW
  586                 if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
  587                                         | [!=]~
  588                                         | =(?!>)
  589                                         | (\*\*|&&|\|\||<<|>>)=?
  590                                         | split|grep|map|return
  591                                         )#gcx)
  592                 {
  593                         $patvalid = 1;
  594                         next;
  595                 }
  596
                      # A NESTED BLOCK, DELIMITED BY THE INNER BRACKETS
  597                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
  598                 {
  599                         $patvalid = 1;
  600                         next;
  601                 }
  602
  603                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
  604                 {
  605                         _failmsg q{Improperly nested codeblock at "} .
  606                                      substr($$textref,pos($$textref),20) .
  607                                      q{..."},
  608                                  pos $$textref;
  609                         last;
  610                 }
  611
                      # ANYTHING ELSE: CONSUME ONE WORD, ARROW OR CHARACTER
  612                 $patvalid = 0;
  613                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
  614         }
              # RUNS AFTER EVERY ITERATION THAT DOES NOT LEAVE VIA last, DISCARDING
              # MESSAGES LEFT BY FAILED SUB-MATCHES; A MESSAGE SET JUST BEFORE last SURVIVES
  615         continue { $@ = undef }
  616
  617         unless ($matched)
  618         {
  619                 _failmsg 'No match found for opening bracket', pos $$textref
  620                         unless $@;
  621                 return;
  622         }
  623
  624         my $endpos = pos($$textref);
  625         return ( $startpos, $codepos-$startpos,
  626                  $codepos, $endpos-$codepos,
  627                  $endpos,  length($$textref)-$endpos,
  628                );
  629 }
 630
 631
      # FOR EACH QUOTELIKE OPERATOR, A PATTERN FOR THE MODIFIER LETTERS THAT MAY
      # FOLLOW ITS CLOSING DELIMITER. THE 'none' ENTRY IS USED BY _match_quotelike
      # FOR BARE /.../ AND ?...? MATCHES WRITTEN WITHOUT AN OPERATOR.
  632 my %mods   = (
  633                 'none'  => '[cgimsox]*',
  634                 'm'     => '[cgimsox]*',
  635                 's'     => '[cegimsox]*',
  636                 'tr'    => '[cds]*',
  637                 'y'     => '[cds]*',
  638                 'qq'    => '',
  639                 'qx'    => '',
  640                 'qw'    => '',
  641                 'qr'    => '[imsx]*',
  642                 'q'     => '',
  643              );
 644
 645 sub extract_quotelike (;$$)
 646 {
 647         my $textref = $_[0] ? \$_[0] : \$_;
 648         my $wantarray = wantarray;
 649         my $pre  = defined $_[1] ? $_[1] : '\s*';
 650
 651         my @match = _match_quotelike($textref,$pre,1,0);
 652         return _fail($wantarray, $textref) unless @match;
 653         return _succeed($wantarray, $textref,
 654                         $match[2], $match[18]-$match[2],        # MATCH
 655                         @match[18,19],                          # REMAINDER
 656                         @match[0,1],                            # PREFIX
 657                         @match[2..17],                          # THE BITS
 658                         @match[20,21],                          # ANY FILLET?
 659                        );
 660 };
 661
 662 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 663 {
 664         my ($textref, $pre, $rawmatch, $qmark) = @_;
 665
 666         my ($textlen,$startpos,
 667             $oppos,
 668             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 669             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 670             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 671
 672         unless ($$textref =~ m/\G($pre)/gc)
 673         {
 674                 _failmsg qq{Did not find prefix /$pre/ at "} .
 675                              substr($$textref, pos($$textref), 20) .
 676                              q{..."},
 677                          pos $$textref;
 678                 return;
 679         }
 680         $oppos = pos($$textref);
 681
 682         my $initial = substr($$textref,$oppos,1);
 683
 684         if ($initial && $initial =~ m|^[\"\'\`]|
 685                      || $rawmatch && $initial =~ m|^/|
 686                      || $qmark && $initial =~ m|^\?|)
 687         {
 688                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 689                 {
 690                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 691                                      substr($$textref, $oppos, 20) .
 692                                      q{..."},
 693                                  pos $$textref;
 694                         pos $$textref = $startpos;
 695                         return;
 696                 }
 697                 $modpos= pos($$textref);
 698                 $rd1pos = $modpos-1;
 699
 700                 if ($initial eq '/' || $initial eq '?')
 701                 {
 702                         $$textref =~ m/\G$mods{none}/gc
 703                 }
 704
 705                 my $endpos = pos($$textref);
 706                 return (
 707                         $startpos,      $oppos-$startpos,       # PREFIX
 708                         $oppos,         0,                      # NO OPERATOR
 709                         $oppos,         1,                      # LEFT DEL
 710                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 711                         $rd1pos,        1,                      # RIGHT DEL
 712                         $modpos,        0,                      # NO 2ND LDEL
 713                         $modpos,        0,                      # NO 2ND STR
 714                         $modpos,        0,                      # NO 2ND RDEL
 715                         $modpos,        $endpos-$modpos,        # MODIFIERS
 716                         $endpos,        $textlen-$endpos,       # REMAINDER
 717                        );
 718         }
 719
 720         unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
 721         {
 722                 _failmsg q{No quotelike operator found after prefix at "} .
 723                              substr($$textref, pos($$textref), 20) .
 724                              q{..."},
 725                          pos $$textref;
 726                 pos $$textref = $startpos;
 727                 return;
 728         }
 729
 730         my $op = $1;
 731         $preld1pos = pos($$textref);
 732         if ($op eq '<<') {
 733                 $ld1pos = pos($$textref);
 734                 my $label;
 735                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 736                         $label = $1;
 737                 }
 738                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 739                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 740                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 741                                      }gcsx) {
 742                         $label = $+;
 743                 }
 744                 else {
 745                         $label = "";
 746                 }
 747                 my $extrapos = pos($$textref);
 748                 $$textref =~ m{.*\n}gc;
 749                 $str1pos = pos($$textref);
 750                 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
 751                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 752                                      substr($$textref, $startpos, 20) .
 753                                      q{..."},
 754                                  pos $$textref;
 755                         pos $$textref = $startpos;
 756                         return;
 757                 }
 758                 $rd1pos = pos($$textref);
 759                 $$textref =~ m{$label\n}gc;
 760                 $ld2pos = pos($$textref);
 761                 return (
 762                         $startpos,      $oppos-$startpos,       # PREFIX
 763                         $oppos,         length($op),            # OPERATOR
 764                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 765                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 766                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 767                         $ld2pos,        0,                      # NO 2ND LDEL
 768                         $ld2pos,        0,                      # NO 2ND STR
 769                         $ld2pos,        0,                      # NO 2ND RDEL
 770                         $ld2pos,        0,                      # NO MODIFIERS
 771                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 772                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 773                        );
 774         }
 775
 776         $$textref =~ m/\G\s*/gc;
 777         $ld1pos = pos($$textref);
 778         $str1pos = $ld1pos+1;
 779
 780         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 781         {
 782                 _failmsg "No block delimiter found after quotelike $op",
 783                          pos $$textref;
 784                 pos $$textref = $startpos;
 785                 return;
 786         }
 787         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 788         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 789         if ($ldel1 =~ /[[(<{]/)
 790         {
 791                 $rdel1 =~ tr/[({</])}>/;
 792                 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
 793                 || do { pos $$textref = $startpos; return };
 794         }
 795         else
 796         {
 797                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 798                 || do { pos $$textref = $startpos; return };
 799         }
 800         $ld2pos = $rd1pos = pos($$textref)-1;
 801
 802         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 803         if ($second_arg)
 804         {
 805                 my ($ldel2, $rdel2);
 806                 if ($ldel1 =~ /[[(<{]/)
 807                 {
 808                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 809                         {
 810                                 _failmsg "Missing second block for quotelike $op",
 811                                          pos $$textref;
 812                                 pos $$textref = $startpos;
 813                                 return;
 814                         }
 815                         $ldel2 = $rdel2 = "\Q$1";
 816                         $rdel2 =~ tr/[({</])}>/;
 817                 }
 818                 else
 819                 {
 820                         $ldel2 = $rdel2 = $ldel1;
 821                 }
 822                 $str2pos = $ld2pos+1;
 823
 824                 if ($ldel2 =~ /[[(<{]/)
 825                 {
 826                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 827                         _match_bracketed($textref,"",$ldel2,"","",$rdel2)
 828                         || do { pos $$textref = $startpos; return };
 829                 }
 830                 else
 831                 {
 832                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 833                         || do { pos $$textref = $startpos; return };
 834                 }
 835                 $rd2pos = pos($$textref)-1;
 836         }
 837         else
 838         {
 839                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 840         }
 841
 842         $modpos = pos $$textref;
 843
 844         $$textref =~ m/\G($mods{$op})/gc;
 845         my $endpos = pos $$textref;
 846
 847         return (
 848                 $startpos,      $oppos-$startpos,       # PREFIX
 849                 $oppos,         length($op),            # OPERATOR
 850                 $ld1pos,        1,                      # LEFT DEL
 851                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 852                 $rd1pos,        1,                      # RIGHT DEL
 853                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 854                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 855                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 856                 $modpos,        $endpos-$modpos,        # MODIFIERS
 857                 $endpos,        $textlen-$endpos,       # REMAINDER
 858                );
 859 }
 860
 861 # Extractors that extract_multiple falls back on when it is not given a
 862 # list of its own. Each one is called with the text as its only data argument.
 863 my $def_func = [
 864         sub { return extract_variable($_[0], '') },
 865         sub { return extract_quotelike($_[0], '') },
 866         sub { return extract_codeblock($_[0], '{}', '') },
 867 ];
 867
 868 # Split a text into consecutive fields by trying a list of extractors at
 869 # each position.
 870 #
 871 #   $_[0]  text to process ($_ if undefined); scanning starts at its pos
 872 #   $_[1]  ref to an array of extractors ($def_func if undefined). Each is a
 873 #          code ref, a Text::Balanced::Extractor object, or a pattern that
 874 #          is matched at \G. A hash ref { CLASS => EXTRACTOR } makes the
 875 #          field a reference to the string, blessed into CLASS.
 876 #   $_[2]  maximum number of fields (1_000_000_000 if undefined or not
 877 #          positive; forced to 1 when not called in list context)
 878 #   $_[3]  true to drop text that no extractor recognizes instead of
 879 #          returning it as fields of its own
 880 #
 881 # In list context returns the fields and sets pos to $lastpos (normally the
 882 # end of the last field). Otherwise returns the first field only, removes
 883 # the text between $firstpos and $lastpos from the string if it is writable,
 884 # and sets pos to $firstpos.
 885 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 886 {
 887         my $textref = defined($_[0]) ? \$_[0] : \$_;
 888         my ($lastpos, $firstpos);
 889         my @fields = ();
 890
 891         #for ($$textref)
 892         {
 893                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 894                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 895                 my $igunk = $_[3];
 896
 897                 pos $$textref ||= 0;
 898
 899                 unless (wantarray)
 900                 {
 901                         use Carp;
 902                         carp "extract_multiple reset maximal count to 1 in scalar context"
 903                                 if $^W && defined($_[2]) && $max > 1;
 904                         $max = 1
 905                 }
 906
 907                 my $unkpos;     # start of text not yet claimed by any extractor
 908                 my $func;
 909                 my $class;
 910
 911                 # Separate { CLASS => EXTRACTOR } entries into two parallel lists
 912                 my @class;
 913                 foreach $func ( @func )
 914                 {
 915                         if (ref($func) eq 'HASH')
 916                         {
 917                                 push @class, (keys %$func)[0];
 918                                 $func = (values %$func)[0];
 919                         }
 920                         else
 921                         {
 922                                 push @class, undef;
 923                         }
 924                 }
 925
 926                 FIELD: while (pos($$textref) < length($$textref))
 927                 {
 928                         my $field;
 929                         my @bits;
 930                         foreach my $i ( 0..$#func )
 931                         {
 932                                 my $pref;
 933                                 $func = $func[$i];
 934                                 $class = $class[$i];
 935                                 $lastpos = pos $$textref;
 936                                 if (ref($func) eq 'CODE')
 937                                         { ($field,undef,$pref) = @bits = $func->($$textref) }
 938                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 939                                         { @bits = $field = $func->extract($$textref) }
 940                                 elsif( $$textref =~ m/\G$func/gc )
 941                                         { @bits = $field = defined($1) ? $1 : &$GetMatchedText($$textref) }
 942                                        # substr() on previous line is "$&", without the pain
 943                                 $pref ||= "";
 944                                 if (defined($field) && length($field))
 945                                 {
 946                                         if (!$igunk) {
 947                                                 # A skipped prefix is unknown text too. It begins where
 948                                                 # this attempt began ($lastpos); pos has normally moved
 949                                                 # past the extracted field by now, so it cannot be used.
 950                                                 $unkpos = $lastpos
 951                                                         if length($pref) && !defined($unkpos);
 952                                                 if (defined $unkpos)
 953                                                 {
 954                                                         # Flush the pending unknown text as a field of its own
 955                                                         push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 956                                                         $firstpos = $unkpos unless defined $firstpos;
 957                                                         undef $unkpos;
 958                                                         last FIELD if @fields == $max;
 959                                                 }
 960                                         }
 961                                         push @fields, $class
 962                                                 ? bless (\$field, $class)
 963                                                 : $field;
 964                                         $firstpos = $lastpos unless defined $firstpos;
 965                                         $lastpos = pos $$textref;
 966                                         last FIELD if @fields == $max;
 967                                         next FIELD;
 968                                 }
 969                         }
 970                         # No extractor matched here: step over one character, noting
 971                         # where the unknown text started (unless it is being ignored)
 972                         if ($$textref =~ /\G(.)/gcs)
 973                         {
 974                                 $unkpos = pos($$textref)-1
 975                                         unless $igunk || defined $unkpos;
 976                         }
 977                 }
 978
 979                 # Unknown text running to the end of the string is the final field
 980                 if (defined $unkpos)
 981                 {
 982                         push @fields, substr($$textref, $unkpos);
 983                         $firstpos = $unkpos unless defined $firstpos;
 984                         $lastpos = length $$textref;
 985                 }
 986                 last;
 987         }
 988
 989         pos $$textref = $lastpos;
 990         return @fields if wantarray;
 991
 992         $firstpos ||= 0;
 993         # The eval presumably guards against a read-only text (e.g. a literal),
 994         # in which case the removal is silently skipped
 995         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 996                pos $$textref = $firstpos };
 997         return $fields[0];
 998 }
 973
 974
 975 # Build an extractor for one fixed tag specification.
 976 #
 977 #   $_[0]  opening-tag pattern (a pattern for any <...> tag if undefined)
 978 #   $_[1]  closing-tag pattern, handed to _match_tagged as given
 979 #   $_[2]  prefix pattern ('\s*' if undefined)
 980 #   $_[3]  ref to a hash of options; 'fail', 'reject' and 'ignore' are read.
 981 #          'reject' and 'ignore' may each be one pattern or a ref to an
 982 #          array of patterns, which are joined as alternatives.
 983 #
 984 # Returns a code ref blessed into Text::Balanced::Extractor. Called with a
 985 # text (or on $_ if none is given) it runs _match_tagged and passes the
 986 # outcome through _fail or _succeed.
 987 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 988 {
 989         my ($ldel, $rdel, $pre, $optref) = @_;
 990         $pre = '\s*' unless defined $pre;
 991         my %options = defined $optref ? %{$optref} : ();
 992         my $omode   = defined $options{fail} ? $options{fail} : '';
 993
 994         # Reduce an option value to a single pattern string
 995         my $as_pattern = sub {
 996                 my ($spec) = @_;
 997                 return join('|', @{$spec}) if ref($spec) eq 'ARRAY';
 998                 return defined($spec) ? $spec : '';
 999         };
1000         my $bad    = $as_pattern->($options{reject});
1001         my $ignore = $as_pattern->($options{ignore});
1002
1003         # Default opening tag: '<', a word, then quoted strings or non-'>' characters, '>'
1004         unless (defined $ldel) {
1005                 $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>';
1006         }
1007
1008         # Pre-compile each pattern that is a true value. The loop aliases $_;
1009         # pos($_) is saved before it and restored after it (presumably a
1010         # workaround for pos being disturbed by the aliasing -- TODO confirm).
1011         my $saved_pos = pos;
1012         foreach ($ldel, $pre, $bad, $ignore) {
1013                 next unless $_;
1014                 $_ = qr/$_/;
1015         }
1016         pos = $saved_pos;
1017
1018         my $extractor = sub {
1019                 my $textref = defined $_[0] ? \$_[0] : \$_;
1020                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1021                 unless (@match) {
1022                         return _fail(wantarray, $textref);
1023                 }
1024
1025                 my $matchpos = $match[2];
1026                 my $matchlen = $match[3] + $match[5] + $match[7];
1027                 return _succeed(wantarray, $textref,
1028                                 $matchpos, $matchlen,           # MATCH
1029                                 @match[8..9, 0..1, 2..7]);      # REM, PRE, BITS
1030         };
1031
1032         return bless $extractor, 'Text::Balanced::Extractor';
1033 }
1010
1011 package Text::Balanced::Extractor;
1012
1013 # Run the extractor (a blessed code ref) on a text. The text is passed on
1014 # as $_[1] itself rather than a copy, so the code sees the caller's variable.
1015 sub extract($$) # ($self, $text)
1016 {
1017         my $code = $_[0];
1018         return $code->($_[1]);
1019 }
1017
1018 package Text::Balanced::ErrorMsg;
1019
1020 # Used as a string, an error object reads "<error>, detected at offset <pos>"
1021 use overload '""' => sub {
1022         my ($failure) = @_;
1023         return "$failure->{error}, detected at offset $failure->{pos}";
1024 };
1021
1022 1;
1023
1024 __END__
1025
1026 =head1 NAME
1027
1028 Text::Balanced - Extract delimited text sequences from strings.
1029
1030
1031 =head1 SYNOPSIS
1032
1033  use Text::Balanced qw (
1034                         extract_delimited
1035                         extract_bracketed
1036                         extract_quotelike
1037                         extract_codeblock
1038                         extract_variable
1039                         extract_tagged
1040                         extract_multiple
1041
1042                         gen_delimited_pat
1043                         gen_extract_tagged
1044                        );
1045
1046  # Extract the initial substring of $text that is delimited by
1047  # two (unescaped) instances of the first character in $delim.
1048
1049         ($extracted, $remainder) = extract_delimited($text,$delim);
1050
1051
1052  # Extract the initial substring of $text that is bracketed
1053  # with a delimiter(s) specified by $delim (where the string
1054  # in $delim contains one or more of '(){}[]<>').
1055
1056         ($extracted, $remainder) = extract_bracketed($text,$delim);
1057
1058
1059  # Extract the initial substring of $text that is bounded by
1060  # an HTML/XML tag.
1061
1062         ($extracted, $remainder) = extract_tagged($text);
1063
1064
1065  # Extract the initial substring of $text that is bounded by
1066  # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1067
1068         ($extracted, $remainder) =
1069                 extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]});
1070
1071
1072  # Extract the initial substring of $text that represents a
1073  # Perl "quote or quote-like operation"
1074
1075         ($extracted, $remainder) = extract_quotelike($text);
1076
1077
1078  # Extract the initial substring of $text that represents a block
1079  # of Perl code, bracketed by any of character(s) specified by $delim
1080  # (where the string $delim contains one or more of '(){}[]<>').
1081
1082         ($extracted, $remainder) = extract_codeblock($text,$delim);
1083
1084
1085  # Extract the initial substrings of $text that would be extracted by
1086  # one or more sequential applications of the specified functions
1087  # or regular expressions
1088
1089         @extracted = extract_multiple($text,
1090                                       [ \&extract_bracketed,
1091                                         \&extract_quotelike,
1092                                         \&some_other_extractor_sub,
1093                                         qr/[xyz]*/,
1094                                         'literal',
1095                                       ]);
1096
1097  # Create a string representing an optimized pattern (a la Friedl)
1098  # that matches a substring delimited by any of the specified characters
1099  # (in this case: any type of quote or a slash)
1100
1101         $patstring = gen_delimited_pat(q{'"`/});
1102
1103
1104  # Generate a reference to an anonymous sub that is just like extract_tagged
1105  # but pre-compiled and optimized for a specific pair of tags, and consequently
1106  # much faster (i.e. 3 times faster). It uses qr// for better performance on
1107  # repeated calls, so it only works under Perl 5.005 or later.
1108
1109         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1110
1111         ($extracted, $remainder) = $extract_head->($text);
1112
1113
1114 =head1 DESCRIPTION
1115
1116 The various C<extract_...> subroutines may be used to extract a
1117 delimited string (possibly after skipping a specified prefix string).
1118 The search for the string always begins at the current C<pos>
1119 location of the string's variable (or at index zero, if no C<pos>
1120 position is defined).
1121
1122 =head2 General behaviour in list contexts
1123
1124 In a list context, all the subroutines return a list, the first three
1125 elements of which are always:
1126
1127 =over 4
1128
1129 =item [0]
1130
1131 The extracted string, including the specified delimiters.
1132 If the extraction fails an empty string is returned.
1133
1134 =item [1]
1135
1136 The remainder of the input string (i.e. the characters after the
1137 extracted string). On failure, the entire string is returned.
1138
1139 =item [2]
1140
1141 The skipped prefix (i.e. the characters before the extracted string).
1142 On failure, the empty string is returned.
1143
1144 =back
1145
1146 Note that in a list context, the contents of the original input text (the first
1147 argument) are not modified in any way.
1148
1149 However, if the input text was passed in a variable, that variable's
1150 C<pos> value is updated to point at the first character after the
1151 extracted text. That means that in a list context the various
1152 subroutines can be used much like regular expressions. For example:
1153
1154         while ( $next = (extract_quotelike($text))[0] )
1155         {
1156                 # process next quote-like (in $next)
1157         }
1158
1159
1160 =head2 General behaviour in scalar and void contexts
1161
1162 In a scalar context, the extracted string is returned, having first been
1163 removed from the input text. Thus, the following code also processes
1164 each quote-like operation, but actually removes them from $text:
1165
1166         while ( $next = extract_quotelike($text) )
1167         {
1168                 # process next quote-like (in $next)
1169         }
1170
1171 Note that if the input text is a read-only string (i.e. a literal),
1172 no attempt is made to remove the extracted text.
1173
1174 In a void context the behaviour of the extraction subroutines is
1175 exactly the same as in a scalar context, except (of course) that the
1176 extracted substring is not returned.
1177
1178 =head2 A note about prefixes
1179
1180 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1181 This can bite you if you're expecting a prefix specification like
1182 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1183 pattern will only succeed if the <H1> tag is on the current line, since
1184 . normally doesn't match newlines.
1185
1186 To overcome this limitation, you need to turn on /s matching within
1187 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1188
1189
1190 =head2 C<extract_delimited>
1191
1192 The C<extract_delimited> function formalizes the common idiom
1193 of extracting a single-character-delimited substring from the start of
1194 a string. For example, to extract a single-quote delimited string, the
1195 following code is typically used:
1196
1197         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1198         $extracted = $1;
1199
1200 but with C<extract_delimited> it can be simplified to:
1201
1202         ($extracted,$remainder) = extract_delimited($text, "'");
1203
1204 C<extract_delimited> takes up to four scalars (the input text, the
1205 delimiters, a prefix pattern to be skipped, and any escape characters)
1206 and extracts the initial substring of the text that
1207 is appropriately delimited. If the delimiter string has multiple
1208 characters, the first one encountered in the text is taken to delimit
1209 the substring.
1210 The third argument specifies a prefix pattern that is to be skipped
1211 (but must be present!) before the substring is extracted.
1212 The final argument specifies the escape character to be used for each
1213 delimiter.
1214
1215 All arguments are optional. If the escape characters are not specified,
1216 every delimiter is escaped with a backslash (C<\>).
1217 If the prefix is not specified, the
1218 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1219 is also not specified, the set C</["'`]/> is used. If the text to be processed
1220 is not specified either, C<$_> is used.
1221
1222 In list context, C<extract_delimited> returns an array of three
1223 elements, the extracted substring (I<including the surrounding
1224 delimiters>), the remainder of the text, and the skipped prefix (if
1225 any). If a suitable delimited substring is not found, the first
1226 element of the array is the empty string, the second is the complete
1227 original text, and the prefix returned in the third element is an
1228 empty string.
1229
1230 In a scalar context, just the extracted substring is returned. In
1231 a void context, the extracted substring (and any prefix) are simply
1232 removed from the beginning of the first argument.
1233
1234 Examples:
1235
1236         # Remove a single-quoted substring from the very beginning of $text:
1237
1238                 $substring = extract_delimited($text, "'", '');
1239
1240         # Remove a single-quoted Pascalish substring (i.e. one in which
1241         # doubling the quote character escapes it) from the very
1242         # beginning of $text:
1243
1244                 $substring = extract_delimited($text, "'", '', "'");
1245
1246         # Extract a single- or double- quoted substring from the
1247         # beginning of $text, optionally after some whitespace
1248         # (note the list context to protect $text from modification):
1249
1250                 ($substring) = extract_delimited $text, q{"'};
1251
1252
1253         # Delete the substring delimited by the first '/' in $text:
1254
1255                 $text = join '', (extract_delimited($text,'/','[^/]*'))[2,1];
1256
1257 Note that this last example is I<not> the same as deleting the first
1258 quote-like pattern. For instance, if C<$text> contained the string:
1259
1260         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1261
1262 then after the deletion it would contain:
1263
1264         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1265
1266 not:
1267
1268         "if ('./cmd' =~ ms) { $cmd = $1; }"
1269
1270
1271 See L<"extract_quotelike"> for a (partial) solution to this problem.
1272
1273
1274 =head2 C<extract_bracketed>
1275
1276 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1277 up to three optional scalar arguments: a string to extract from, a delimiter
1278 specifier, and a prefix pattern. As before, a missing prefix defaults to
1279 optional whitespace and a missing text defaults to C<$_>. However, a missing
1280 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1281
1282 C<extract_bracketed> extracts a balanced-bracket-delimited
1283 substring (using any one (or more) of the user-specified delimiter
1284 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1285 respect quoted unbalanced brackets (see below).
1286
1287 A "delimiter bracket" is a bracket in the list of delimiters passed as
1288 C<extract_bracketed>'s second argument. Delimiter brackets are
1289 specified by giving either the left or right (or both!) versions
1290 of the required bracket(s). Note that the order in which
1291 two or more delimiter brackets are specified is not significant.
1292
1293 A "balanced-bracket-delimited substring" is a substring bounded by
1294 matched brackets, such that any other (left or right) delimiter
1295 bracket I<within> the substring is also matched by an opposite
1296 (right or left) delimiter bracket I<at the same level of nesting>. Any
1297 type of bracket not in the delimiter list is treated as an ordinary
1298 character.
1299
1300 In other words, each type of bracket specified as a delimiter must be
1301 balanced and correctly nested within the substring, and any other kind of
1302 ("non-delimiter") bracket in the substring is ignored.
1303
1304 For example, given the string:
1305
1306         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1307
1308 then a call to C<extract_bracketed> in a list context:
1309
1310         @result = extract_bracketed( $text, '{}' );
1311
1312 would return:
1313
1314         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1315
1316 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1317 (In a scalar context just the first element of the array would be returned. In
1318 a void context, C<$text> would be replaced by an empty string.)
1319
1320 Likewise the call in:
1321
1322         @result = extract_bracketed( $text, '{[' );
1323
1324 would return the same result, since all sets of both types of specified
1325 delimiter brackets are correctly nested and balanced.
1326
1327 However, the call in:
1328
1329         @result = extract_bracketed( $text, '{([<' );
1330
1331 would fail, returning:
1332
1333         ( "" , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" );
1334
1335 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1336 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1337 return C<undef>. In a void context, C<$text> would be unchanged.)
1338
1339 Note that the embedded single-quotes in the string don't help in this
1340 case, since they have not been specified as acceptable delimiters and are
1341 therefore treated as non-delimiter characters (and ignored).
1342
1343 However, if a particular species of quote character is included in the
1344 delimiter specification, then that type of quote will be correctly handled.
1345 For example, if C<$text> is:
1346
1347         $text = '<A HREF=">>>>">link</A>';
1348
1349 then
1350
1351         @result = extract_bracketed( $text, '<">' );
1352
1353 returns:
1354
1355         ( '<A HREF=">>>>">', 'link</A>', "" )
1356
1357 as expected. Without the specification of C<"> as an embedded quoter:
1358
1359         @result = extract_bracketed( $text, '<>' );
1360
1361 the result would be:
1362
1363         ( '<A HREF=">', '>>>">link</A>', "" )
1364
1365 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1366 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1367 letter 'q' as a delimiter. Hence:
1368
1369         @result = extract_bracketed( $text, '<q>' );
1370
1371 would correctly match something like this:
1372
1373         $text = '<leftop: conj /and/ conj>';
1374
1375 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1376
1377
1378 =head2 C<extract_tagged>
1379
1380 C<extract_tagged> extracts and segments text between (balanced)
1381 specified tags.
1382
1383 The subroutine takes up to five optional arguments:
1384
1385 =over 4
1386
1387 =item 1.
1388
1389 A string to be processed (C<$_> if the string is omitted or C<undef>)
1390
1391 =item 2.
1392
1393 A string specifying a pattern to be matched as the opening tag.
1394 If the pattern string is omitted (or C<undef>) then a pattern
1395 that matches any standard HTML/XML tag is used.
1396
1397 =item 3.
1398
1399 A string specifying a pattern to be matched at the closing tag.
1400 If the pattern string is omitted (or C<undef>) then the closing
1401 tag is constructed by inserting a C</> after any leading bracket
1402 characters in the actual opening tag that was matched (I<not> the pattern
1403 that matched the tag). For example, if the opening tag pattern
1404 is specified as C<'{{\w+}}'> and actually matched the opening tag
1405 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1406
1407 =item 4.
1408
1409 A string specifying a pattern to be matched as a prefix (which is to be
1410 skipped). If omitted, optional whitespace is skipped.
1411
1412 =item 5.
1413
1414 A hash reference containing various parsing options (see below)
1415
1416 =back
1417
1418 The various options that can be specified are:
1419
1420 =over 4
1421
1422 =item C<reject =E<gt> $listref>
1423
1424 The list reference contains one or more strings specifying patterns
1425 that must I<not> appear within the tagged text.
1426
1427 For example, to extract
1428 an HTML link (which should not contain nested links) use:
1429
1430         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1431
1432 =item C<ignore =E<gt> $listref>
1433
1434 The list reference contains one or more strings specifying patterns
1435 that are I<not> to be treated as nested tags within the tagged text
1436 (even if they would match the start tag pattern).
1437
1438 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1439
1440         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1441
1442 (also see L<"gen_delimited_pat"> below).
1443
1444
1445 =item C<fail =E<gt> $str>
1446
1447 The C<fail> option indicates the action to be taken if a matching end
1448 tag is not encountered (i.e. before the end of the string or some
1449 C<reject> pattern matches). By default, a failure to match a closing
1450 tag causes C<extract_tagged> to immediately fail.
1451
1452 However, if the string value associated with C<fail> is "MAX", then
1453 C<extract_tagged> returns the complete text up to the point of failure.
1454 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1455 after the tag (up to the first line that is either empty or contains
1456 only whitespace characters).
1457 If the string is "", the default behaviour (i.e. failure) is reinstated.
1458
1459 For example, suppose the start tag "/para" introduces a paragraph, which then
1460 continues until the next "/endpara" tag or until another "/para" tag is
1461 encountered:
1462
1463         $text = "/para line 1\n\nline 3\n/para line 4";
1464
1465         extract_tagged($text, '/para', '/endpara', undef,
1466                                 {reject => '/para', fail => 'MAX'} );
1467
1468         # EXTRACTED: "/para line 1\n\nline 3\n"
1469
1470 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1471 tag refers only to the immediately following paragraph:
1472
1473         $text = "/para line 1\n\nline 3\n/para line 4";
1474
1475         extract_tagged($text, '/para', '/endpara', undef,
1476                         {reject => '/para', fail => 'PARA'} );
1477
1478         # EXTRACTED: "/para line 1\n"
1479
1480 Note that the specified C<fail> behaviour applies to nested tags as well.
1481
1482 =back
1483
1484 On success in a list context, an array of 6 elements is returned. The elements are:
1485
1486 =over 4
1487
1488 =item [0]
1489
1490 the extracted tagged substring (including the outermost tags),
1491
1492 =item [1]
1493
1494 the remainder of the input text,
1495
1496 =item [2]
1497
1498 the prefix substring (if any),
1499
1500 =item [3]
1501
1502 the opening tag
1503
1504 =item [4]
1505
1506 the text between the opening and closing tags
1507
1508 =item [5]
1509
1510 the closing tag (or "" if no closing tag was found)
1511
1512 =back
1513
1514 On failure, all of these values (except the remaining text) are C<undef>.
1515
1516 In a scalar context, C<extract_tagged> returns just the complete
1517 substring that matched a tagged text (including the start and end
1518 tags). C<undef> is returned on failure. In addition, the original input
1519 text has the returned substring (and any prefix) removed from it.
1520
1521 In a void context, the input text just has the matched substring (and
1522 any specified prefix) removed.
1523
1524
1525 =head2 C<gen_extract_tagged>
1526
1527 (Note: This subroutine is only available under Perl 5.005 or later)
1528
1529 C<gen_extract_tagged> generates a new anonymous subroutine which
1530 extracts text between (balanced) specified tags. In other words,
1531 it generates a function identical in function to C<extract_tagged>.
1532
1533 The difference between C<extract_tagged> and the anonymous
1534 subroutines generated by
1535 C<gen_extract_tagged>, is that those generated subroutines:
1536
1537 =over 4
1538
1539 =item *
1540
1541 do not have to reparse tag specification or parsing options every time
1542 they are called (whereas C<extract_tagged> has to effectively rebuild
1543 its tag parser on every call);
1544
1545 =item *
1546
1547 make use of the new qr// construct to pre-compile the regexes they use
1548 (whereas C<extract_tagged> uses standard string variable interpolation
1549 to create tag-matching patterns).
1550
1551 =back
1552
1553 The subroutine takes up to four optional arguments (the same set as
1554 C<extract_tagged> except for the string to be processed). It returns
1555 a reference to a subroutine which in turn takes a single argument (the text to
1556 be extracted from).
1557
1558 In other words, the implementation of C<extract_tagged> is exactly
1559 equivalent to:
1560
1561         sub extract_tagged
1562         {
1563                 my $text = shift;
1564                 my $extractor = gen_extract_tagged(@_);
1565                 return $extractor->($text);
1566         }
1567
1568 (although C<extract_tagged> is not currently implemented that way, in order
1569 to preserve pre-5.005 compatibility).
1570
1571 Using C<gen_extract_tagged> to create extraction functions for specific tags
1572 is a good idea if those functions are going to be called more than once, since
1573 their performance is typically twice as good as the more general-purpose
1574 C<extract_tagged>.
1575
1576
1577 =head2 C<extract_quotelike>
1578
1579 C<extract_quotelike> attempts to recognize, extract, and segment any
1580 one of the various Perl quotes and quotelike operators (see
1581 L<perlop(3)>). Nested backslashed delimiters, embedded balanced bracket
1582 delimiters (for the quotelike operators), and trailing modifiers are
1583 all caught. For example, in:
1584
1585         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1586
1587         extract_quotelike '  "You said, \"Use sed\"."  '
1588
1589         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1590
1591         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1592
1593 the full Perl quotelike operations are all extracted correctly.
1594
1595 Note too that, when using the /x modifier on a regex, any comment
1596 containing the current pattern delimiter will cause the regex to be
1597 immediately terminated. In other words:
1598
1599         'm /
1600                 (?i)            # CASE INSENSITIVE
1601                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1602                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1603            /x'
1604
1605 will be extracted as if it were:
1606
1607         'm /
1608                 (?i)            # CASE INSENSITIVE
1609                 [a-z_]          # LEADING ALPHABETIC/'
1610
1611 This behaviour is identical to that of the actual compiler.
1612
1613 C<extract_quotelike> takes two arguments: the text to be processed and
1614 a prefix to be matched at the very beginning of the text. If no prefix
1615 is specified, optional whitespace is the default. If no text is given,
1616 C<$_> is used.
1617
1618 In a list context, an array of 11 elements is returned. The elements are:
1619
1620 =over 4
1621
1622 =item [0]
1623
1624 the extracted quotelike substring (including trailing modifiers),
1625
1626 =item [1]
1627
1628 the remainder of the input text,
1629
1630 =item [2]
1631
1632 the prefix substring (if any),
1633
1634 =item [3]
1635
1636 the name of the quotelike operator (if any),
1637
1638 =item [4]
1639
1640 the left delimiter of the first block of the operation,
1641
1642 =item [5]
1643
1644 the text of the first block of the operation
1645 (that is, the contents of
1646 a quote, the regex of a match or substitution or the target list of a
1647 translation),
1648
1649 =item [6]
1650
1651 the right delimiter of the first block of the operation,
1652
1653 =item [7]
1654
1655 the left delimiter of the second block of the operation
1656 (that is, if it is an C<s>, C<tr>, or C<y>),
1657
1658 =item [8]
1659
1660 the text of the second block of the operation
1661 (that is, the replacement of a substitution or the translation list
1662 of a translation),
1663
1664 =item [9]
1665
1666 the right delimiter of the second block of the operation (if any),
1667
1668 =item [10]
1669
1670 the trailing modifiers on the operation (if any).
1671
1672 =back
1673
1674 For each of the fields marked "(if any)" the default value on success is
1675 an empty string.
1676 On failure, all of these values (except the remaining text) are C<undef>.
1677
1678
1679 In a scalar context, C<extract_quotelike> returns just the complete substring
1680 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1681 void context, the input text has the same substring (and any specified
1682 prefix) removed.
1683
1684 Examples:
1685
1686         # Remove the first quotelike literal that appears in text
1687
1688                 $quotelike = extract_quotelike($text,'.*?');
1689
1690         # Replace one or more leading whitespace-separated quotelike
1691         # literals in $_ with "<QLL>"
1692
1693                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1694
1695
1696         # Isolate the search pattern in a quotelike operation from $text
1697
1698                 ($op,$pat) = (extract_quotelike $text)[3,5];
1699                 if ($op =~ /[ms]/)
1700                 {
1701                         print "search pattern: $pat\n";
1702                 }
1703                 else
1704                 {
1705                         print "$op is not a pattern matching operation\n";
1706                 }
1707
1708
1709 =head2 C<extract_quotelike> and "here documents"
1710
1711 C<extract_quotelike> can successfully extract "here documents" from an input
1712 string, but with an important caveat in list contexts.
1713
1714 Unlike other types of quote-like literals, a here document is rarely
1715 a contiguous substring. For example, a typical piece of code using
1716 a here document might look like this:
1717
1718         <<'EOMSG' || die;
1719         This is the message.
1720         EOMSG
1721         exit;
1722
1723 Given this as an input string in a scalar context, C<extract_quotelike>
1724 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1725 leaving the string " || die;\nexit;" in the original variable. In other words,
1726 the two separate pieces of the here document are successfully extracted and
1727 concatenated.
1728
1729 In a list context, C<extract_quotelike> would return the list
1730
1731 =over 4
1732
1733 =item [0]
1734
1735 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1736 including fore and aft delimiters),
1737
1738 =item [1]
1739
1740 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1741
1742 =item [2]
1743
1744 "" (i.e. the prefix substring -- trivial in this case),
1745
1746 =item [3]
1747
1748 "<<" (i.e. the "name" of the quotelike operator)
1749
1750 =item [4]
1751
1752 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1753
1754 =item [5]
1755
1756 "This is the message.\n" (i.e. the text of the here document),
1757
1758 =item [6]
1759
1760 "EOMSG" (i.e. the right delimiter of the here document),
1761
1762 =item [7..10]
1763
1764 "" (a here document has no second left delimiter, second text, second right
1765 delimiter, or trailing modifiers).
1766
1767 =back
1768
1769 However, the matching position of the input variable would be set to
1770 "exit;" (i.e. I<after> the closing delimiter of the here document),
1771 which would cause the earlier " || die;\nexit;" to be skipped in any
1772 sequence of code fragment extractions.
1773
1774 To avoid this problem, when it encounters a here document while
1775 extracting from a modifiable string, C<extract_quotelike> silently
1776 rearranges the string to an equivalent piece of Perl:
1777
1778         <<'EOMSG'
1779         This is the message.
1780         EOMSG
1781         || die;
1782         exit;
1783
1784 in which the here document I<is> contiguous. It still leaves the
1785 matching position after the here document, but now the rest of the line
1786 on which the here document starts is not skipped.
1787
1788 To prevent C<extract_quotelike> from mucking about with the input in this way
1789 (this is the only case where a list-context C<extract_quotelike> does so),
1790 you can pass the input variable as an interpolated literal:
1791
1792         $quotelike = extract_quotelike("$var");
1793
1794
1795 =head2 C<extract_codeblock>
1796
1797 C<extract_codeblock> attempts to recognize and extract a balanced
1798 bracket delimited substring that may contain unbalanced brackets
1799 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1800 is like a combination of C<"extract_bracketed"> and
1801 C<"extract_quotelike">.
1802
1803 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1804 a text to process, a set of delimiter brackets to look for, and a prefix to
1805 match first. It also takes an optional fourth parameter, which allows the
1806 outermost delimiter brackets to be specified separately (see below).
1807
1808 Omitting the first argument (input text) means process C<$_> instead.
1809 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1810 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1811 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1812 value of the second argument is to be used for the outermost delimiters.
1813
1814 Once the prefix and the outermost opening delimiter bracket have been
1815 recognized, code blocks are extracted by stepping through the input text and
1816 trying the following alternatives in sequence:
1817
1818 =over 4
1819
1820 =item 1.
1821
1822 Try and match a closing delimiter bracket. If the bracket was the same
1823 species as the last opening bracket, return the substring to that
1824 point. If the bracket was mismatched, return an error.
1825
1826 =item 2.
1827
1828 Try to match a quote or quotelike operator. If found, call
1829 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1830 the error it returned. Otherwise go back to step 1.
1831
1832 =item 3.
1833
1834 Try to match an opening delimiter bracket. If found, call
1835 C<extract_codeblock> recursively to eat the embedded block. If the
1836 recursive call fails, return an error. Otherwise, go back to step 1.
1837
1838 =item 4.
1839
1840 Unconditionally match a bareword or any other single character, and
1841 then go back to step 1.
1842
1843 =back
1844
1845
1846 Examples:
1847
1848         # Find a while loop in the text
1849
1850                 if ($text =~ s/.*?while\s*\{/{/)
1851                 {
1852                         $loop = "while " . extract_codeblock($text);
1853                 }
1854
1855         # Remove the first round-bracketed list (which may include
1856         # round- or curly-bracketed code blocks or quotelike operators)
1857
1858                 extract_codeblock $text, "(){}", '[^(]*';
1859
1860
1861 The ability to specify a different outermost delimiter bracket is useful
1862 in some circumstances. For example, in the Parse::RecDescent module,
1863 parser actions which are to be performed only on a successful parse
1864 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1865
1866         sentence: subject verb object
1867                         <defer: {$::theVerb = $item{verb}} >
1868
1869 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1870 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1871
1872 A deferred action like this:
1873
1874                         <defer: {if ($count>10) {$count--}} >
1875
1876 will be incorrectly parsed as:
1877
1878                         <defer: {if ($count>
1879
1880 because the "greater than" operator is interpreted as a closing delimiter.
1881
1882 But, by extracting the directive using
1883 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1884 the '>' character is only treated as a delimiter at the outermost
1885 level of the code block, so the directive is parsed correctly.
1886
1887 =head2 C<extract_multiple>
1888
1889 The C<extract_multiple> subroutine takes a string to be processed and a
1890 list of extractors (subroutines or regular expressions) to apply to that string.
1891
1892 In a list context, C<extract_multiple> returns a list of substrings
1893 of the original string, as extracted by the specified extractors.
1894 In a scalar context, C<extract_multiple> returns the first
1895 substring successfully extracted from the original string. In both
1896 scalar and void contexts the original string has the first successfully
1897 extracted substring removed from it. In all contexts
1898 C<extract_multiple> starts at the current C<pos> of the string, and
1899 sets that C<pos> appropriately after it matches.
1900
1901 Hence, the aim of a call to C<extract_multiple> in a list context
1902 is to split the processed string into as many non-overlapping fields as
1903 possible, by repeatedly applying each of the specified extractors
1904 to the remainder of the string. Thus C<extract_multiple> is
1905 a generalized form of Perl's C<split> subroutine.
1906
1907 The subroutine takes up to four optional arguments:
1908
1909 =over 4
1910
1911 =item 1.
1912
1913 A string to be processed (C<$_> if the string is omitted or C<undef>)
1914
1915 =item 2.
1916
1917 A reference to a list of subroutine references and/or qr// objects and/or
1918 literal strings and/or hash references, specifying the extractors
1919 to be used to split the string. If this argument is omitted (or
1920 C<undef>) the list:
1921
1922         [
1923                 sub { extract_variable($_[0], '') },
1924                 sub { extract_quotelike($_[0],'') },
1925                 sub { extract_codeblock($_[0],'{}','') },
1926         ]
1927
1928 is used.
1929
1930
1931 =item 3.
1932
1933 A number specifying the maximum number of fields to return. If this
1934 argument is omitted (or C<undef>), split continues as long as possible.
1935
1936 If the third argument is I<N>, then extraction continues until I<N> fields
1937 have been successfully extracted, or until the string has been completely
1938 processed.
1939
1940 Note that in scalar and void contexts the value of this argument is
1941 automatically reset to 1 (under C<-w>, a warning is issued if the argument
1942 has to be reset).
1943
1944 =item 4.
1945
1946 A value indicating whether unmatched substrings (see below) within the
1947 text should be skipped or returned as fields. If the value is true,
1948 such substrings are skipped. Otherwise, they are returned.
1949
1950 =back
1951
1952 The extraction process works by applying each extractor in
1953 sequence to the text string.
1954
1955 If the extractor is a subroutine it is called in a list context and is
1956 expected to return a list of a single element, namely the extracted
1957 text. It may optionally also return two further arguments: a string
1958 representing the text left after extraction (like $' for a pattern
1959 match), and a string representing any prefix skipped before the
1960 extraction (like $` in a pattern match). Note that this is designed
1961 to facilitate the use of other Text::Balanced subroutines with
1962 C<extract_multiple>. Note too that the value returned by an extractor
1963 subroutine need not bear any relationship to the corresponding substring
1964 of the original text (see examples below).
1965
1966 If the extractor is a precompiled regular expression or a string,
1967 it is matched against the text in a scalar context with a leading
1968 '\G' and the gc modifiers enabled. The extracted value is either
1969 $1 if that variable is defined after the match, or else the
1970 complete match (i.e. $&).
1971
1972 If the extractor is a hash reference, it must contain exactly one element.
1973 The value of that element is one of the
1974 above extractor types (subroutine reference, regular expression, or string).
1975 The key of that element is the name of a class into which the successful
1976 return value of the extractor will be blessed.
1977
1978 If an extractor returns a defined value, that value is immediately
1979 treated as the next extracted field and pushed onto the list of fields.
1980 If the extractor was specified in a hash reference, the field is also
1981 blessed into the appropriate class.
1982
1983 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
1984 assumed to have failed to extract.
1985 If none of the extractor subroutines succeeds, then one
1986 character is extracted from the start of the text and the extraction
1987 subroutines reapplied. Characters which are thus removed are accumulated and
1988 eventually become the next field (unless the fourth argument is true, in which
1989 case they are discarded).
1990
1991 For example, the following extracts substrings that are valid Perl variables:
1992
1993         @fields = extract_multiple($text,
1994                                    [ sub { extract_variable($_[0]) } ],
1995                                    undef, 1);
1996
1997 This example separates a text into fields which are quote delimited,
1998 curly bracketed, and anything else. The delimited and bracketed
1999 parts are also blessed to identify them (the "anything else" is unblessed):
2000
2001         @fields = extract_multiple($text,
2002                    [
2003                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
2004                         { Brack => sub { extract_bracketed($_[0],'{}') } },
2005                    ]);
2006
2007 This call extracts the next single substring that is a valid Perl quotelike
2008 operator (and removes it from $text):
2009
2010         $quotelike = extract_multiple($text,
2011                                       [
2012                                         sub { extract_quotelike($_[0]) },
2013                                       ], undef, 1);
2014
2015 Finally, here is yet another way to do comma-separated value parsing:
2016
2017         @fields = extract_multiple($csv_text,
2018                                   [
2019                                         sub { extract_delimited($_[0],q{'"}) },
2020                                         qr/([^,]+)(.*)/,
2021                                   ],
2022                                   undef,1);
2023
2024 The list in the second argument means:
2025 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2026 The undef third argument means:
2027 I<"...as many times as possible...">,
2028 and the true value in the fourth argument means
2029 I<"...discarding anything else that appears (i.e. the commas)">.
2030
2031 If you wanted the commas preserved as separate fields (i.e. like split
2032 does if your split pattern has capturing parentheses), you would
2033 just make the last parameter undefined (or remove it).
2034
2035
2036 =head2 C<gen_delimited_pat>
2037
2038 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2039 builds a Friedl-style optimized regex that matches a string delimited
2040 by any one of the characters in the single argument. For example:
2041
2042         gen_delimited_pat(q{'"})
2043
2044 returns the regex:
2045
2046         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2047
2048 Note that the specified delimiters are automatically quotemeta'd.
2049
2050 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2051 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2052 (which might contain quoted strings):
2053
2054         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2055
2056         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2057
2058
2059 C<gen_delimited_pat> may also be called with an optional second argument,
2060 which specifies the "escape" character(s) to be used for each delimiter.
2061 For example to match a Pascal-style string (where ' is the delimiter
2062 and '' is a literal ' within the string):
2063
2064         gen_delimited_pat(q{'},q{'});
2065
2066 Different escape characters can be specified for different delimiters.
2067 For example, to specify that '/' is the escape for single quotes
2068 and '%' is the escape for double quotes:
2069
2070         gen_delimited_pat(q{'"},q{/%});
2071
2072 If more delimiters than escape chars are specified, the last escape char
2073 is used for the remaining delimiters.
2074 If no escape char is specified for a given specified delimiter, '\' is used.
2075
2076 Note that
2077 C<gen_delimited_pat> was previously called
2078 C<delimited_pat>. That name may still be used, but is now deprecated.
2079
2080
2081 =head1 DIAGNOSTICS
2082
2083 In a list context, all the functions return C<(undef,$original_text)>
2084 on failure. In a scalar context, failure is indicated by returning C<undef>
2085 (in this case the input text is not modified in any way).
2086
2087 In addition, on failure in I<any> context, the C<$@> variable is set.
2088 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2089 below.
2090 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2091 which the error was detected (although not necessarily where it occurred!)
2092 Printing C<$@> directly produces the error message, with the offset appended.
2093 On success, the C<$@> variable is guaranteed to be C<undef>.
2094
2095 The available diagnostics are:
2096
2097 =over 4
2098
2099 =item  C<Did not find a suitable bracket: "%s">
2100
2101 The delimiter provided to C<extract_bracketed> was not one of
2102 C<'()[]E<lt>E<gt>{}'>.
2103
2104 =item  C<Did not find prefix: /%s/>
2105
2106 A non-optional prefix was specified but wasn't found at the start of the text.
2107
2108 =item  C<Did not find opening bracket after prefix: "%s">
2109
2110 C<extract_bracketed> or C<extract_codeblock> was expecting a
2111 particular kind of bracket at the start of the text, and didn't find it.
2112
2113 =item  C<No quotelike operator found after prefix: "%s">
2114
2115 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2116 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2117 it was extracting.
2118
2119 =item  C<Unmatched closing bracket: "%c">
2120
2121 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2122 a closing bracket where none was expected.
2123
2124 =item  C<Unmatched opening bracket(s): "%s">
2125
2126 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2127 out of characters in the text before closing one or more levels of nested
2128 brackets.
2129
2130 =item C<Unmatched embedded quote (%s)>
2131
2132 C<extract_bracketed> attempted to match an embedded quoted substring, but
2133 failed to find a closing quote to match it.
2134
2135 =item C<Did not find closing delimiter to match '%s'>
2136
2137 C<extract_quotelike> was unable to find a closing delimiter to match the
2138 one that opened the quote-like operation.
2139
2140 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2141
2142 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2143 a valid bracket delimiter, but it was the wrong species. This usually
2144 indicates a nesting error, but may indicate incorrect quoting or escaping.
2145
2146 =item  C<No block delimiter found after quotelike "%s">
2147
2148 C<extract_quotelike> or C<extract_codeblock> found one of the
2149 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2150 without a suitable block after it.
2151
2152 =item C<Did not find leading dereferencer>
2153
2154 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2155 a variable, but didn't find any of them.
2156
2157 =item C<Bad identifier after dereferencer>
2158
2159 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2160 character was not followed by a legal Perl identifier.
2161
2162 =item C<Did not find expected opening bracket at %s>
2163
2164 C<extract_codeblock> failed to find any of the outermost opening brackets
2165 that were specified.
2166
2167 =item C<Improperly nested codeblock at %s>
2168
2169 A nested code block was found that started with a delimiter that was specified
2170 as being only to be used as an outermost bracket.
2171
2172 =item  C<Missing second block for quotelike "%s">
2173
2174 C<extract_codeblock> or C<extract_quotelike> found one of the
2175 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2176
2177 =item C<No match found for opening bracket>
2178
2179 C<extract_codeblock> failed to find a closing bracket to match the outermost
2180 opening bracket.
2181
2182 =item C<Did not find opening tag: /%s/>
2183
2184 C<extract_tagged> did not find a suitable opening tag (after any specified
2185 prefix was removed).
2186
2187 =item C<Unable to construct closing tag to match: /%s/>
2188
2189 C<extract_tagged> matched the specified opening tag and tried to
2190 modify the matched text to produce a matching closing tag (because
2191 none was specified). It failed to generate the closing tag, almost
2192 certainly because the opening tag did not start with a
2193 bracket of some kind.
2194
2195 =item C<Found invalid nested tag: %s>
2196
2197 C<extract_tagged> found a nested tag that appeared in the "reject" list
2198 (and the failure mode was not "MAX" or "PARA").
2199
2200 =item C<Found unbalanced nested tag: %s>
2201
2202 C<extract_tagged> found a nested opening tag that was not matched by a
2203 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2204
2205 =item C<Did not find closing tag>
2206
2207 C<extract_tagged> reached the end of the text without finding a closing tag
2208 to match the original opening tag (and the failure mode was not
2209 "MAX" or "PARA").
2210
2211
2212
2213
2214 =back
2215
2216
2217 =head1 AUTHOR
2218
2219 Damian Conway (damian@conway.org)
2220
2221
2222 =head1 BUGS AND IRRITATIONS
2223
2224 There are undoubtedly serious bugs lurking somewhere in this code, if
2225 only because parts of it give the impression of understanding a great deal
2226 more about Perl than they really do.
2227
2228 Bug reports and other feedback are most welcome.
2229
2230
2231 =head1 COPYRIGHT
2232
2233  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2234  This module is free software. It may be used, redistributed
2235      and/or modified under the same terms as Perl itself.