lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use SelfLoader;
  11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  12
  13 use version; $VERSION = qv('1.99.1');
  14 @ISA            = qw ( Exporter );
  15
  16 %EXPORT_TAGS    = ( ALL => [ qw(
  17                                 &extract_delimited
  18                                 &extract_bracketed
  19                                 &extract_quotelike
  20                                 &extract_codeblock
  21                                 &extract_variable
  22                                 &extract_tagged
  23                                 &extract_multiple
  24
  25                                 &gen_delimited_pat
  26                                 &gen_extract_tagged
  27
  28                                 &delimited_pat
  29                                ) ] );
  30
  31 Exporter::export_ok_tags('ALL');
  32
  33 # PROTOTYPES
  34
  35 sub _match_bracketed($$$$$$);
  36 sub _match_variable($$);
  37 sub _match_codeblock($$$$$$$);
  38 sub _match_quotelike($$$$);
  39
  40 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  41
  42 sub _failmsg {
  43         my ($message, $pos) = @_;
  44         $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
  45 }
  46
  47 sub _fail
  48 {
  49         my ($wantarray, $textref, $message, $pos) = @_;
  50         _failmsg $message, $pos if $message;
  51         return (undef,$$textref,undef) if $wantarray;
  52         return undef;
  53 }
  54
  55 sub _succeed
  56 {
  57         $@ = undef;
  58         my ($wantarray,$textref) = splice @_, 0, 2;
  59         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  60         my ($startlen, $oppos) = @_[5,6];
  61         my $remainderpos = $_[2];
  62         if ($wantarray)
  63         {
  64                 my @res;
  65                 while (my ($from, $len) = splice @_, 0, 2)
  66                 {
  67                         push @res, substr($$textref,$from,$len);
  68                 }
  69                 if ($extralen) {        # CORRECT FILLET
  70                         my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
  71                         $res[1] = "$extra$res[1]";
  72                         eval { substr($$textref,$remainderpos,0) = $extra;
  73                                substr($$textref,$extrapos,$extralen,"\n")} ;
  74                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  75                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  76                 }
  77                 else {
  78                         pos($$textref) = $remainderpos;             # RESET \G
  79                 }
  80                 return @res;
  81         }
  82         else
  83         {
  84                 my $match = substr($$textref,$_[0],$_[1]);
  85                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  86                 my $extra = $extralen
  87                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
  88                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  89                 pos($$textref) = $_[4];                         # RESET \G
  90                 return $match;
  91         }
  92 }
  93
  94 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
  95
  96 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
  97 {
  98         my ($dels, $escs) = @_;
  99         return "" unless $dels =~ /\S/;
 100         $escs = '\\' unless $escs;
 101         $escs .= substr($escs,-1) x (length($dels)-length($escs));
 102         my @pat = ();
 103         my $i;
 104         for ($i=0; $i<length $dels; $i++)
 105         {
 106                 my $del = quotemeta substr($dels,$i,1);
 107                 my $esc = quotemeta substr($escs,$i,1);
 108                 if ($del eq $esc)
 109                 {
 110                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 111                 }
 112                 else
 113                 {
 114                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 115                 }
 116         }
 117         my $pat = join '|', @pat;
 118         return "(?:$pat)";
 119 }
 120
 121 *delimited_pat = \&gen_delimited_pat;
 122
 123
 124 # THE EXTRACTION FUNCTIONS
 125
 126 sub extract_delimited (;$$$$)
 127 {
 128         my $textref = defined $_[0] ? \$_[0] : \$_;
 129         my $wantarray = wantarray;
 130         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 131         my $pre  = defined $_[2] ? $_[2] : '\s*';
 132         my $esc  = defined $_[3] ? $_[3] : qq{\\};
 133         my $pat = gen_delimited_pat($del, $esc);
 134         my $startpos = pos $$textref || 0;
 135         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 136                 unless $$textref =~ m/\G($pre)($pat)/gc;
 137         my $prelen = length($1);
 138         my $matchpos = $startpos+$prelen;
 139         my $endpos = pos $$textref;
 140         return _succeed $wantarray, $textref,
 141                         $matchpos, $endpos-$matchpos,           # MATCH
 142                         $endpos,   length($$textref)-$endpos,   # REMAINDER
 143                         $startpos, $prelen;                     # PREFIX
 144 }
 145
 146 sub extract_bracketed (;$$$)
 147 {
 148         my $textref = defined $_[0] ? \$_[0] : \$_;
 149         my $ldel = defined $_[1] ? $_[1] : '{([<';
 150         my $pre  = defined $_[2] ? $_[2] : '\s*';
 151         my $wantarray = wantarray;
 152         my $qdel = "";
 153         my $quotelike;
 154         $ldel =~ s/'//g and $qdel .= q{'};
 155         $ldel =~ s/"//g and $qdel .= q{"};
 156         $ldel =~ s/`//g and $qdel .= q{`};
 157         $ldel =~ s/q//g and $quotelike = 1;
 158         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 159         my $rdel = $ldel;
 160         unless ($rdel =~ tr/[({</])}>/)
 161         {
 162                 return _fail $wantarray, $textref,
 163                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 164                              0;
 165         }
 166         my $posbug = pos;
 167         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 168         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 169         pos = $posbug;
 170
 171         my $startpos = pos $$textref || 0;
 172         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 173
 174         return _fail ($wantarray, $textref) unless @match;
 175
 176         return _succeed ( $wantarray, $textref,
 177                           $match[2], $match[5]+2,       # MATCH
 178                           @match[8,9],                  # REMAINDER
 179                           @match[0,1],                  # PREFIX
 180                         );
 181 }
 182
 183 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 184 {
 185         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
 186         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 187         unless ($$textref =~ m/\G$pre/gc)
 188         {
 189                 _failmsg "Did not find prefix: /$pre/", $startpos;
 190                 return;
 191         }
 192
 193         $ldelpos = pos $$textref;
 194
 195         unless ($$textref =~ m/\G($ldel)/gc)
 196         {
 197                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 198                          pos $$textref;
 199                 pos $$textref = $startpos;
 200                 return;
 201         }
 202
 203         my @nesting = ( $1 );
 204         my $textlen = length $$textref;
 205         while (pos $$textref < $textlen)
 206         {
 207                 next if $$textref =~ m/\G\\./gcs;
 208
 209                 if ($$textref =~ m/\G($ldel)/gc)
 210                 {
 211                         push @nesting, $1;
 212                 }
 213                 elsif ($$textref =~ m/\G($rdel)/gc)
 214                 {
 215                         my ($found, $brackettype) = ($1, $1);
 216                         if ($#nesting < 0)
 217                         {
 218                                 _failmsg "Unmatched closing bracket: \"$found\"",
 219                                          pos $$textref;
 220                                 pos $$textref = $startpos;
 221                                 return;
 222                         }
 223                         my $expected = pop(@nesting);
 224                         $expected =~ tr/({[</)}]>/;
 225                         if ($expected ne $brackettype)
 226                         {
 227                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 228                                          pos $$textref;
 229                                 pos $$textref = $startpos;
 230                                 return;
 231                         }
 232                         last if $#nesting < 0;
 233                 }
 234                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 235                 {
 236                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 237                         _failmsg "Unmatched embedded quote ($1)",
 238                                  pos $$textref;
 239                         pos $$textref = $startpos;
 240                         return;
 241                 }
 242                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
 243                 {
 244                         next;
 245                 }
 246
 247                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 248         }
 249         if ($#nesting>=0)
 250         {
 251                 _failmsg "Unmatched opening bracket(s): "
 252                                 . join("..",@nesting)."..",
 253                          pos $$textref;
 254                 pos $$textref = $startpos;
 255                 return;
 256         }
 257
 258         $endpos = pos $$textref;
 259
 260         return (
 261                 $startpos,  $ldelpos-$startpos,         # PREFIX
 262                 $ldelpos,   1,                          # OPENING BRACKET
 263                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
 264                 $endpos-1,  1,                          # CLOSING BRACKET
 265                 $endpos,    length($$textref)-$endpos,  # REMAINDER
 266                );
 267 }
 268
 269 sub _revbracket($)
 270 {
 271         my $brack = reverse $_[0];
 272         $brack =~ tr/[({</])}>/;
 273         return $brack;
 274 }
 275
 276 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 277
 278 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 279 {
 280         my $textref = defined $_[0] ? \$_[0] : \$_;
 281         my $ldel    = $_[1];
 282         my $rdel    = $_[2];
 283         my $pre     = defined $_[3] ? $_[3] : '\s*';
 284         my %options = defined $_[4] ? %{$_[4]} : ();
 285         my $omode   = defined $options{fail} ? $options{fail} : '';
 286         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 287                     : defined($options{reject})        ? $options{reject}
 288                     :                                    ''
 289                     ;
 290         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 291                     : defined($options{ignore})        ? $options{ignore}
 292                     :                                    ''
 293                     ;
 294
 295         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 296         $@ = undef;
 297
 298         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 299
 300         return _fail(wantarray, $textref) unless @match;
 301         return _succeed wantarray, $textref,
 302                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 303                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 304 }
 305
 306 sub _match_tagged       # ($$$$$$$)
 307 {
 308         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 309         my $rdelspec;
 310
 311         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 312
 313         unless ($$textref =~ m/\G($pre)/gc)
 314         {
 315                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 316                 goto failed;
 317         }
 318
 319         $opentagpos = pos($$textref);
 320
 321         unless ($$textref =~ m/\G$ldel/gc)
 322         {
 323                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 324                 goto failed;
 325         }
 326
 327         $textpos = pos($$textref);
 328
 329         if (!defined $rdel)
 330         {
 331                 $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
 332                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
 333                 {
 334                         _failmsg "Unable to construct closing tag to match: $rdel",
 335                                  pos $$textref;
 336                         goto failed;
 337                 }
 338         }
 339         else
 340         {
 341                 $rdelspec = eval "qq{$rdel}" || do {
 342                         my $del;
 343                         for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
 344                                 { next if $rdel =~ /\Q$_/; $del = $_; last }
 345                         unless ($del) {
 346                                 use Carp;
 347                                 croak "Can't interpolate right delimiter $rdel"
 348                         }
 349                         eval "qq$del$rdel$del";
 350                 };
 351         }
 352
 353         while (pos($$textref) < length($$textref))
 354         {
 355                 next if $$textref =~ m/\G\\./gc;
 356
 357                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 358                 {
 359                         $parapos = pos($$textref) - length($1)
 360                                 unless defined $parapos;
 361                 }
 362                 elsif ($$textref =~ m/\G($rdelspec)/gc )
 363                 {
 364                         $closetagpos = pos($$textref)-length($1);
 365                         goto matched;
 366                 }
 367                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 368                 {
 369                         next;
 370                 }
 371                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 372                 {
 373                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
 374                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 375                         _failmsg "Found invalid nested tag: $1", pos $$textref;
 376                         goto failed;
 377                 }
 378                 elsif ($$textref =~ m/\G($ldel)/gc)
 379                 {
 380                         my $tag = $1;
 381                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
 382                         unless (_match_tagged(@_))      # MATCH NESTED TAG
 383                         {
 384                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
 385                                 _failmsg "Found unbalanced nested tag: $tag",
 386                                          pos $$textref;
 387                                 goto failed;
 388                         }
 389                 }
 390                 else { $$textref =~ m/./gcs }
 391         }
 392
 393 short:
 394         $closetagpos = pos($$textref);
 395         goto matched if $omode eq 'MAX';
 396         goto failed unless $omode eq 'PARA';
 397
 398         if (defined $parapos) { pos($$textref) = $parapos }
 399         else                  { $parapos = pos($$textref) }
 400
 401         return (
 402                 $startpos,    $opentagpos-$startpos,            # PREFIX
 403                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 404                 $textpos,     $parapos-$textpos,                # TEXT
 405                 $parapos,     0,                                # NO CLOSING TAG
 406                 $parapos,     length($$textref)-$parapos,       # REMAINDER
 407                );
 408
 409 matched:
 410         $endpos = pos($$textref);
 411         return (
 412                 $startpos,    $opentagpos-$startpos,            # PREFIX
 413                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 414                 $textpos,     $closetagpos-$textpos,            # TEXT
 415                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
 416                 $endpos,      length($$textref)-$endpos,        # REMAINDER
 417                );
 418
 419 failed:
 420         _failmsg "Did not find closing tag", pos $$textref unless $@;
 421         pos($$textref) = $startpos;
 422         return;
 423 }
 424
 425 sub extract_variable (;$$)
 426 {
 427         my $textref = defined $_[0] ? \$_[0] : \$_;
 428         return ("","","") unless defined $$textref;
 429         my $pre  = defined $_[1] ? $_[1] : '\s*';
 430
 431         my @match = _match_variable($textref,$pre);
 432
 433         return _fail wantarray, $textref unless @match;
 434
 435         return _succeed wantarray, $textref,
 436                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
 437 }
 438
 439 sub _match_variable($$)
 440 {
 441 #  $#
 442 #  $^
 443 #  $$
 444         my ($textref, $pre) = @_;
 445         my $startpos = pos($$textref) = pos($$textref)||0;
 446         unless ($$textref =~ m/\G($pre)/gc)
 447         {
 448                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 449                 return;
 450         }
 451         my $varpos = pos($$textref);
 452         unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
 453         {
 454             unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
 455             {
 456                 _failmsg "Did not find leading dereferencer", pos $$textref;
 457                 pos $$textref = $startpos;
 458                 return;
 459             }
 460             my $deref = $1;
 461
 462             unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 463                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
 464                 or $deref eq '$#' or $deref eq '$$' )
 465             {
 466                 _failmsg "Bad identifier after dereferencer", pos $$textref;
 467                 pos $$textref = $startpos;
 468                 return;
 469             }
 470         }
 471
 472         while (1)
 473         {
 474                 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
 475                 next if _match_codeblock($textref,
 476                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 477                                          qr/[({[]/, qr/[)}\]]/,
 478                                          qr/[({[]/, qr/[)}\]]/, 0);
 479                 next if _match_codeblock($textref,
 480                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
 481                                          qr/[{[]/, qr/[}\]]/, 0);
 482                 next if _match_variable($textref,'\s*->\s*');
 483                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 484                 last;
 485         }
 486
 487         my $endpos = pos($$textref);
 488         return ($startpos, $varpos-$startpos,
 489                 $varpos,   $endpos-$varpos,
 490                 $endpos,   length($$textref)-$endpos
 491                 );
 492 }
 493
 494 sub extract_codeblock (;$$$$$)
 495 {
 496         my $textref = defined $_[0] ? \$_[0] : \$_;
 497         my $wantarray = wantarray;
 498         my $ldel_inner = defined $_[1] ? $_[1] : '{';
 499         my $pre        = defined $_[2] ? $_[2] : '\s*';
 500         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
 501         my $rd         = $_[4];
 502         my $rdel_inner = $ldel_inner;
 503         my $rdel_outer = $ldel_outer;
 504         my $posbug = pos;
 505         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
 506         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
 507         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
 508         {
 509                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
 510         }
 511         pos = $posbug;
 512
 513         my @match = _match_codeblock($textref, $pre,
 514                                      $ldel_outer, $rdel_outer,
 515                                      $ldel_inner, $rdel_inner,
 516                                      $rd);
 517         return _fail($wantarray, $textref) unless @match;
 518         return _succeed($wantarray, $textref,
 519                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
 520                        );
 521
 522 }
 523
 524 sub _match_codeblock($$$$$$$)
 525 {
 526         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 527         my $startpos = pos($$textref) = pos($$textref) || 0;
 528         unless ($$textref =~ m/\G($pre)/gc)
 529         {
 530                 _failmsg qq{Did not match prefix /$pre/ at"} .
 531                             substr($$textref,pos($$textref),20) .
 532                             q{..."},
 533                          pos $$textref;
 534                 return;
 535         }
 536         my $codepos = pos($$textref);
 537         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
 538         {
 539                 _failmsg qq{Did not find expected opening bracket at "} .
 540                              substr($$textref,pos($$textref),20) .
 541                              q{..."},
 542                          pos $$textref;
 543                 pos $$textref = $startpos;
 544                 return;
 545         }
 546         my $closing = $1;
 547            $closing =~ tr/([<{/)]>}/;
 548         my $matched;
 549         my $patvalid = 1;
 550         while (pos($$textref) < length($$textref))
 551         {
 552                 $matched = '';
 553                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 554                 {
 555                         $patvalid = 0;
 556                         next;
 557                 }
 558
 559                 if ($$textref =~ m/\G\s*#.*/gc)
 560                 {
 561                         next;
 562                 }
 563
 564                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 565                 {
 566                         unless ($matched = ($closing && $1 eq $closing) )
 567                         {
 568                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
 569                                 _failmsg q{Mismatched closing bracket at "} .
 570                                              substr($$textref,pos($$textref),20) .
 571                                              qq{...". Expected '$closing'},
 572                                          pos $$textref;
 573                         }
 574                         last;
 575                 }
 576
 577                 if (_match_variable($textref,'\s*') ||
 578                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 579                 {
 580                         $patvalid = 0;
 581                         next;
 582                 }
 583
 584
 585                 # NEED TO COVER MANY MORE CASES HERE!!!
 586                 if ($$textref =~ m#\G\s*(?!$ldel_inner)
 587                                         ( [-+*x/%^&|.]=?
 588                                         | [!=]~
 589                                         | =(?!>)
 590                                         | (\*\*|&&|\|\||<<|>>)=?
 591                                         | split|grep|map|return
 592                                         | [([]
 593                                         )#gcx)
 594                 {
 595                         $patvalid = 1;
 596                         next;
 597                 }
 598
 599                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 600                 {
 601                         $patvalid = 1;
 602                         next;
 603                 }
 604
 605                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
 606                 {
 607                         _failmsg q{Improperly nested codeblock at "} .
 608                                      substr($$textref,pos($$textref),20) .
 609                                      q{..."},
 610                                  pos $$textref;
 611                         last;
 612                 }
 613
 614                 $patvalid = 0;
 615                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 616         }
 617         continue { $@ = undef }
 618
 619         unless ($matched)
 620         {
 621                 _failmsg 'No match found for opening bracket', pos $$textref
 622                         unless $@;
 623                 return;
 624         }
 625
 626         my $endpos = pos($$textref);
 627         return ( $startpos, $codepos-$startpos,
 628                  $codepos, $endpos-$codepos,
 629                  $endpos,  length($$textref)-$endpos,
 630                );
 631 }
 632
 633
 634 my %mods   = (
 635                 'none'  => '[cgimsox]*',
 636                 'm'     => '[cgimsox]*',
 637                 's'     => '[cegimsox]*',
 638                 'tr'    => '[cds]*',
 639                 'y'     => '[cds]*',
 640                 'qq'    => '',
 641                 'qx'    => '',
 642                 'qw'    => '',
 643                 'qr'    => '[imsx]*',
 644                 'q'     => '',
 645              );
 646
 647 sub extract_quotelike (;$$)
 648 {
 649         my $textref = $_[0] ? \$_[0] : \$_;
 650         my $wantarray = wantarray;
 651         my $pre  = defined $_[1] ? $_[1] : '\s*';
 652
 653         my @match = _match_quotelike($textref,$pre,1,0);
 654         return _fail($wantarray, $textref) unless @match;
 655         return _succeed($wantarray, $textref,
 656                         $match[2], $match[18]-$match[2],        # MATCH
 657                         @match[18,19],                          # REMAINDER
 658                         @match[0,1],                            # PREFIX
 659                         @match[2..17],                          # THE BITS
 660                         @match[20,21],                          # ANY FILLET?
 661                        );
 662 };
 663
 664 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 665 {
 666         my ($textref, $pre, $rawmatch, $qmark) = @_;
 667
 668         my ($textlen,$startpos,
 669             $oppos,
 670             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 671             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 672             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 673
 674         unless ($$textref =~ m/\G($pre)/gc)
 675         {
 676                 _failmsg qq{Did not find prefix /$pre/ at "} .
 677                              substr($$textref, pos($$textref), 20) .
 678                              q{..."},
 679                          pos $$textref;
 680                 return;
 681         }
 682         $oppos = pos($$textref);
 683
 684         my $initial = substr($$textref,$oppos,1);
 685
 686         if ($initial && $initial =~ m|^[\"\'\`]|
 687                      || $rawmatch && $initial =~ m|^/|
 688                      || $qmark && $initial =~ m|^\?|)
 689         {
 690                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 691                 {
 692                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 693                                      substr($$textref, $oppos, 20) .
 694                                      q{..."},
 695                                  pos $$textref;
 696                         pos $$textref = $startpos;
 697                         return;
 698                 }
 699                 $modpos= pos($$textref);
 700                 $rd1pos = $modpos-1;
 701
 702                 if ($initial eq '/' || $initial eq '?')
 703                 {
 704                         $$textref =~ m/\G$mods{none}/gc
 705                 }
 706
 707                 my $endpos = pos($$textref);
 708                 return (
 709                         $startpos,      $oppos-$startpos,       # PREFIX
 710                         $oppos,         0,                      # NO OPERATOR
 711                         $oppos,         1,                      # LEFT DEL
 712                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 713                         $rd1pos,        1,                      # RIGHT DEL
 714                         $modpos,        0,                      # NO 2ND LDEL
 715                         $modpos,        0,                      # NO 2ND STR
 716                         $modpos,        0,                      # NO 2ND RDEL
 717                         $modpos,        $endpos-$modpos,        # MODIFIERS
 718                         $endpos,        $textlen-$endpos,       # REMAINDER
 719                        );
 720         }
 721
 722         unless ($$textref =~
 723     m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<(?=\s*["'A-Za-z_]))}gc)
 724         {
 725                 _failmsg q{No quotelike operator found after prefix at "} .
 726                              substr($$textref, pos($$textref), 20) .
 727                              q{..."},
 728                          pos $$textref;
 729                 pos $$textref = $startpos;
 730                 return;
 731         }
 732
 733         my $op = $1;
 734         $preld1pos = pos($$textref);
 735         if ($op eq '<<') {
 736                 $ld1pos = pos($$textref);
 737                 my $label;
 738                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 739                         $label = $1;
 740                 }
 741                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 742                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 743                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 744                                      }gcsx) {
 745                         $label = $+;
 746                 }
 747                 else {
 748                         $label = "";
 749                 }
 750                 my $extrapos = pos($$textref);
 751                 $$textref =~ m{.*\n}gc;
 752                 $str1pos = pos($$textref)--;
 753                 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
 754                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 755                                      substr($$textref, $startpos, 20) .
 756                                      q{..."},
 757                                  pos $$textref;
 758                         pos $$textref = $startpos;
 759                         return;
 760                 }
 761                 $rd1pos = pos($$textref);
 762         $$textref =~ m{\Q$label\E\n}gc;
 763                 $ld2pos = pos($$textref);
 764                 return (
 765                         $startpos,      $oppos-$startpos,       # PREFIX
 766                         $oppos,         length($op),            # OPERATOR
 767                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 768                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 769                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 770                         $ld2pos,        0,                      # NO 2ND LDEL
 771                         $ld2pos,        0,                      # NO 2ND STR
 772                         $ld2pos,        0,                      # NO 2ND RDEL
 773                         $ld2pos,        0,                      # NO MODIFIERS
 774                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 775                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 776                        );
 777         }
 778
 779         $$textref =~ m/\G\s*/gc;
 780         $ld1pos = pos($$textref);
 781         $str1pos = $ld1pos+1;
 782
 783         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 784         {
 785                 _failmsg "No block delimiter found after quotelike $op",
 786                          pos $$textref;
 787                 pos $$textref = $startpos;
 788                 return;
 789         }
 790         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 791         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 792         if ($ldel1 =~ /[[(<{]/)
 793         {
 794                 $rdel1 =~ tr/[({</])}>/;
 795                 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
 796                 || do { pos $$textref = $startpos; return };
 797         }
 798         else
 799         {
 800                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 801                 || do { pos $$textref = $startpos; return };
 802         }
 803         $ld2pos = $rd1pos = pos($$textref)-1;
 804
 805         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 806         if ($second_arg)
 807         {
 808                 my ($ldel2, $rdel2);
 809                 if ($ldel1 =~ /[[(<{]/)
 810                 {
 811                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 812                         {
 813                                 _failmsg "Missing second block for quotelike $op",
 814                                          pos $$textref;
 815                                 pos $$textref = $startpos;
 816                                 return;
 817                         }
 818                         $ldel2 = $rdel2 = "\Q$1";
 819                         $rdel2 =~ tr/[({</])}>/;
 820                 }
 821                 else
 822                 {
 823                         $ldel2 = $rdel2 = $ldel1;
 824                 }
 825                 $str2pos = $ld2pos+1;
 826
 827                 if ($ldel2 =~ /[[(<{]/)
 828                 {
 829                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 830                         defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
 831                         || do { pos $$textref = $startpos; return };
 832                 }
 833                 else
 834                 {
 835                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 836                         || do { pos $$textref = $startpos; return };
 837                 }
 838                 $rd2pos = pos($$textref)-1;
 839         }
 840         else
 841         {
 842                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 843         }
 844
 845         $modpos = pos $$textref;
 846
 847         $$textref =~ m/\G($mods{$op})/gc;
 848         my $endpos = pos $$textref;
 849
 850         return (
 851                 $startpos,      $oppos-$startpos,       # PREFIX
 852                 $oppos,         length($op),            # OPERATOR
 853                 $ld1pos,        1,                      # LEFT DEL
 854                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 855                 $rd1pos,        1,                      # RIGHT DEL
 856                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 857                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 858                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 859                 $modpos,        $endpos-$modpos,        # MODIFIERS
 860                 $endpos,        $textlen-$endpos,       # REMAINDER
 861                );
 862 }
 863
 864 my $def_func =
 865 [
 866         sub { extract_variable($_[0], '') },
 867         sub { extract_quotelike($_[0],'') },
 868         sub { extract_codeblock($_[0],'{}','') },
 869 ];
 870
 871 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 872 {
 873         my $textref = defined($_[0]) ? \$_[0] : \$_;
 874         my $posbug = pos;
 875         my ($lastpos, $firstpos);
 876         my @fields = ();
 877
 878         #for ($$textref)
 879         {
 880                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 881                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 882                 my $igunk = $_[3];
 883
 884                 pos $$textref ||= 0;
 885
 886                 unless (wantarray)
 887                 {
 888                         use Carp;
 889                         carp "extract_multiple reset maximal count to 1 in scalar context"
 890                                 if $^W && defined($_[2]) && $max > 1;
 891                         $max = 1
 892                 }
 893
 894                 my $unkpos;
 895                 my $func;
 896                 my $class;
 897
 898                 my @class;
 899                 foreach $func ( @func )
 900                 {
 901                         if (ref($func) eq 'HASH')
 902                         {
 903                                 push @class, (keys %$func)[0];
 904                                 $func = (values %$func)[0];
 905                         }
 906                         else
 907                         {
 908                                 push @class, undef;
 909                         }
 910                 }
 911
 912                 FIELD: while (pos($$textref) < length($$textref))
 913                 {
 914                         my ($field, $rem);
 915                         my @bits;
 916                         foreach my $i ( 0..$#func )
 917                         {
 918                                 my $pref;
 919                                 $func = $func[$i];
 920                                 $class = $class[$i];
 921                                 $lastpos = pos $$textref;
 922                                 if (ref($func) eq 'CODE')
 923                                         { ($field,$rem,$pref) = @bits = $func->($$textref) }
 924                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 925                                         { @bits = $field = $func->extract($$textref) }
 926                                 elsif( $$textref =~ m/\G$func/gc )
 927                                         { @bits = $field = defined($1)
 928                                 ? $1
 929                                 : substr($$textref, $-[0], $+[0] - $-[0])
 930                     }
 931                                 $pref ||= "";
 932                                 if (defined($field) && length($field))
 933                                 {
 934                                         if (!$igunk) {
 935                                                 $unkpos = $lastpos
 936                                                         if length($pref) && !defined($unkpos);
 937                                                 if (defined $unkpos)
 938                                                 {
 939                                                         push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 940                                                         $firstpos = $unkpos unless defined $firstpos;
 941                                                         undef $unkpos;
 942                                                         last FIELD if @fields == $max;
 943                                                 }
 944                                         }
 945                                         push @fields, $class
 946                                                 ? bless (\$field, $class)
 947                                                 : $field;
 948                                         $firstpos = $lastpos unless defined $firstpos;
 949                                         $lastpos = pos $$textref;
 950                                         last FIELD if @fields == $max;
 951                                         next FIELD;
 952                                 }
 953                         }
 954                         if ($$textref =~ /\G(.)/gcs)
 955                         {
 956                                 $unkpos = pos($$textref)-1
 957                                         unless $igunk || defined $unkpos;
 958                         }
 959                 }
 960
 961                 if (defined $unkpos)
 962                 {
 963                         push @fields, substr($$textref, $unkpos);
 964                         $firstpos = $unkpos unless defined $firstpos;
 965                         $lastpos = length $$textref;
 966                 }
 967                 last;
 968         }
 969
 970         pos $$textref = $lastpos;
 971         return @fields if wantarray;
 972
 973         $firstpos ||= 0;
 974         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 975                pos $$textref = $firstpos };
 976         return $fields[0];
 977 }
 978
 979
 980 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 981 {
 982         my $ldel    = $_[0];
 983         my $rdel    = $_[1];
 984         my $pre     = defined $_[2] ? $_[2] : '\s*';
 985         my %options = defined $_[3] ? %{$_[3]} : ();
 986         my $omode   = defined $options{fail} ? $options{fail} : '';
 987         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 988                     : defined($options{reject})        ? $options{reject}
 989                     :                                    ''
 990                     ;
 991         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 992                     : defined($options{ignore})        ? $options{ignore}
 993                     :                                    ''
 994                     ;
 995
 996         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 997
 998         my $posbug = pos;
 999         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1000         pos = $posbug;
1001
1002         my $closure = sub
1003         {
1004                 my $textref = defined $_[0] ? \$_[0] : \$_;
1005                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1006
1007                 return _fail(wantarray, $textref) unless @match;
1008                 return _succeed wantarray, $textref,
1009                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
1010                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
1011         };
1012
1013         bless $closure, 'Text::Balanced::Extractor';
1014 }
1015
1016 package Text::Balanced::Extractor;
1017
1018 sub extract($$) # ($self, $text)
1019 {
1020         &{$_[0]}($_[1]);
1021 }
1022
1023 package Text::Balanced::ErrorMsg;
1024
1025 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1026
1027 1;
1028
1029 __END__
1030
1031 =head1 NAME
1032
1033 Text::Balanced - Extract delimited text sequences from strings.
1034
1035
1036 =head1 SYNOPSIS
1037
1038  use Text::Balanced qw (
1039                         extract_delimited
1040                         extract_bracketed
1041                         extract_quotelike
1042                         extract_codeblock
1043                         extract_variable
1044                         extract_tagged
1045                         extract_multiple
1046
1047                         gen_delimited_pat
1048                         gen_extract_tagged
1049                        );
1050
1051  # Extract the initial substring of $text that is delimited by
1052  # two (unescaped) instances of the first character in $delim.
1053
1054         ($extracted, $remainder) = extract_delimited($text,$delim);
1055
1056
1057  # Extract the initial substring of $text that is bracketed
1058  # with a delimiter(s) specified by $delim (where the string
1059  # in $delim contains one or more of '(){}[]<>').
1060
1061         ($extracted, $remainder) = extract_bracketed($text,$delim);
1062
1063
1064  # Extract the initial substring of $text that is bounded by
1065  # an XML tag.
1066
1067         ($extracted, $remainder) = extract_tagged($text);
1068
1069
1070  # Extract the initial substring of $text that is bounded by
1071  # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1072
1073         ($extracted, $remainder) =
1074                 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1075
1076
1077  # Extract the initial substring of $text that represents a
1078  # Perl "quote or quote-like operation"
1079
1080         ($extracted, $remainder) = extract_quotelike($text);
1081
1082
1083  # Extract the initial substring of $text that represents a block
1084  # of Perl code, bracketed by any of character(s) specified by $delim
1085  # (where the string $delim contains one or more of '(){}[]<>').
1086
1087         ($extracted, $remainder) = extract_codeblock($text,$delim);
1088
1089
1090  # Extract the initial substrings of $text that would be extracted by
1091  # one or more sequential applications of the specified functions
1092  # or regular expressions
1093
1094         @extracted = extract_multiple($text,
1095                                       [ \&extract_bracketed,
1096                                         \&extract_quotelike,
1097                                         \&some_other_extractor_sub,
1098                                         qr/[xyz]*/,
1099                                         'literal',
1100                                       ]);
1101
1102 # Create a string representing an optimized pattern (a la Friedl)
1103 # that matches a substring delimited by any of the specified characters
1104 # (in this case: any type of quote or a slash)
1105
1106         $patstring = gen_delimited_pat(q{'"`/});
1107
1108
1109 # Generate a reference to an anonymous sub that is just like extract_tagged
1110 # but pre-compiled and optimized for a specific pair of tags, and consequently
1111 # much faster (i.e. 3 times faster). It uses qr// for better performance on
1112 # repeated calls, so it only works under Perl 5.005 or later.
1113
1114         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1115
1116         ($extracted, $remainder) = $extract_head->($text);
1117
1118
1119 =head1 DESCRIPTION
1120
1121 The various C<extract_...> subroutines may be used to
1122 extract a delimited substring, possibly after skipping a
1123 specified prefix string. By default, that prefix is
1124 optional whitespace (C</\s*/>), but you can change it to whatever
1125 you wish (see below).
1126
1127 The substring to be extracted must appear at the
1128 current C<pos> location of the string's variable
1129 (or at index zero, if no C<pos> position is defined).
1130 In other words, the C<extract_...> subroutines I<don't>
1131 extract the first occurance of a substring anywhere
1132 in a string (like an unanchored regex would). Rather,
1133 they extract an occurance of the substring appearing
1134 immediately at the current matching position in the
1135 string (like a C<\G>-anchored regex would).
1136
1137
1138
1139 =head2 General behaviour in list contexts
1140
1141 In a list context, all the subroutines return a list, the first three
1142 elements of which are always:
1143
1144 =over 4
1145
1146 =item [0]
1147
1148 The extracted string, including the specified delimiters.
1149 If the extraction fails C<undef> is returned.
1150
1151 =item [1]
1152
1153 The remainder of the input string (i.e. the characters after the
1154 extracted string). On failure, the entire string is returned.
1155
1156 =item [2]
1157
1158 The skipped prefix (i.e. the characters before the extracted string).
1159 On failure, C<undef> is returned.
1160
1161 =back
1162
1163 Note that in a list context, the contents of the original input text (the first
1164 argument) are not modified in any way.
1165
1166 However, if the input text was passed in a variable, that variable's
1167 C<pos> value is updated to point at the first character after the
1168 extracted text. That means that in a list context the various
1169 subroutines can be used much like regular expressions. For example:
1170
1171         while ( $next = (extract_quotelike($text))[0] )
1172         {
1173                 # process next quote-like (in $next)
1174         }
1175
1176
1177 =head2 General behaviour in scalar and void contexts
1178
1179 In a scalar context, the extracted string is returned, having first been
1180 removed from the input text. Thus, the following code also processes
1181 each quote-like operation, but actually removes them from $text:
1182
1183         while ( $next = extract_quotelike($text) )
1184         {
1185                 # process next quote-like (in $next)
1186         }
1187
1188 Note that if the input text is a read-only string (i.e. a literal),
1189 no attempt is made to remove the extracted text.
1190
1191 In a void context the behaviour of the extraction subroutines is
1192 exactly the same as in a scalar context, except (of course) that the
1193 extracted substring is not returned.
1194
1195 =head2 A note about prefixes
1196
1197 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1198 This can bite you if you're expecting a prefix specification like
1199 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1200 pattern will only succeed if the <H1> tag is on the current line, since
1201 . normally doesn't match newlines.
1202
1203 To overcome this limitation, you need to turn on /s matching within
1204 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1205
1206
1207 =head2 C<extract_delimited>
1208
1209 The C<extract_delimited> function formalizes the common idiom
1210 of extracting a single-character-delimited substring from the start of
1211 a string. For example, to extract a single-quote delimited string, the
1212 following code is typically used:
1213
1214         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1215         $extracted = $1;
1216
1217 but with C<extract_delimited> it can be simplified to:
1218
1219         ($extracted,$remainder) = extract_delimited($text, "'");
1220
1221 C<extract_delimited> takes up to four scalars (the input text, the
1222 delimiters, a prefix pattern to be skipped, and any escape characters)
1223 and extracts the initial substring of the text that
1224 is appropriately delimited. If the delimiter string has multiple
1225 characters, the first one encountered in the text is taken to delimit
1226 the substring.
1227 The third argument specifies a prefix pattern that is to be skipped
1228 (but must be present!) before the substring is extracted.
1229 The final argument specifies the escape character to be used for each
1230 delimiter.
1231
1232 All arguments are optional. If the escape characters are not specified,
1233 every delimiter is escaped with a backslash (C<\>).
1234 If the prefix is not specified, the
1235 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1236 is also not specified, the set C</["'`]/> is used. If the text to be processed
1237 is not specified either, C<$_> is used.
1238
1239 In list context, C<extract_delimited> returns a array of three
1240 elements, the extracted substring (I<including the surrounding
1241 delimiters>), the remainder of the text, and the skipped prefix (if
1242 any). If a suitable delimited substring is not found, the first
1243 element of the array is the empty string, the second is the complete
1244 original text, and the prefix returned in the third element is an
1245 empty string.
1246
1247 In a scalar context, just the extracted substring is returned. In
1248 a void context, the extracted substring (and any prefix) are simply
1249 removed from the beginning of the first argument.
1250
1251 Examples:
1252
1253         # Remove a single-quoted substring from the very beginning of $text:
1254
1255                 $substring = extract_delimited($text, "'", '');
1256
1257         # Remove a single-quoted Pascalish substring (i.e. one in which
1258         # doubling the quote character escapes it) from the very
1259         # beginning of $text:
1260
1261                 $substring = extract_delimited($text, "'", '', "'");
1262
1263         # Extract a single- or double- quoted substring from the
1264         # beginning of $text, optionally after some whitespace
1265         # (note the list context to protect $text from modification):
1266
1267                 ($substring) = extract_delimited $text, q{"'};
1268
1269
1270         # Delete the substring delimited by the first '/' in $text:
1271
1272                 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1273
1274 Note that this last example is I<not> the same as deleting the first
1275 quote-like pattern. For instance, if C<$text> contained the string:
1276
1277         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1278
1279 then after the deletion it would contain:
1280
1281         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1282
1283 not:
1284
1285         "if ('./cmd' =~ ms) { $cmd = $1; }"
1286
1287
1288 See L<"extract_quotelike"> for a (partial) solution to this problem.
1289
1290
1291 =head2 C<extract_bracketed>
1292
1293 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1294 up to three optional scalar arguments: a string to extract from, a delimiter
1295 specifier, and a prefix pattern. As before, a missing prefix defaults to
1296 optional whitespace and a missing text defaults to C<$_>. However, a missing
1297 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1298
1299 C<extract_bracketed> extracts a balanced-bracket-delimited
1300 substring (using any one (or more) of the user-specified delimiter
1301 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1302 respect quoted unbalanced brackets (see below).
1303
1304 A "delimiter bracket" is a bracket in list of delimiters passed as
1305 C<extract_bracketed>'s second argument. Delimiter brackets are
1306 specified by giving either the left or right (or both!) versions
1307 of the required bracket(s). Note that the order in which
1308 two or more delimiter brackets are specified is not significant.
1309
1310 A "balanced-bracket-delimited substring" is a substring bounded by
1311 matched brackets, such that any other (left or right) delimiter
1312 bracket I<within> the substring is also matched by an opposite
1313 (right or left) delimiter bracket I<at the same level of nesting>. Any
1314 type of bracket not in the delimiter list is treated as an ordinary
1315 character.
1316
1317 In other words, each type of bracket specified as a delimiter must be
1318 balanced and correctly nested within the substring, and any other kind of
1319 ("non-delimiter") bracket in the substring is ignored.
1320
1321 For example, given the string:
1322
1323         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1324
1325 then a call to C<extract_bracketed> in a list context:
1326
1327         @result = extract_bracketed( $text, '{}' );
1328
1329 would return:
1330
1331         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1332
1333 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1334 (In a scalar context just the first element of the array would be returned. In
1335 a void context, C<$text> would be replaced by an empty string.)
1336
1337 Likewise the call in:
1338
1339         @result = extract_bracketed( $text, '{[' );
1340
1341 would return the same result, since all sets of both types of specified
1342 delimiter brackets are correctly nested and balanced.
1343
1344 However, the call in:
1345
1346         @result = extract_bracketed( $text, '{([<' );
1347
1348 would fail, returning:
1349
1350         ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1351
1352 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1353 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1354 return an empty string. In a void context, C<$text> would be unchanged.)
1355
1356 Note that the embedded single-quotes in the string don't help in this
1357 case, since they have not been specified as acceptable delimiters and are
1358 therefore treated as non-delimiter characters (and ignored).
1359
1360 However, if a particular species of quote character is included in the
1361 delimiter specification, then that type of quote will be correctly handled.
1362 for example, if C<$text> is:
1363
1364         $text = '<A HREF=">>>>">link</A>';
1365
1366 then
1367
1368         @result = extract_bracketed( $text, '<">' );
1369
1370 returns:
1371
1372         ( '<A HREF=">>>>">', 'link</A>', "" )
1373
1374 as expected. Without the specification of C<"> as an embedded quoter:
1375
1376         @result = extract_bracketed( $text, '<>' );
1377
1378 the result would be:
1379
1380         ( '<A HREF=">', '>>>">link</A>', "" )
1381
1382 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1383 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1384 letter 'q' as a delimiter. Hence:
1385
1386         @result = extract_bracketed( $text, '<q>' );
1387
1388 would correctly match something like this:
1389
1390         $text = '<leftop: conj /and/ conj>';
1391
1392 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1393
1394
1395 =head2 C<extract_variable>
1396
1397 C<extract_variable> extracts any valid Perl variable or
1398 variable-involved expression, including scalars, arrays, hashes, array
1399 accesses, hash look-ups, method calls through objects, subroutine calles
1400 through subroutine references, etc.
1401
1402 The subroutine takes up to two optional arguments:
1403
1404 =over 4
1405
1406 =item 1.
1407
1408 A string to be processed (C<$_> if the string is omitted or C<undef>)
1409
1410 =item 2.
1411
1412 A string specifying a pattern to be matched as a prefix (which is to be
1413 skipped). If omitted, optional whitespace is skipped.
1414
1415 =back
1416
1417 On success in a list context, an array of 3 elements is returned. The
1418 elements are:
1419
1420 =over 4
1421
1422 =item [0]
1423
1424 the extracted variable, or variablish expression
1425
1426 =item [1]
1427
1428 the remainder of the input text,
1429
1430 =item [2]
1431
1432 the prefix substring (if any),
1433
1434 =back
1435
1436 On failure, all of these values (except the remaining text) are C<undef>.
1437
1438 In a scalar context, C<extract_variable> returns just the complete
1439 substring that matched a variablish expression. C<undef> is returned on
1440 failure. In addition, the original input text has the returned substring
1441 (and any prefix) removed from it.
1442
1443 In a void context, the input text just has the matched substring (and
1444 any specified prefix) removed.
1445
1446
1447 =head2 C<extract_tagged>
1448
1449 C<extract_tagged> extracts and segments text between (balanced)
1450 specified tags.
1451
1452 The subroutine takes up to five optional arguments:
1453
1454 =over 4
1455
1456 =item 1.
1457
1458 A string to be processed (C<$_> if the string is omitted or C<undef>)
1459
1460 =item 2.
1461
1462 A string specifying a pattern to be matched as the opening tag.
1463 If the pattern string is omitted (or C<undef>) then a pattern
1464 that matches any standard XML tag is used.
1465
1466 =item 3.
1467
1468 A string specifying a pattern to be matched at the closing tag.
1469 If the pattern string is omitted (or C<undef>) then the closing
1470 tag is constructed by inserting a C</> after any leading bracket
1471 characters in the actual opening tag that was matched (I<not> the pattern
1472 that matched the tag). For example, if the opening tag pattern
1473 is specified as C<'{{\w+}}'> and actually matched the opening tag
1474 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1475
1476 =item 4.
1477
1478 A string specifying a pattern to be matched as a prefix (which is to be
1479 skipped). If omitted, optional whitespace is skipped.
1480
1481 =item 5.
1482
1483 A hash reference containing various parsing options (see below)
1484
1485 =back
1486
1487 The various options that can be specified are:
1488
1489 =over 4
1490
1491 =item C<reject =E<gt> $listref>
1492
1493 The list reference contains one or more strings specifying patterns
1494 that must I<not> appear within the tagged text.
1495
1496 For example, to extract
1497 an HTML link (which should not contain nested links) use:
1498
1499         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1500
1501 =item C<ignore =E<gt> $listref>
1502
1503 The list reference contains one or more strings specifying patterns
1504 that are I<not> be be treated as nested tags within the tagged text
1505 (even if they would match the start tag pattern).
1506
1507 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1508
1509         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1510
1511 (also see L<"gen_delimited_pat"> below).
1512
1513
1514 =item C<fail =E<gt> $str>
1515
1516 The C<fail> option indicates the action to be taken if a matching end
1517 tag is not encountered (i.e. before the end of the string or some
1518 C<reject> pattern matches). By default, a failure to match a closing
1519 tag causes C<extract_tagged> to immediately fail.
1520
1521 However, if the string value associated with <reject> is "MAX", then
1522 C<extract_tagged> returns the complete text up to the point of failure.
1523 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1524 after the tag (up to the first line that is either empty or contains
1525 only whitespace characters).
1526 If the string is "", the the default behaviour (i.e. failure) is reinstated.
1527
1528 For example, suppose the start tag "/para" introduces a paragraph, which then
1529 continues until the next "/endpara" tag or until another "/para" tag is
1530 encountered:
1531
1532         $text = "/para line 1\n\nline 3\n/para line 4";
1533
1534         extract_tagged($text, '/para', '/endpara', undef,
1535                                 {reject => '/para', fail => MAX );
1536
1537         # EXTRACTED: "/para line 1\n\nline 3\n"
1538
1539 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1540 tag refers only to the immediately following paragraph:
1541
1542         $text = "/para line 1\n\nline 3\n/para line 4";
1543
1544         extract_tagged($text, '/para', '/endpara', undef,
1545                         {reject => '/para', fail => MAX );
1546
1547         # EXTRACTED: "/para line 1\n"
1548
1549 Note that the specified C<fail> behaviour applies to nested tags as well.
1550
1551 =back
1552
1553 On success in a list context, an array of 6 elements is returned. The elements are:
1554
1555 =over 4
1556
1557 =item [0]
1558
1559 the extracted tagged substring (including the outermost tags),
1560
1561 =item [1]
1562
1563 the remainder of the input text,
1564
1565 =item [2]
1566
1567 the prefix substring (if any),
1568
1569 =item [3]
1570
1571 the opening tag
1572
1573 =item [4]
1574
1575 the text between the opening and closing tags
1576
1577 =item [5]
1578
1579 the closing tag (or "" if no closing tag was found)
1580
1581 =back
1582
1583 On failure, all of these values (except the remaining text) are C<undef>.
1584
1585 In a scalar context, C<extract_tagged> returns just the complete
1586 substring that matched a tagged text (including the start and end
1587 tags). C<undef> is returned on failure. In addition, the original input
1588 text has the returned substring (and any prefix) removed from it.
1589
1590 In a void context, the input text just has the matched substring (and
1591 any specified prefix) removed.
1592
1593
1594 =head2 C<gen_extract_tagged>
1595
1596 (Note: This subroutine is only available under Perl5.005)
1597
1598 C<gen_extract_tagged> generates a new anonymous subroutine which
1599 extracts text between (balanced) specified tags. In other words,
1600 it generates a function identical in function to C<extract_tagged>.
1601
1602 The difference between C<extract_tagged> and the anonymous
1603 subroutines generated by
1604 C<gen_extract_tagged>, is that those generated subroutines:
1605
1606 =over 4
1607
1608 =item *
1609
1610 do not have to reparse tag specification or parsing options every time
1611 they are called (whereas C<extract_tagged> has to effectively rebuild
1612 its tag parser on every call);
1613
1614 =item *
1615
1616 make use of the new qr// construct to pre-compile the regexes they use
1617 (whereas C<extract_tagged> uses standard string variable interpolation
1618 to create tag-matching patterns).
1619
1620 =back
1621
1622 The subroutine takes up to four optional arguments (the same set as
1623 C<extract_tagged> except for the string to be processed). It returns
1624 a reference to a subroutine which in turn takes a single argument (the text to
1625 be extracted from).
1626
1627 In other words, the implementation of C<extract_tagged> is exactly
1628 equivalent to:
1629
1630         sub extract_tagged
1631         {
1632                 my $text = shift;
1633                 $extractor = gen_extract_tagged(@_);
1634                 return $extractor->($text);
1635         }
1636
1637 (although C<extract_tagged> is not currently implemented that way, in order
1638 to preserve pre-5.005 compatibility).
1639
1640 Using C<gen_extract_tagged> to create extraction functions for specific tags
1641 is a good idea if those functions are going to be called more than once, since
1642 their performance is typically twice as good as the more general-purpose
1643 C<extract_tagged>.
1644
1645
1646 =head2 C<extract_quotelike>
1647
1648 C<extract_quotelike> attempts to recognize, extract, and segment any
1649 one of the various Perl quotes and quotelike operators (see
1650 L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1651 delimiters (for the quotelike operators), and trailing modifiers are
1652 all caught. For example, in:
1653
1654         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1655
1656         extract_quotelike '  "You said, \"Use sed\"."  '
1657
1658         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1659
1660         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1661
1662 the full Perl quotelike operations are all extracted correctly.
1663
1664 Note too that, when using the /x modifier on a regex, any comment
1665 containing the current pattern delimiter will cause the regex to be
1666 immediately terminated. In other words:
1667
1668         'm /
1669                 (?i)            # CASE INSENSITIVE
1670                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1671                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1672            /x'
1673
1674 will be extracted as if it were:
1675
1676         'm /
1677                 (?i)            # CASE INSENSITIVE
1678                 [a-z_]          # LEADING ALPHABETIC/'
1679
1680 This behaviour is identical to that of the actual compiler.
1681
1682 C<extract_quotelike> takes two arguments: the text to be processed and
1683 a prefix to be matched at the very beginning of the text. If no prefix
1684 is specified, optional whitespace is the default. If no text is given,
1685 C<$_> is used.
1686
1687 In a list context, an array of 11 elements is returned. The elements are:
1688
1689 =over 4
1690
1691 =item [0]
1692
1693 the extracted quotelike substring (including trailing modifiers),
1694
1695 =item [1]
1696
1697 the remainder of the input text,
1698
1699 =item [2]
1700
1701 the prefix substring (if any),
1702
1703 =item [3]
1704
1705 the name of the quotelike operator (if any),
1706
1707 =item [4]
1708
1709 the left delimiter of the first block of the operation,
1710
1711 =item [5]
1712
1713 the text of the first block of the operation
1714 (that is, the contents of
1715 a quote, the regex of a match or substitution or the target list of a
1716 translation),
1717
1718 =item [6]
1719
1720 the right delimiter of the first block of the operation,
1721
1722 =item [7]
1723
1724 the left delimiter of the second block of the operation
1725 (that is, if it is a C<s>, C<tr>, or C<y>),
1726
1727 =item [8]
1728
1729 the text of the second block of the operation
1730 (that is, the replacement of a substitution or the translation list
1731 of a translation),
1732
1733 =item [9]
1734
1735 the right delimiter of the second block of the operation (if any),
1736
1737 =item [10]
1738
1739 the trailing modifiers on the operation (if any).
1740
1741 =back
1742
1743 For each of the fields marked "(if any)" the default value on success is
1744 an empty string.
1745 On failure, all of these values (except the remaining text) are C<undef>.
1746
1747
1748 In a scalar context, C<extract_quotelike> returns just the complete substring
1749 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1750 void context, the input text has the same substring (and any specified
1751 prefix) removed.
1752
1753 Examples:
1754
1755         # Remove the first quotelike literal that appears in text
1756
1757                 $quotelike = extract_quotelike($text,'.*?');
1758
1759         # Replace one or more leading whitespace-separated quotelike
1760         # literals in $_ with "<QLL>"
1761
1762                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1763
1764
1765         # Isolate the search pattern in a quotelike operation from $text
1766
1767                 ($op,$pat) = (extract_quotelike $text)[3,5];
1768                 if ($op =~ /[ms]/)
1769                 {
1770                         print "search pattern: $pat\n";
1771                 }
1772                 else
1773                 {
1774                         print "$op is not a pattern matching operation\n";
1775                 }
1776
1777
1778 =head2 C<extract_quotelike> and "here documents"
1779
1780 C<extract_quotelike> can successfully extract "here documents" from an input
1781 string, but with an important caveat in list contexts.
1782
1783 Unlike other types of quote-like literals, a here document is rarely
1784 a contiguous substring. For example, a typical piece of code using
1785 here document might look like this:
1786
1787         <<'EOMSG' || die;
1788         This is the message.
1789         EOMSG
1790         exit;
1791
1792 Given this as an input string in a scalar context, C<extract_quotelike>
1793 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1794 leaving the string " || die;\nexit;" in the original variable. In other words,
1795 the two separate pieces of the here document are successfully extracted and
1796 concatenated.
1797
1798 In a list context, C<extract_quotelike> would return the list
1799
1800 =over 4
1801
1802 =item [0]
1803
1804 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1805 including fore and aft delimiters),
1806
1807 =item [1]
1808
1809 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1810
1811 =item [2]
1812
1813 "" (i.e. the prefix substring -- trivial in this case),
1814
1815 =item [3]
1816
1817 "<<" (i.e. the "name" of the quotelike operator)
1818
1819 =item [4]
1820
1821 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1822
1823 =item [5]
1824
1825 "This is the message.\n" (i.e. the text of the here document),
1826
1827 =item [6]
1828
1829 "EOMSG" (i.e. the right delimiter of the here document),
1830
1831 =item [7..10]
1832
1833 "" (a here document has no second left delimiter, second text, second right
1834 delimiter, or trailing modifiers).
1835
1836 =back
1837
1838 However, the matching position of the input variable would be set to
1839 "exit;" (i.e. I<after> the closing delimiter of the here document),
1840 which would cause the earlier " || die;\nexit;" to be skipped in any
1841 sequence of code fragment extractions.
1842
1843 To avoid this problem, when it encounters a here document whilst
1844 extracting from a modifiable string, C<extract_quotelike> silently
1845 rearranges the string to an equivalent piece of Perl:
1846
1847         <<'EOMSG'
1848         This is the message.
1849         EOMSG
1850         || die;
1851         exit;
1852
1853 in which the here document I<is> contiguous. It still leaves the
1854 matching position after the here document, but now the rest of the line
1855 on which the here document starts is not skipped.
1856
1857 To prevent <extract_quotelike> from mucking about with the input in this way
1858 (this is the only case where a list-context C<extract_quotelike> does so),
1859 you can pass the input variable as an interpolated literal:
1860
1861         $quotelike = extract_quotelike("$var");
1862
1863
1864 =head2 C<extract_codeblock>
1865
1866 C<extract_codeblock> attempts to recognize and extract a balanced
1867 bracket delimited substring that may contain unbalanced brackets
1868 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1869 is like a combination of C<"extract_bracketed"> and
1870 C<"extract_quotelike">.
1871
1872 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1873 a text to process, a set of delimiter brackets to look for, and a prefix to
1874 match first. It also takes an optional fourth parameter, which allows the
1875 outermost delimiter brackets to be specified separately (see below).
1876
1877 Omitting the first argument (input text) means process C<$_> instead.
1878 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1879 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1880 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1881 value of the second argument is to be used for the outermost delimiters.
1882
1883 Once the prefix an dthe outermost opening delimiter bracket have been
1884 recognized, code blocks are extracted by stepping through the input text and
1885 trying the following alternatives in sequence:
1886
1887 =over 4
1888
1889 =item 1.
1890
1891 Try and match a closing delimiter bracket. If the bracket was the same
1892 species as the last opening bracket, return the substring to that
1893 point. If the bracket was mismatched, return an error.
1894
1895 =item 2.
1896
1897 Try to match a quote or quotelike operator. If found, call
1898 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1899 the error it returned. Otherwise go back to step 1.
1900
1901 =item 3.
1902
1903 Try to match an opening delimiter bracket. If found, call
1904 C<extract_codeblock> recursively to eat the embedded block. If the
1905 recursive call fails, return an error. Otherwise, go back to step 1.
1906
1907 =item 4.
1908
1909 Unconditionally match a bareword or any other single character, and
1910 then go back to step 1.
1911
1912 =back
1913
1914
1915 Examples:
1916
1917         # Find a while loop in the text
1918
1919                 if ($text =~ s/.*?while\s*\{/{/)
1920                 {
1921                         $loop = "while " . extract_codeblock($text);
1922                 }
1923
1924         # Remove the first round-bracketed list (which may include
1925         # round- or curly-bracketed code blocks or quotelike operators)
1926
1927                 extract_codeblock $text, "(){}", '[^(]*';
1928
1929
1930 The ability to specify a different outermost delimiter bracket is useful
1931 in some circumstances. For example, in the Parse::RecDescent module,
1932 parser actions which are to be performed only on a successful parse
1933 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1934
1935         sentence: subject verb object
1936                         <defer: {$::theVerb = $item{verb}} >
1937
1938 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1939 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1940
1941 A deferred action like this:
1942
1943                         <defer: {if ($count>10) {$count--}} >
1944
1945 will be incorrectly parsed as:
1946
1947                         <defer: {if ($count>
1948
1949 because the "less than" operator is interpreted as a closing delimiter.
1950
1951 But, by extracting the directive using
1952 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1953 the '>' character is only treated as a delimited at the outermost
1954 level of the code block, so the directive is parsed correctly.
1955
1956 =head2 C<extract_multiple>
1957
1958 The C<extract_multiple> subroutine takes a string to be processed and a
1959 list of extractors (subroutines or regular expressions) to apply to that string.
1960
1961 In an array context C<extract_multiple> returns an array of substrings
1962 of the original string, as extracted by the specified extractors.
1963 In a scalar context, C<extract_multiple> returns the first
1964 substring successfully extracted from the original string. In both
1965 scalar and void contexts the original string has the first successfully
1966 extracted substring removed from it. In all contexts
1967 C<extract_multiple> starts at the current C<pos> of the string, and
1968 sets that C<pos> appropriately after it matches.
1969
1970 Hence, the aim of of a call to C<extract_multiple> in a list context
1971 is to split the processed string into as many non-overlapping fields as
1972 possible, by repeatedly applying each of the specified extractors
1973 to the remainder of the string. Thus C<extract_multiple> is
1974 a generalized form of Perl's C<split> subroutine.
1975
1976 The subroutine takes up to four optional arguments:
1977
1978 =over 4
1979
1980 =item 1.
1981
1982 A string to be processed (C<$_> if the string is omitted or C<undef>)
1983
1984 =item 2.
1985
1986 A reference to a list of subroutine references and/or qr// objects and/or
1987 literal strings and/or hash references, specifying the extractors
1988 to be used to split the string. If this argument is omitted (or
1989 C<undef>) the list:
1990
1991         [
1992                 sub { extract_variable($_[0], '') },
1993                 sub { extract_quotelike($_[0],'') },
1994                 sub { extract_codeblock($_[0],'{}','') },
1995         ]
1996
1997 is used.
1998
1999
2000 =item 3.
2001
2002 An number specifying the maximum number of fields to return. If this
2003 argument is omitted (or C<undef>), split continues as long as possible.
2004
2005 If the third argument is I<N>, then extraction continues until I<N> fields
2006 have been successfully extracted, or until the string has been completely
2007 processed.
2008
2009 Note that in scalar and void contexts the value of this argument is
2010 automatically reset to 1 (under C<-w>, a warning is issued if the argument
2011 has to be reset).
2012
2013 =item 4.
2014
2015 A value indicating whether unmatched substrings (see below) within the
2016 text should be skipped or returned as fields. If the value is true,
2017 such substrings are skipped. Otherwise, they are returned.
2018
2019 =back
2020
2021 The extraction process works by applying each extractor in
2022 sequence to the text string.
2023
2024 If the extractor is a subroutine it is called in a list context and is
2025 expected to return a list of a single element, namely the extracted
2026 text. It may optionally also return two further arguments: a string
2027 representing the text left after extraction (like $' for a pattern
2028 match), and a string representing any prefix skipped before the
2029 extraction (like $` in a pattern match). Note that this is designed
2030 to facilitate the use of other Text::Balanced subroutines with
2031 C<extract_multiple>. Note too that the value returned by an extractor
2032 subroutine need not bear any relationship to the corresponding substring
2033 of the original text (see examples below).
2034
2035 If the extractor is a precompiled regular expression or a string,
2036 it is matched against the text in a scalar context with a leading
2037 '\G' and the gc modifiers enabled. The extracted value is either
2038 $1 if that variable is defined after the match, or else the
2039 complete match (i.e. $&).
2040
2041 If the extractor is a hash reference, it must contain exactly one element.
2042 The value of that element is one of the
2043 above extractor types (subroutine reference, regular expression, or string).
2044 The key of that element is the name of a class into which the successful
2045 return value of the extractor will be blessed.
2046
2047 If an extractor returns a defined value, that value is immediately
2048 treated as the next extracted field and pushed onto the list of fields.
2049 If the extractor was specified in a hash reference, the field is also
2050 blessed into the appropriate class,
2051
2052 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2053 assumed to have failed to extract.
2054 If none of the extractor subroutines succeeds, then one
2055 character is extracted from the start of the text and the extraction
2056 subroutines reapplied. Characters which are thus removed are accumulated and
2057 eventually become the next field (unless the fourth argument is true, in which
2058 case they are disgarded).
2059
2060 For example, the following extracts substrings that are valid Perl variables:
2061
2062         @fields = extract_multiple($text,
2063                                    [ sub { extract_variable($_[0]) } ],
2064                                    undef, 1);
2065
2066 This example separates a text into fields which are quote delimited,
2067 curly bracketed, and anything else. The delimited and bracketed
2068 parts are also blessed to identify them (the "anything else" is unblessed):
2069
2070         @fields = extract_multiple($text,
2071                    [
2072                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
2073                         { Brack => sub { extract_bracketed($_[0],'{}') } },
2074                    ]);
2075
2076 This call extracts the next single substring that is a valid Perl quotelike
2077 operator (and removes it from $text):
2078
2079         $quotelike = extract_multiple($text,
2080                                       [
2081                                         sub { extract_quotelike($_[0]) },
2082                                       ], undef, 1);
2083
2084 Finally, here is yet another way to do comma-separated value parsing:
2085
2086         @fields = extract_multiple($csv_text,
2087                                   [
2088                                         sub { extract_delimited($_[0],q{'"}) },
2089                                         qr/([^,]+)(.*)/,
2090                                   ],
2091                                   undef,1);
2092
2093 The list in the second argument means:
2094 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2095 The undef third argument means:
2096 I<"...as many times as possible...">,
2097 and the true value in the fourth argument means
2098 I<"...discarding anything else that appears (i.e. the commas)">.
2099
2100 If you wanted the commas preserved as separate fields (i.e. like split
2101 does if your split pattern has capturing parentheses), you would
2102 just make the last parameter undefined (or remove it).
2103
2104
2105 =head2 C<gen_delimited_pat>
2106
2107 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2108    > builds a Friedl-style optimized regex that matches a string delimited
2109 by any one of the characters in the single argument. For example:
2110
2111         gen_delimited_pat(q{'"})
2112
2113 returns the regex:
2114
2115         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2116
2117 Note that the specified delimiters are automatically quotemeta'd.
2118
2119 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2120 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2121 (which might contain quoted strings):
2122
2123         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2124
2125         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2126
2127
2128 C<gen_delimited_pat> may also be called with an optional second argument,
2129 which specifies the "escape" character(s) to be used for each delimiter.
2130 For example to match a Pascal-style string (where ' is the delimiter
2131 and '' is a literal ' within the string):
2132
2133         gen_delimited_pat(q{'},q{'});
2134
2135 Different escape characters can be specified for different delimiters.
2136 For example, to specify that '/' is the escape for single quotes
2137 and '%' is the escape for double quotes:
2138
2139         gen_delimited_pat(q{'"},q{/%});
2140
2141 If more delimiters than escape chars are specified, the last escape char
2142 is used for the remaining delimiters.
2143 If no escape char is specified for a given specified delimiter, '\' is used.
2144
2145 =head2 C<delimited_pat>
2146
2147 Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2148 That name may still be used, but is now deprecated.
2149
2150
2151 =head1 DIAGNOSTICS
2152
2153 In a list context, all the functions return C<(undef,$original_text)>
2154 on failure. In a scalar context, failure is indicated by returning C<undef>
2155 (in this case the input text is not modified in any way).
2156
2157 In addition, on failure in I<any> context, the C<$@> variable is set.
2158 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2159 below.
2160 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2161 which the error was detected (although not necessarily where it occurred!)
2162 Printing C<$@> directly produces the error message, with the offset appended.
2163 On success, the C<$@> variable is guaranteed to be C<undef>.
2164
2165 The available diagnostics are:
2166
2167 =over 4
2168
2169 =item  C<Did not find a suitable bracket: "%s">
2170
2171 The delimiter provided to C<extract_bracketed> was not one of
2172 C<'()[]E<lt>E<gt>{}'>.
2173
2174 =item  C<Did not find prefix: /%s/>
2175
2176 A non-optional prefix was specified but wasn't found at the start of the text.
2177
2178 =item  C<Did not find opening bracket after prefix: "%s">
2179
2180 C<extract_bracketed> or C<extract_codeblock> was expecting a
2181 particular kind of bracket at the start of the text, and didn't find it.
2182
2183 =item  C<No quotelike operator found after prefix: "%s">
2184
2185 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2186 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2187 it was extracting.
2188
2189 =item  C<Unmatched closing bracket: "%c">
2190
2191 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2192 a closing bracket where none was expected.
2193
2194 =item  C<Unmatched opening bracket(s): "%s">
2195
2196 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2197 out of characters in the text before closing one or more levels of nested
2198 brackets.
2199
2200 =item C<Unmatched embedded quote (%s)>
2201
2202 C<extract_bracketed> attempted to match an embedded quoted substring, but
2203 failed to find a closing quote to match it.
2204
2205 =item C<Did not find closing delimiter to match '%s'>
2206
2207 C<extract_quotelike> was unable to find a closing delimiter to match the
2208 one that opened the quote-like operation.
2209
2210 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2211
2212 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2213 a valid bracket delimiter, but it was the wrong species. This usually
2214 indicates a nesting error, but may indicate incorrect quoting or escaping.
2215
2216 =item  C<No block delimiter found after quotelike "%s">
2217
2218 C<extract_quotelike> or C<extract_codeblock> found one of the
2219 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2220 without a suitable block after it.
2221
2222 =item C<Did not find leading dereferencer>
2223
2224 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2225 a variable, but didn't find any of them.
2226
2227 =item C<Bad identifier after dereferencer>
2228
2229 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2230 character was not followed by a legal Perl identifier.
2231
2232 =item C<Did not find expected opening bracket at %s>
2233
2234 C<extract_codeblock> failed to find any of the outermost opening brackets
2235 that were specified.
2236
2237 =item C<Improperly nested codeblock at %s>
2238
2239 A nested code block was found that started with a delimiter that was specified
2240 as being only to be used as an outermost bracket.
2241
2242 =item  C<Missing second block for quotelike "%s">
2243
2244 C<extract_codeblock> or C<extract_quotelike> found one of the
2245 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2246
2247 =item C<No match found for opening bracket>
2248
2249 C<extract_codeblock> failed to find a closing bracket to match the outermost
2250 opening bracket.
2251
2252 =item C<Did not find opening tag: /%s/>
2253
2254 C<extract_tagged> did not find a suitable opening tag (after any specified
2255 prefix was removed).
2256
2257 =item C<Unable to construct closing tag to match: /%s/>
2258
2259 C<extract_tagged> matched the specified opening tag and tried to
2260 modify the matched text to produce a matching closing tag (because
2261 none was specified). It failed to generate the closing tag, almost
2262 certainly because the opening tag did not start with a
2263 bracket of some kind.
2264
2265 =item C<Found invalid nested tag: %s>
2266
2267 C<extract_tagged> found a nested tag that appeared in the "reject" list
2268 (and the failure mode was not "MAX" or "PARA").
2269
2270 =item C<Found unbalanced nested tag: %s>
2271
2272 C<extract_tagged> found a nested opening tag that was not matched by a
2273 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2274
2275 =item C<Did not find closing tag>
2276
2277 C<extract_tagged> reached the end of the text without finding a closing tag
2278 to match the original opening tag (and the failure mode was not
2279 "MAX" or "PARA").
2280
2281
2282
2283
2284 =back
2285
2286
2287 =head1 AUTHOR
2288
2289 Damian Conway (damian@conway.org)
2290
2291
2292 =head1 BUGS AND IRRITATIONS
2293
2294 There are undoubtedly serious bugs lurking somewhere in this code, if
2295 only because parts of it give the impression of understanding a great deal
2296 more about Perl than they really do.
2297
2298 Bug reports and other feedback are most welcome.
2299
2300
2301 =head1 COPYRIGHT
2302
2303  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2304  This module is free software. It may be used, redistributed
2305      and/or modified under the same terms as Perl itself.