lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use SelfLoader;
  11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  12
  13 $VERSION = '1.89';
  14 @ISA            = qw ( Exporter );
  15
  16 %EXPORT_TAGS    = ( ALL => [ qw(
  17                                 &extract_delimited
  18                                 &extract_bracketed
  19                                 &extract_quotelike
  20                                 &extract_codeblock
  21                                 &extract_variable
  22                                 &extract_tagged
  23                                 &extract_multiple
  24
  25                                 &gen_delimited_pat
  26                                 &gen_extract_tagged
  27
  28                                 &delimited_pat
  29                                ) ] );
  30
  31 Exporter::export_ok_tags('ALL');
  32
  33 # PROTOTYPES
  34
  35 sub _match_bracketed($$$$$$);
  36 sub _match_variable($$);
  37 sub _match_codeblock($$$$$$$);
  38 sub _match_quotelike($$$$);
  39
  40 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  41
  42 sub _failmsg {
  43         my ($message, $pos) = @_;
  44         $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
  45 }
  46
  47 sub _fail
  48 {
  49         my ($wantarray, $textref, $message, $pos) = @_;
  50         _failmsg $message, $pos if $message;
  51         return ("",$$textref,"") if $wantarray;
  52         return undef;
  53 }
  54
  55 sub _succeed
  56 {
  57         $@ = undef;
  58         my ($wantarray,$textref) = splice @_, 0, 2;
  59         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  60         my ($startlen) = $_[5];
  61         my $remainderpos = $_[2];
  62         if ($wantarray)
  63         {
  64                 my @res;
  65                 while (my ($from, $len) = splice @_, 0, 2)
  66                 {
  67                         push @res, substr($$textref,$from,$len);
  68                 }
  69                 if ($extralen) {        # CORRECT FILLET
  70                         my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
  71                         $res[1] = "$extra$res[1]";
  72                         eval { substr($$textref,$remainderpos,0) = $extra;
  73                                substr($$textref,$extrapos,$extralen,"\n")} ;
  74                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  75                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  76                 }
  77                 else {
  78                         pos($$textref) = $remainderpos;             # RESET \G
  79                 }
  80                 return @res;
  81         }
  82         else
  83         {
  84                 my $match = substr($$textref,$_[0],$_[1]);
  85                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  86                 my $extra = $extralen
  87                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
  88                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  89                 pos($$textref) = $_[4];                         # RESET \G
  90                 return $match;
  91         }
  92 }
  93
  94 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
  95
  96 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
  97 {
  98         my ($dels, $escs) = @_;
  99         return "" unless $dels =~ /\S/;
 100         $escs = '\\' unless $escs;
 101         $escs .= substr($escs,-1) x (length($dels)-length($escs));
 102         my @pat = ();
 103         my $i;
 104         for ($i=0; $i<length $dels; $i++)
 105         {
 106                 my $del = quotemeta substr($dels,$i,1);
 107                 my $esc = quotemeta substr($escs,$i,1);
 108                 if ($del eq $esc)
 109                 {
 110                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 111                 }
 112                 else
 113                 {
 114                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 115                 }
 116         }
 117         my $pat = join '|', @pat;
 118         return "(?:$pat)";
 119 }
 120
 121 *delimited_pat = \&gen_delimited_pat;
 122
 123
 124 # THE EXTRACTION FUNCTIONS
 125
 126 sub extract_delimited (;$$$$)
 127 {
 128         my $textref = defined $_[0] ? \$_[0] : \$_;
 129         my $wantarray = wantarray;
 130         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 131         my $pre  = defined $_[2] ? $_[2] : '\s*';
 132         my $esc  = defined $_[3] ? $_[3] : qq{\\};
 133         my $pat = gen_delimited_pat($del, $esc);
 134         my $startpos = pos $$textref || 0;
 135         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 136                 unless $$textref =~ m/\G($pre)($pat)/gc;
 137         my $prelen = length($1);
 138         my $matchpos = $startpos+$prelen;
 139         my $endpos = pos $$textref;
 140         return _succeed $wantarray, $textref,
 141                         $matchpos, $endpos-$matchpos,           # MATCH
 142                         $endpos,   length($$textref)-$endpos,   # REMAINDER
 143                         $startpos, $prelen;                     # PREFIX
 144 }
 145
 146 sub extract_bracketed (;$$$)
 147 {
 148         my $textref = defined $_[0] ? \$_[0] : \$_;
 149         my $ldel = defined $_[1] ? $_[1] : '{([<';
 150         my $pre  = defined $_[2] ? $_[2] : '\s*';
 151         my $wantarray = wantarray;
 152         my $qdel = "";
 153         my $quotelike;
 154         $ldel =~ s/'//g and $qdel .= q{'};
 155         $ldel =~ s/"//g and $qdel .= q{"};
 156         $ldel =~ s/`//g and $qdel .= q{`};
 157         $ldel =~ s/q//g and $quotelike = 1;
 158         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 159         my $rdel = $ldel;
 160         unless ($rdel =~ tr/[({</])}>/)
 161         {
 162                 return _fail $wantarray, $textref,
 163                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 164                              0;
 165         }
 166         my $posbug = pos;
 167         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 168         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 169         pos = $posbug;
 170
 171         my $startpos = pos $$textref || 0;
 172         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 173
 174         return _fail ($wantarray, $textref) unless @match;
 175
 176         return _succeed ( $wantarray, $textref,
 177                           $match[2], $match[5]+2,       # MATCH
 178                           @match[8,9],                  # REMAINDER
 179                           @match[0,1],                  # PREFIX
 180                         );
 181 }
 182
 183 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 184 {
 185         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
 186         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 187         unless ($$textref =~ m/\G$pre/gc)
 188         {
 189                 _failmsg "Did not find prefix: /$pre/", $startpos;
 190                 return;
 191         }
 192
 193         $ldelpos = pos $$textref;
 194
 195         unless ($$textref =~ m/\G($ldel)/gc)
 196         {
 197                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 198                          pos $$textref;
 199                 pos $$textref = $startpos;
 200                 return;
 201         }
 202
 203         my @nesting = ( $1 );
 204         my $textlen = length $$textref;
 205         while (pos $$textref < $textlen)
 206         {
 207                 next if $$textref =~ m/\G\\./gcs;
 208
 209                 if ($$textref =~ m/\G($ldel)/gc)
 210                 {
 211                         push @nesting, $1;
 212                 }
 213                 elsif ($$textref =~ m/\G($rdel)/gc)
 214                 {
 215                         my ($found, $brackettype) = ($1, $1);
 216                         if ($#nesting < 0)
 217                         {
 218                                 _failmsg "Unmatched closing bracket: \"$found\"",
 219                                          pos $$textref;
 220                                 pos $$textref = $startpos;
 221                                 return;
 222                         }
 223                         my $expected = pop(@nesting);
 224                         $expected =~ tr/({[</)}]>/;
 225                         if ($expected ne $brackettype)
 226                         {
 227                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 228                                          pos $$textref;
 229                                 pos $$textref = $startpos;
 230                                 return;
 231                         }
 232                         last if $#nesting < 0;
 233                 }
 234                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 235                 {
 236                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 237                         _failmsg "Unmatched embedded quote ($1)",
 238                                  pos $$textref;
 239                         pos $$textref = $startpos;
 240                         return;
 241                 }
 242                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
 243                 {
 244                         next;
 245                 }
 246
 247                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 248         }
 249         if ($#nesting>=0)
 250         {
 251                 _failmsg "Unmatched opening bracket(s): "
 252                                 . join("..",@nesting)."..",
 253                          pos $$textref;
 254                 pos $$textref = $startpos;
 255                 return;
 256         }
 257
 258         $endpos = pos $$textref;
 259
 260         return (
 261                 $startpos,  $ldelpos-$startpos,         # PREFIX
 262                 $ldelpos,   1,                          # OPENING BRACKET
 263                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
 264                 $endpos-1,  1,                          # CLOSING BRACKET
 265                 $endpos,    length($$textref)-$endpos,  # REMAINDER
 266                );
 267 }
 268
 269 sub revbracket($)
 270 {
 271         my $brack = reverse $_[0];
 272         $brack =~ tr/[({</])}>/;
 273         return $brack;
 274 }
 275
 276 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 277
 278 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 279 {
 280         my $textref = defined $_[0] ? \$_[0] : \$_;
 281         my $ldel    = $_[1];
 282         my $rdel    = $_[2];
 283         my $pre     = defined $_[3] ? $_[3] : '\s*';
 284         my %options = defined $_[4] ? %{$_[4]} : ();
 285         my $omode   = defined $options{fail} ? $options{fail} : '';
 286         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 287                     : defined($options{reject})        ? $options{reject}
 288                     :                                    ''
 289                     ;
 290         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 291                     : defined($options{ignore})        ? $options{ignore}
 292                     :                                    ''
 293                     ;
 294
 295         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 296         $@ = undef;
 297
 298         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 299
 300         return _fail(wantarray, $textref) unless @match;
 301         return _succeed wantarray, $textref,
 302                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 303                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 304 }
 305
 306 sub _match_tagged       # ($$$$$$$)
 307 {
 308         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 309         my $rdelspec;
 310
 311         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 312
 313         unless ($$textref =~ m/\G($pre)/gc)
 314         {
 315                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 316                 goto failed;
 317         }
 318
 319         $opentagpos = pos($$textref);
 320
 321         unless ($$textref =~ m/\G$ldel/gc)
 322         {
 323                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 324                 goto failed;
 325         }
 326
 327         $textpos = pos($$textref);
 328
 329         if (!defined $rdel)
 330         {
 331                 $rdelspec = $&;
 332                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
 333                 {
 334                         _failmsg "Unable to construct closing tag to match: $rdel",
 335                                  pos $$textref;
 336                         goto failed;
 337                 }
 338         }
 339         else
 340         {
 341                 $rdelspec = eval "qq{$rdel}";
 342         }
 343
 344         while (pos($$textref) < length($$textref))
 345         {
 346                 next if $$textref =~ m/\G\\./gc;
 347
 348                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 349                 {
 350                         $parapos = pos($$textref) - length($1)
 351                                 unless defined $parapos;
 352                 }
 353                 elsif ($$textref =~ m/\G($rdelspec)/gc )
 354                 {
 355                         $closetagpos = pos($$textref)-length($1);
 356                         goto matched;
 357                 }
 358                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 359                 {
 360                         next;
 361                 }
 362                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 363                 {
 364                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
 365                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 366                         _failmsg "Found invalid nested tag: $1", pos $$textref;
 367                         goto failed;
 368                 }
 369                 elsif ($$textref =~ m/\G($ldel)/gc)
 370                 {
 371                         my $tag = $1;
 372                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
 373                         unless (_match_tagged(@_))      # MATCH NESTED TAG
 374                         {
 375                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
 376                                 _failmsg "Found unbalanced nested tag: $tag",
 377                                          pos $$textref;
 378                                 goto failed;
 379                         }
 380                 }
 381                 else { $$textref =~ m/./gcs }
 382         }
 383
 384 short:
 385         $closetagpos = pos($$textref);
 386         goto matched if $omode eq 'MAX';
 387         goto failed unless $omode eq 'PARA';
 388
 389         if (defined $parapos) { pos($$textref) = $parapos }
 390         else                  { $parapos = pos($$textref) }
 391
 392         return (
 393                 $startpos,    $opentagpos-$startpos,            # PREFIX
 394                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 395                 $textpos,     $parapos-$textpos,                # TEXT
 396                 $parapos,     0,                                # NO CLOSING TAG
 397                 $parapos,     length($$textref)-$parapos,       # REMAINDER
 398                );
 399
 400 matched:
 401         $endpos = pos($$textref);
 402         return (
 403                 $startpos,    $opentagpos-$startpos,            # PREFIX
 404                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 405                 $textpos,     $closetagpos-$textpos,            # TEXT
 406                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
 407                 $endpos,      length($$textref)-$endpos,        # REMAINDER
 408                );
 409
 410 failed:
 411         _failmsg "Did not find closing tag", pos $$textref unless $@;
 412         pos($$textref) = $startpos;
 413         return;
 414 }
 415
 416 sub extract_variable (;$$)
 417 {
 418         my $textref = defined $_[0] ? \$_[0] : \$_;
 419         return ("","","") unless defined $$textref;
 420         my $pre  = defined $_[1] ? $_[1] : '\s*';
 421
 422         my @match = _match_variable($textref,$pre);
 423
 424         return _fail wantarray, $textref unless @match;
 425
 426         return _succeed wantarray, $textref,
 427                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
 428 }
 429
 430 sub _match_variable($$)
 431 {
 432 #  $#
 433 #  $^
 434 #  $$
 435         my ($textref, $pre) = @_;
 436         my $startpos = pos($$textref) = pos($$textref)||0;
 437         unless ($$textref =~ m/\G($pre)/gc)
 438         {
 439                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 440                 return;
 441         }
 442         my $varpos = pos($$textref);
 443         unless ($$textref =~ m{\G\$\s*(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
 444         {
 445             unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
 446             {
 447                 _failmsg "Did not find leading dereferencer", pos $$textref;
 448                 pos $$textref = $startpos;
 449                 return;
 450             }
 451             my $deref = $1;
 452
 453             unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 454                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
 455                 or $deref eq '$#' or $deref eq '$$' )
 456             {
 457                 _failmsg "Bad identifier after dereferencer", pos $$textref;
 458                 pos $$textref = $startpos;
 459                 return;
 460             }
 461         }
 462
 463         while (1)
 464         {
 465                 next if _match_codeblock($textref,
 466                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 467                                          qr/[({[]/, qr/[)}\]]/,
 468                                          qr/[({[]/, qr/[)}\]]/, 0);
 469                 next if _match_codeblock($textref,
 470                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
 471                                          qr/[{[]/, qr/[}\]]/, 0);
 472                 next if _match_variable($textref,'\s*->\s*');
 473                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 474                 last;
 475         }
 476
 477         my $endpos = pos($$textref);
 478         return ($startpos, $varpos-$startpos,
 479                 $varpos,   $endpos-$varpos,
 480                 $endpos,   length($$textref)-$endpos
 481                 );
 482 }
 483
 484 sub extract_codeblock (;$$$$$)
 485 {
 486         my $textref = defined $_[0] ? \$_[0] : \$_;
 487         my $wantarray = wantarray;
 488         my $ldel_inner = defined $_[1] ? $_[1] : '{';
 489         my $pre        = defined $_[2] ? $_[2] : '\s*';
 490         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
 491         my $rd         = $_[4];
 492         my $rdel_inner = $ldel_inner;
 493         my $rdel_outer = $ldel_outer;
 494         my $posbug = pos;
 495         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
 496         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
 497         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
 498         {
 499                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
 500         }
 501         pos = $posbug;
 502
 503         my @match = _match_codeblock($textref, $pre,
 504                                      $ldel_outer, $rdel_outer,
 505                                      $ldel_inner, $rdel_inner,
 506                                      $rd);
 507         return _fail($wantarray, $textref) unless @match;
 508         return _succeed($wantarray, $textref,
 509                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
 510                        );
 511
 512 }
 513
 514 sub _match_codeblock($$$$$$$)
 515 {
 516         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 517         my $startpos = pos($$textref) = pos($$textref) || 0;
 518         unless ($$textref =~ m/\G($pre)/gc)
 519         {
 520                 _failmsg qq{Did not match prefix /$pre/ at"} .
 521                             substr($$textref,pos($$textref),20) .
 522                             q{..."},
 523                          pos $$textref;
 524                 return;
 525         }
 526         my $codepos = pos($$textref);
 527         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
 528         {
 529                 _failmsg qq{Did not find expected opening bracket at "} .
 530                              substr($$textref,pos($$textref),20) .
 531                              q{..."},
 532                          pos $$textref;
 533                 pos $$textref = $startpos;
 534                 return;
 535         }
 536         my $closing = $1;
 537            $closing =~ tr/([<{/)]>}/;
 538         my $matched;
 539         my $patvalid = 1;
 540         while (pos($$textref) < length($$textref))
 541         {
 542                 $matched = '';
 543                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 544                 {
 545                         $patvalid = 0;
 546                         next;
 547                 }
 548
 549                 if ($$textref =~ m/\G\s*#.*/gc)
 550                 {
 551                         next;
 552                 }
 553
 554                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 555                 {
 556                         unless ($matched = ($closing && $1 eq $closing) )
 557                         {
 558                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
 559                                 _failmsg q{Mismatched closing bracket at "} .
 560                                              substr($$textref,pos($$textref),20) .
 561                                              qq{...". Expected '$closing'},
 562                                          pos $$textref;
 563                         }
 564                         last;
 565                 }
 566
 567                 if (_match_variable($textref,'\s*') ||
 568                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 569                 {
 570                         $patvalid = 0;
 571                         next;
 572                 }
 573
 574
 575                 # NEED TO COVER MANY MORE CASES HERE!!!
 576                 if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
 577                                         | [!=]~
 578                                         | =(?!>)
 579                                         | (\*\*|&&|\|\||<<|>>)=?
 580                                         | split|grep|map|return
 581                                         )#gcx)
 582                 {
 583                         $patvalid = 1;
 584                         next;
 585                 }
 586
 587                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 588                 {
 589                         $patvalid = 1;
 590                         next;
 591                 }
 592
 593                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
 594                 {
 595                         _failmsg q{Improperly nested codeblock at "} .
 596                                      substr($$textref,pos($$textref),20) .
 597                                      q{..."},
 598                                  pos $$textref;
 599                         last;
 600                 }
 601
 602                 $patvalid = 0;
 603                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 604         }
 605         continue { $@ = undef }
 606
 607         unless ($matched)
 608         {
 609                 _failmsg 'No match found for opening bracket', pos $$textref
 610                         unless $@;
 611                 return;
 612         }
 613
 614         my $endpos = pos($$textref);
 615         return ( $startpos, $codepos-$startpos,
 616                  $codepos, $endpos-$codepos,
 617                  $endpos,  length($$textref)-$endpos,
 618                );
 619 }
 620
 621
 622 my %mods   = (
 623                 'none'  => '[cgimsox]*',
 624                 'm'     => '[cgimsox]*',
 625                 's'     => '[cegimsox]*',
 626                 'tr'    => '[cds]*',
 627                 'y'     => '[cds]*',
 628                 'qq'    => '',
 629                 'qx'    => '',
 630                 'qw'    => '',
 631                 'qr'    => '[imsx]*',
 632                 'q'     => '',
 633              );
 634
 635 sub extract_quotelike (;$$)
 636 {
 637         my $textref = $_[0] ? \$_[0] : \$_;
 638         my $wantarray = wantarray;
 639         my $pre  = defined $_[1] ? $_[1] : '\s*';
 640
 641         my @match = _match_quotelike($textref,$pre,1,0);
 642         return _fail($wantarray, $textref) unless @match;
 643         return _succeed($wantarray, $textref,
 644                         $match[2], $match[18]-$match[2],        # MATCH
 645                         @match[18,19],                          # REMAINDER
 646                         @match[0,1],                            # PREFIX
 647                         @match[2..17],                          # THE BITS
 648                         @match[20,21],                          # ANY FILLET?
 649                        );
 650 };
 651
 652 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 653 {
 654         my ($textref, $pre, $rawmatch, $qmark) = @_;
 655
 656         my ($textlen,$startpos,
 657             $oppos,
 658             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 659             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 660             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 661
 662         unless ($$textref =~ m/\G($pre)/gc)
 663         {
 664                 _failmsg qq{Did not find prefix /$pre/ at "} .
 665                              substr($$textref, pos($$textref), 20) .
 666                              q{..."},
 667                          pos $$textref;
 668                 return;
 669         }
 670         $oppos = pos($$textref);
 671
 672         my $initial = substr($$textref,$oppos,1);
 673
 674         if ($initial && $initial =~ m|^[\"\'\`]|
 675                      || $rawmatch && $initial =~ m|^/|
 676                      || $qmark && $initial =~ m|^\?|)
 677         {
 678                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 679                 {
 680                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 681                                      substr($$textref, $oppos, 20) .
 682                                      q{..."},
 683                                  pos $$textref;
 684                         pos $$textref = $startpos;
 685                         return;
 686                 }
 687                 $modpos= pos($$textref);
 688                 $rd1pos = $modpos-1;
 689
 690                 if ($initial eq '/' || $initial eq '?')
 691                 {
 692                         $$textref =~ m/\G$mods{none}/gc
 693                 }
 694
 695                 my $endpos = pos($$textref);
 696                 return (
 697                         $startpos,      $oppos-$startpos,       # PREFIX
 698                         $oppos,         0,                      # NO OPERATOR
 699                         $oppos,         1,                      # LEFT DEL
 700                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 701                         $rd1pos,        1,                      # RIGHT DEL
 702                         $modpos,        0,                      # NO 2ND LDEL
 703                         $modpos,        0,                      # NO 2ND STR
 704                         $modpos,        0,                      # NO 2ND RDEL
 705                         $modpos,        $endpos-$modpos,        # MODIFIERS
 706                         $endpos,        $textlen-$endpos,       # REMAINDER
 707                        );
 708         }
 709
 710         unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
 711         {
 712                 _failmsg q{No quotelike operator found after prefix at "} .
 713                              substr($$textref, pos($$textref), 20) .
 714                              q{..."},
 715                          pos $$textref;
 716                 pos $$textref = $startpos;
 717                 return;
 718         }
 719
 720         my $op = $1;
 721         $preld1pos = pos($$textref);
 722         if ($op eq '<<') {
 723                 $ld1pos = pos($$textref);
 724                 my $label;
 725                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 726                         $label = $1;
 727                 }
 728                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 729                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 730                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 731                                      }gcsx) {
 732                         $label = $+;
 733                 }
 734                 else {
 735                         $label = "";
 736                 }
 737                 my $extrapos = pos($$textref);
 738                 $$textref =~ m{.*\n}gc;
 739                 $str1pos = pos($$textref);
 740                 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
 741                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 742                                      substr($$textref, $startpos, 20) .
 743                                      q{..."},
 744                                  pos $$textref;
 745                         pos $$textref = $startpos;
 746                         return;
 747                 }
 748                 $rd1pos = pos($$textref);
 749                 $$textref =~ m{$label\n}gc;
 750                 $ld2pos = pos($$textref);
 751                 return (
 752                         $startpos,      $oppos-$startpos,       # PREFIX
 753                         $oppos,         length($op),            # OPERATOR
 754                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 755                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 756                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 757                         $ld2pos,        0,                      # NO 2ND LDEL
 758                         $ld2pos,        0,                      # NO 2ND STR
 759                         $ld2pos,        0,                      # NO 2ND RDEL
 760                         $ld2pos,        0,                      # NO MODIFIERS
 761                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 762                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 763                        );
 764         }
 765
 766         $$textref =~ m/\G\s*/gc;
 767         $ld1pos = pos($$textref);
 768         $str1pos = $ld1pos+1;
 769
 770         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 771         {
 772                 _failmsg "No block delimiter found after quotelike $op",
 773                          pos $$textref;
 774                 pos $$textref = $startpos;
 775                 return;
 776         }
 777         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 778         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 779         if ($ldel1 =~ /[[(<{]/)
 780         {
 781                 $rdel1 =~ tr/[({</])}>/;
 782                 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
 783                 || do { pos $$textref = $startpos; return };
 784         }
 785         else
 786         {
 787                 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 788                 || do { pos $$textref = $startpos; return };
 789         }
 790         $ld2pos = $rd1pos = pos($$textref)-1;
 791
 792         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 793         if ($second_arg)
 794         {
 795                 my ($ldel2, $rdel2);
 796                 if ($ldel1 =~ /[[(<{]/)
 797                 {
 798                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 799                         {
 800                                 _failmsg "Missing second block for quotelike $op",
 801                                          pos $$textref;
 802                                 pos $$textref = $startpos;
 803                                 return;
 804                         }
 805                         $ldel2 = $rdel2 = "\Q$1";
 806                         $rdel2 =~ tr/[({</])}>/;
 807                 }
 808                 else
 809                 {
 810                         $ldel2 = $rdel2 = $ldel1;
 811                 }
 812                 $str2pos = $ld2pos+1;
 813
 814                 if ($ldel2 =~ /[[(<{]/)
 815                 {
 816                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 817                         _match_bracketed($textref,"",$ldel2,"","",$rdel2)
 818                         || do { pos $$textref = $startpos; return };
 819                 }
 820                 else
 821                 {
 822                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 823                         || do { pos $$textref = $startpos; return };
 824                 }
 825                 $rd2pos = pos($$textref)-1;
 826         }
 827         else
 828         {
 829                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 830         }
 831
 832         $modpos = pos $$textref;
 833
 834         $$textref =~ m/\G($mods{$op})/gc;
 835         my $endpos = pos $$textref;
 836
 837         return (
 838                 $startpos,      $oppos-$startpos,       # PREFIX
 839                 $oppos,         length($op),            # OPERATOR
 840                 $ld1pos,        1,                      # LEFT DEL
 841                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 842                 $rd1pos,        1,                      # RIGHT DEL
 843                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 844                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 845                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 846                 $modpos,        $endpos-$modpos,        # MODIFIERS
 847                 $endpos,        $textlen-$endpos,       # REMAINDER
 848                );
 849 }
 850
 851 my $def_func =
 852 [
 853         sub { extract_variable($_[0], '') },
 854         sub { extract_quotelike($_[0],'') },
 855         sub { extract_codeblock($_[0],'{}','') },
 856 ];
 857
 858 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 859 {
 860         my $textref = defined($_[0]) ? \$_[0] : \$_;
 861         my $posbug = pos;
 862         my ($lastpos, $firstpos);
 863         my @fields = ();
 864
 865         #for ($$textref)
 866         {
 867                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 868                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 869                 my $igunk = $_[3];
 870
 871                 pos $$textref ||= 0;
 872
 873                 unless (wantarray)
 874                 {
 875                         use Carp;
 876                         carp "extract_multiple reset maximal count to 1 in scalar context"
 877                                 if $^W && defined($_[2]) && $max > 1;
 878                         $max = 1
 879                 }
 880
 881                 my $unkpos;
 882                 my $func;
 883                 my $class;
 884
 885                 my @class;
 886                 foreach $func ( @func )
 887                 {
 888                         if (ref($func) eq 'HASH')
 889                         {
 890                                 push @class, (keys %$func)[0];
 891                                 $func = (values %$func)[0];
 892                         }
 893                         else
 894                         {
 895                                 push @class, undef;
 896                         }
 897                 }
 898
 899                 FIELD: while (pos($$textref) < length($$textref))
 900                 {
 901                         my $field;
 902                         my @bits;
 903                         foreach my $i ( 0..$#func )
 904                         {
 905                                 my $pref;
 906                                 $func = $func[$i];
 907                                 $class = $class[$i];
 908                                 $lastpos = pos $$textref;
 909                                 if (ref($func) eq 'CODE')
 910                                         { ($field,undef,$pref) = @bits = $func->($$textref) }
 911                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 912                                         { @bits = $field = $func->extract($$textref) }
 913                                 elsif( $$textref =~ m/\G$func/gc )
 914                                         { @bits = $field = defined($1) ? $1 : $& }
 915                                 $pref ||= "";
 916                                 if (defined($field) && length($field))
 917                                 {
 918                                         if (!$igunk) {
 919                                                 $unkpos = pos $$textref
 920                                                         if length($pref) && !defined($unkpos);
 921                                                 if (defined $unkpos)
 922                                                 {
 923                                                         push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 924                                                         $firstpos = $unkpos unless defined $firstpos;
 925                                                         undef $unkpos;
 926                                                         last FIELD if @fields == $max;
 927                                                 }
 928                                         }
 929                                         push @fields, $class
 930                                                 ? bless (\$field, $class)
 931                                                 : $field;
 932                                         $firstpos = $lastpos unless defined $firstpos;
 933                                         $lastpos = pos $$textref;
 934                                         last FIELD if @fields == $max;
 935                                         next FIELD;
 936                                 }
 937                         }
 938                         if ($$textref =~ /\G(.)/gcs)
 939                         {
 940                                 $unkpos = pos($$textref)-1
 941                                         unless $igunk || defined $unkpos;
 942                         }
 943                 }
 944
 945                 if (defined $unkpos)
 946                 {
 947                         push @fields, substr($$textref, $unkpos);
 948                         $firstpos = $unkpos unless defined $firstpos;
 949                         $lastpos = length $$textref;
 950                 }
 951                 last;
 952         }
 953
 954         pos $$textref = $lastpos;
 955         return @fields if wantarray;
 956
 957         $firstpos ||= 0;
 958         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 959                pos $$textref = $firstpos };
 960         return $fields[0];
 961 }
 962
 963
 964 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 965 {
 966         my $ldel    = $_[0];
 967         my $rdel    = $_[1];
 968         my $pre     = defined $_[2] ? $_[2] : '\s*';
 969         my %options = defined $_[3] ? %{$_[3]} : ();
 970         my $omode   = defined $options{fail} ? $options{fail} : '';
 971         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 972                     : defined($options{reject})        ? $options{reject}
 973                     :                                    ''
 974                     ;
 975         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 976                     : defined($options{ignore})        ? $options{ignore}
 977                     :                                    ''
 978                     ;
 979
 980         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 981
 982         my $posbug = pos;
 983         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
 984         pos = $posbug;
 985
 986         my $closure = sub
 987         {
 988                 my $textref = defined $_[0] ? \$_[0] : \$_;
 989                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 990
 991                 return _fail(wantarray, $textref) unless @match;
 992                 return _succeed wantarray, $textref,
 993                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 994                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 995         };
 996
 997         bless $closure, 'Text::Balanced::Extractor';
 998 }
 999
1000 package Text::Balanced::Extractor;
1001
1002 sub extract($$) # ($self, $text)
1003 {
1004         &{$_[0]}($_[1]);
1005 }
1006
1007 package Text::Balanced::ErrorMsg;
1008
1009 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1010
1011 1;
1012
1013 __END__
1014
1015 =head1 NAME
1016
1017 Text::Balanced - Extract delimited text sequences from strings.
1018
1019
1020 =head1 SYNOPSIS
1021
1022  use Text::Balanced qw (
1023                         extract_delimited
1024                         extract_bracketed
1025                         extract_quotelike
1026                         extract_codeblock
1027                         extract_variable
1028                         extract_tagged
1029                         extract_multiple
1030
1031                         gen_delimited_pat
1032                         gen_extract_tagged
1033                        );
1034
1035  # Extract the initial substring of $text that is delimited by
1036  # two (unescaped) instances of the first character in $delim.
1037
1038         ($extracted, $remainder) = extract_delimited($text,$delim);
1039
1040
1041  # Extract the initial substring of $text that is bracketed
1042  # with a delimiter(s) specified by $delim (where the string
1043  # in $delim contains one or more of '(){}[]<>').
1044
1045         ($extracted, $remainder) = extract_bracketed($text,$delim);
1046
1047
1048  # Extract the initial substring of $text that is bounded by
1049  # an HTML/XML tag.
1050
1051         ($extracted, $remainder) = extract_tagged($text);
1052
1053
1054  # Extract the initial substring of $text that is bounded by
1055  # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1056
1057         ($extracted, $remainder) =
1058                 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1059
1060
1061  # Extract the initial substring of $text that represents a
1062  # Perl "quote or quote-like operation"
1063
1064         ($extracted, $remainder) = extract_quotelike($text);
1065
1066
1067  # Extract the initial substring of $text that represents a block
1068  # of Perl code, bracketed by any of character(s) specified by $delim
1069  # (where the string $delim contains one or more of '(){}[]<>').
1070
1071         ($extracted, $remainder) = extract_codeblock($text,$delim);
1072
1073
1074  # Extract the initial substrings of $text that would be extracted by
1075  # one or more sequential applications of the specified functions
1076  # or regular expressions
1077
1078         @extracted = extract_multiple($text,
1079                                       [ \&extract_bracketed,
1080                                         \&extract_quotelike,
1081                                         \&some_other_extractor_sub,
1082                                         qr/[xyz]*/,
1083                                         'literal',
1084                                       ]);
1085
1086 # Create a string representing an optimized pattern (a la Friedl)
1087 # that matches a substring delimited by any of the specified characters
1088 # (in this case: any type of quote or a slash)
1089
1090         $patstring = gen_delimited_pat(q{'"`/});
1091
1092
1093 # Generate a reference to an anonymous sub that is just like extract_tagged
1094 # but pre-compiled and optimized for a specific pair of tags, and consequently
1095 # much faster (i.e. 3 times faster). It uses qr// for better performance on
1096 # repeated calls, so it only works under Perl 5.005 or later.
1097
1098         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1099
1100         ($extracted, $remainder) = $extract_head->($text);
1101
1102
1103 =head1 DESCRIPTION
1104
1105 The various C<extract_...> subroutines may be used to extract a
1106 delimited string (possibly after skipping a specified prefix string).
1107 The search for the string always begins at the current C<pos>
1108 location of the string's variable (or at index zero, if no C<pos>
1109 position is defined).
1110
1111 =head2 General behaviour in list contexts
1112
1113 In a list context, all the subroutines return a list, the first three
1114 elements of which are always:
1115
1116 =over 4
1117
1118 =item [0]
1119
1120 The extracted string, including the specified delimiters.
1121 If the extraction fails an empty string is returned.
1122
1123 =item [1]
1124
1125 The remainder of the input string (i.e. the characters after the
1126 extracted string). On failure, the entire string is returned.
1127
1128 =item [2]
1129
1130 The skipped prefix (i.e. the characters before the extracted string).
1131 On failure, the empty string is returned.
1132
1133 =back
1134
1135 Note that in a list context, the contents of the original input text (the first
1136 argument) are not modified in any way.
1137
1138 However, if the input text was passed in a variable, that variable's
1139 C<pos> value is updated to point at the first character after the
1140 extracted text. That means that in a list context the various
1141 subroutines can be used much like regular expressions. For example:
1142
1143         while ( $next = (extract_quotelike($text))[0] )
1144         {
1145                 # process next quote-like (in $next)
1146         }
1147
1148
1149 =head2 General behaviour in scalar and void contexts
1150
1151 In a scalar context, the extracted string is returned, having first been
1152 removed from the input text. Thus, the following code also processes
1153 each quote-like operation, but actually removes them from $text:
1154
1155         while ( $next = extract_quotelike($text) )
1156         {
1157                 # process next quote-like (in $next)
1158         }
1159
1160 Note that if the input text is a read-only string (i.e. a literal),
1161 no attempt is made to remove the extracted text.
1162
1163 In a void context the behaviour of the extraction subroutines is
1164 exactly the same as in a scalar context, except (of course) that the
1165 extracted substring is not returned.
1166
1167 =head2 A note about prefixes
1168
1169 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1170 This can bite you if you're expecting a prefix specification like
1171 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1172 pattern will only succeed if the <H1> tag is on the current line, since
1173 . normally doesn't match newlines.
1174
1175 To overcome this limitation, you need to turn on /s matching within
1176 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1177
1178
1179 =head2 C<extract_delimited>
1180
1181 The C<extract_delimited> function formalizes the common idiom
1182 of extracting a single-character-delimited substring from the start of
1183 a string. For example, to extract a single-quote delimited string, the
1184 following code is typically used:
1185
1186         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1187         $extracted = $1;
1188
1189 but with C<extract_delimited> it can be simplified to:
1190
1191         ($extracted,$remainder) = extract_delimited($text, "'");
1192
1193 C<extract_delimited> takes up to four scalars (the input text, the
1194 delimiters, a prefix pattern to be skipped, and any escape characters)
1195 and extracts the initial substring of the text that
1196 is appropriately delimited. If the delimiter string has multiple
1197 characters, the first one encountered in the text is taken to delimit
1198 the substring.
1199 The third argument specifies a prefix pattern that is to be skipped
1200 (but must be present!) before the substring is extracted.
1201 The final argument specifies the escape character to be used for each
1202 delimiter.
1203
1204 All arguments are optional. If the escape characters are not specified,
1205 every delimiter is escaped with a backslash (C<\>).
1206 If the prefix is not specified, the
1207 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1208 is also not specified, the set C</["'`]/> is used. If the text to be processed
1209 is not specified either, C<$_> is used.
1210
1211 In list context, C<extract_delimited> returns an array of three
1212 elements, the extracted substring (I<including the surrounding
1213 delimiters>), the remainder of the text, and the skipped prefix (if
1214 any). If a suitable delimited substring is not found, the first
1215 element of the array is the empty string, the second is the complete
1216 original text, and the prefix returned in the third element is an
1217 empty string.
1218
1219 In a scalar context, just the extracted substring is returned. In
1220 a void context, the extracted substring (and any prefix) are simply
1221 removed from the beginning of the first argument.
1222
1223 Examples:
1224
1225         # Remove a single-quoted substring from the very beginning of $text:
1226
1227                 $substring = extract_delimited($text, "'", '');
1228
1229         # Remove a single-quoted Pascalish substring (i.e. one in which
1230         # doubling the quote character escapes it) from the very
1231         # beginning of $text:
1232
1233                 $substring = extract_delimited($text, "'", '', "'");
1234
1235         # Extract a single- or double- quoted substring from the
1236         # beginning of $text, optionally after some whitespace
1237         # (note the list context to protect $text from modification):
1238
1239                 ($substring) = extract_delimited $text, q{"'};
1240
1241
1242         # Delete the substring delimited by the first '/' in $text:
1243
1244                 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1245
1246 Note that this last example is I<not> the same as deleting the first
1247 quote-like pattern. For instance, if C<$text> contained the string:
1248
1249         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1250
1251 then after the deletion it would contain:
1252
1253         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1254
1255 not:
1256
1257         "if ('./cmd' =~ ms) { $cmd = $1; }"
1258
1259
1260 See L<"extract_quotelike"> for a (partial) solution to this problem.
1261
1262
1263 =head2 C<extract_bracketed>
1264
1265 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1266 up to three optional scalar arguments: a string to extract from, a delimiter
1267 specifier, and a prefix pattern. As before, a missing prefix defaults to
1268 optional whitespace and a missing text defaults to C<$_>. However, a missing
1269 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1270
1271 C<extract_bracketed> extracts a balanced-bracket-delimited
1272 substring (using any one (or more) of the user-specified delimiter
1273 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1274 respect quoted unbalanced brackets (see below).
1275
1276 A "delimiter bracket" is a bracket in list of delimiters passed as
1277 C<extract_bracketed>'s second argument. Delimiter brackets are
1278 specified by giving either the left or right (or both!) versions
1279 of the required bracket(s). Note that the order in which
1280 two or more delimiter brackets are specified is not significant.
1281
1282 A "balanced-bracket-delimited substring" is a substring bounded by
1283 matched brackets, such that any other (left or right) delimiter
1284 bracket I<within> the substring is also matched by an opposite
1285 (right or left) delimiter bracket I<at the same level of nesting>. Any
1286 type of bracket not in the delimiter list is treated as an ordinary
1287 character.
1288
1289 In other words, each type of bracket specified as a delimiter must be
1290 balanced and correctly nested within the substring, and any other kind of
1291 ("non-delimiter") bracket in the substring is ignored.
1292
1293 For example, given the string:
1294
1295         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1296
1297 then a call to C<extract_bracketed> in a list context:
1298
1299         @result = extract_bracketed( $text, '{}' );
1300
1301 would return:
1302
1303         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1304
1305 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1306 (In a scalar context just the first element of the array would be returned. In
1307 a void context, C<$text> would be replaced by an empty string.)
1308
1309 Likewise the call in:
1310
1311         @result = extract_bracketed( $text, '{[' );
1312
1313 would return the same result, since all sets of both types of specified
1314 delimiter brackets are correctly nested and balanced.
1315
1316 However, the call in:
1317
1318         @result = extract_bracketed( $text, '{([<' );
1319
1320 would fail, returning:
1321
1322         ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1323
1324 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1325 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1326 return an empty string. In a void context, C<$text> would be unchanged.)
1327
1328 Note that the embedded single-quotes in the string don't help in this
1329 case, since they have not been specified as acceptable delimiters and are
1330 therefore treated as non-delimiter characters (and ignored).
1331
1332 However, if a particular species of quote character is included in the
1333 delimiter specification, then that type of quote will be correctly handled.
1334 for example, if C<$text> is:
1335
1336         $text = '<A HREF=">>>>">link</A>';
1337
1338 then
1339
1340         @result = extract_bracketed( $text, '<">' );
1341
1342 returns:
1343
1344         ( '<A HREF=">>>>">', 'link</A>', "" )
1345
1346 as expected. Without the specification of C<"> as an embedded quoter:
1347
1348         @result = extract_bracketed( $text, '<>' );
1349
1350 the result would be:
1351
1352         ( '<A HREF=">', '>>>">link</A>', "" )
1353
1354 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1355 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1356 letter 'q' as a delimiter. Hence:
1357
1358         @result = extract_bracketed( $text, '<q>' );
1359
1360 would correctly match something like this:
1361
1362         $text = '<leftop: conj /and/ conj>';
1363
1364 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1365
1366
1367 =head2 C<extract_tagged>
1368
1369 C<extract_tagged> extracts and segments text between (balanced)
1370 specified tags.
1371
1372 The subroutine takes up to five optional arguments:
1373
1374 =over 4
1375
1376 =item 1.
1377
1378 A string to be processed (C<$_> if the string is omitted or C<undef>)
1379
1380 =item 2.
1381
1382 A string specifying a pattern to be matched as the opening tag.
1383 If the pattern string is omitted (or C<undef>) then a pattern
1384 that matches any standard HTML/XML tag is used.
1385
1386 =item 3.
1387
1388 A string specifying a pattern to be matched at the closing tag.
1389 If the pattern string is omitted (or C<undef>) then the closing
1390 tag is constructed by inserting a C</> after any leading bracket
1391 characters in the actual opening tag that was matched (I<not> the pattern
1392 that matched the tag). For example, if the opening tag pattern
1393 is specified as C<'{{\w+}}'> and actually matched the opening tag
1394 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1395
1396 =item 4.
1397
1398 A string specifying a pattern to be matched as a prefix (which is to be
1399 skipped). If omitted, optional whitespace is skipped.
1400
1401 =item 5.
1402
1403 A hash reference containing various parsing options (see below)
1404
1405 =back
1406
1407 The various options that can be specified are:
1408
1409 =over 4
1410
1411 =item C<reject =E<gt> $listref>
1412
1413 The list reference contains one or more strings specifying patterns
1414 that must I<not> appear within the tagged text.
1415
1416 For example, to extract
1417 an HTML link (which should not contain nested links) use:
1418
1419         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1420
1421 =item C<ignore =E<gt> $listref>
1422
1423 The list reference contains one or more strings specifying patterns
1424 that are I<not> be be treated as nested tags within the tagged text
1425 (even if they would match the start tag pattern).
1426
1427 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1428
1429         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1430
1431 (also see L<"gen_delimited_pat"> below).
1432
1433
1434 =item C<fail =E<gt> $str>
1435
1436 The C<fail> option indicates the action to be taken if a matching end
1437 tag is not encountered (i.e. before the end of the string or some
1438 C<reject> pattern matches). By default, a failure to match a closing
1439 tag causes C<extract_tagged> to immediately fail.
1440
1441 However, if the string value associated with <reject> is "MAX", then
1442 C<extract_tagged> returns the complete text up to the point of failure.
1443 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1444 after the tag (up to the first line that is either empty or contains
1445 only whitespace characters).
1446 If the string is "", the default behaviour (i.e. failure) is reinstated.
1447
1448 For example, suppose the start tag "/para" introduces a paragraph, which then
1449 continues until the next "/endpara" tag or until another "/para" tag is
1450 encountered:
1451
1452         $text = "/para line 1\n\nline 3\n/para line 4";
1453
1454         extract_tagged($text, '/para', '/endpara', undef,
1455                                 {reject => '/para', fail => MAX );
1456
1457         # EXTRACTED: "/para line 1\n\nline 3\n"
1458
1459 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1460 tag refers only to the immediately following paragraph:
1461
1462         $text = "/para line 1\n\nline 3\n/para line 4";
1463
1464         extract_tagged($text, '/para', '/endpara', undef,
1465                         {reject => '/para', fail => MAX );
1466
1467         # EXTRACTED: "/para line 1\n"
1468
1469 Note that the specified C<fail> behaviour applies to nested tags as well.
1470
1471 =back
1472
1473 On success in a list context, an array of 6 elements is returned. The elements are:
1474
1475 =over 4
1476
1477 =item [0]
1478
1479 the extracted tagged substring (including the outermost tags),
1480
1481 =item [1]
1482
1483 the remainder of the input text,
1484
1485 =item [2]
1486
1487 the prefix substring (if any),
1488
1489 =item [3]
1490
1491 the opening tag
1492
1493 =item [4]
1494
1495 the text between the opening and closing tags
1496
1497 =item [5]
1498
1499 the closing tag (or "" if no closing tag was found)
1500
1501 =back
1502
1503 On failure, all of these values (except the remaining text) are C<undef>.
1504
1505 In a scalar context, C<extract_tagged> returns just the complete
1506 substring that matched a tagged text (including the start and end
1507 tags). C<undef> is returned on failure. In addition, the original input
1508 text has the returned substring (and any prefix) removed from it.
1509
1510 In a void context, the input text just has the matched substring (and
1511 any specified prefix) removed.
1512
1513
1514 =head2 C<gen_extract_tagged>
1515
1516 (Note: This subroutine is only available under Perl5.005)
1517
1518 C<gen_extract_tagged> generates a new anonymous subroutine which
1519 extracts text between (balanced) specified tags. In other words,
1520 it generates a function identical in function to C<extract_tagged>.
1521
1522 The difference between C<extract_tagged> and the anonymous
1523 subroutines generated by
1524 C<gen_extract_tagged>, is that those generated subroutines:
1525
1526 =over 4
1527
1528 =item *
1529
1530 do not have to reparse tag specification or parsing options every time
1531 they are called (whereas C<extract_tagged> has to effectively rebuild
1532 its tag parser on every call);
1533
1534 =item *
1535
1536 make use of the new qr// construct to pre-compile the regexes they use
1537 (whereas C<extract_tagged> uses standard string variable interpolation
1538 to create tag-matching patterns).
1539
1540 =back
1541
1542 The subroutine takes up to four optional arguments (the same set as
1543 C<extract_tagged> except for the string to be processed). It returns
1544 a reference to a subroutine which in turn takes a single argument (the text to
1545 be extracted from).
1546
1547 In other words, the implementation of C<extract_tagged> is exactly
1548 equivalent to:
1549
1550         sub extract_tagged
1551         {
1552                 my $text = shift;
1553                 $extractor = gen_extract_tagged(@_);
1554                 return $extractor->($text);
1555         }
1556
1557 (although C<extract_tagged> is not currently implemented that way, in order
1558 to preserve pre-5.005 compatibility).
1559
1560 Using C<gen_extract_tagged> to create extraction functions for specific tags
1561 is a good idea if those functions are going to be called more than once, since
1562 their performance is typically twice as good as the more general-purpose
1563 C<extract_tagged>.
1564
1565
1566 =head2 C<extract_quotelike>
1567
1568 C<extract_quotelike> attempts to recognize, extract, and segment any
1569 one of the various Perl quotes and quotelike operators (see
1570 L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1571 delimiters (for the quotelike operators), and trailing modifiers are
1572 all caught. For example, in:
1573
1574         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1575
1576         extract_quotelike '  "You said, \"Use sed\"."  '
1577
1578         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1579
1580         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1581
1582 the full Perl quotelike operations are all extracted correctly.
1583
1584 Note too that, when using the /x modifier on a regex, any comment
1585 containing the current pattern delimiter will cause the regex to be
1586 immediately terminated. In other words:
1587
1588         'm /
1589                 (?i)            # CASE INSENSITIVE
1590                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1591                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1592            /x'
1593
1594 will be extracted as if it were:
1595
1596         'm /
1597                 (?i)            # CASE INSENSITIVE
1598                 [a-z_]          # LEADING ALPHABETIC/'
1599
1600 This behaviour is identical to that of the actual compiler.
1601
1602 C<extract_quotelike> takes two arguments: the text to be processed and
1603 a prefix to be matched at the very beginning of the text. If no prefix
1604 is specified, optional whitespace is the default. If no text is given,
1605 C<$_> is used.
1606
1607 In a list context, an array of 11 elements is returned. The elements are:
1608
1609 =over 4
1610
1611 =item [0]
1612
1613 the extracted quotelike substring (including trailing modifiers),
1614
1615 =item [1]
1616
1617 the remainder of the input text,
1618
1619 =item [2]
1620
1621 the prefix substring (if any),
1622
1623 =item [3]
1624
1625 the name of the quotelike operator (if any),
1626
1627 =item [4]
1628
1629 the left delimiter of the first block of the operation,
1630
1631 =item [5]
1632
1633 the text of the first block of the operation
1634 (that is, the contents of
1635 a quote, the regex of a match or substitution or the target list of a
1636 translation),
1637
1638 =item [6]
1639
1640 the right delimiter of the first block of the operation,
1641
1642 =item [7]
1643
1644 the left delimiter of the second block of the operation
1645 (that is, if it is an C<s>, C<tr>, or C<y>),
1646
1647 =item [8]
1648
1649 the text of the second block of the operation
1650 (that is, the replacement of a substitution or the translation list
1651 of a translation),
1652
1653 =item [9]
1654
1655 the right delimiter of the second block of the operation (if any),
1656
1657 =item [10]
1658
1659 the trailing modifiers on the operation (if any).
1660
1661 =back
1662
1663 For each of the fields marked "(if any)" the default value on success is
1664 an empty string.
1665 On failure, all of these values (except the remaining text) are C<undef>.
1666
1667
1668 In a scalar context, C<extract_quotelike> returns just the complete substring
1669 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1670 void context, the input text has the same substring (and any specified
1671 prefix) removed.
1672
1673 Examples:
1674
1675         # Remove the first quotelike literal that appears in text
1676
1677                 $quotelike = extract_quotelike($text,'.*?');
1678
1679         # Replace one or more leading whitespace-separated quotelike
1680         # literals in $_ with "<QLL>"
1681
1682                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1683
1684
1685         # Isolate the search pattern in a quotelike operation from $text
1686
1687                 ($op,$pat) = (extract_quotelike $text)[3,5];
1688                 if ($op =~ /[ms]/)
1689                 {
1690                         print "search pattern: $pat\n";
1691                 }
1692                 else
1693                 {
1694                         print "$op is not a pattern matching operation\n";
1695                 }
1696
1697
1698 =head2 C<extract_quotelike> and "here documents"
1699
1700 C<extract_quotelike> can successfully extract "here documents" from an input
1701 string, but with an important caveat in list contexts.
1702
1703 Unlike other types of quote-like literals, a here document is rarely
1704 a contiguous substring. For example, a typical piece of code using
1705 here document might look like this:
1706
1707         <<'EOMSG' || die;
1708         This is the message.
1709         EOMSG
1710         exit;
1711
1712 Given this as an input string in a scalar context, C<extract_quotelike>
1713 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1714 leaving the string " || die;\nexit;" in the original variable. In other words,
1715 the two separate pieces of the here document are successfully extracted and
1716 concatenated.
1717
1718 In a list context, C<extract_quotelike> would return the list
1719
1720 =over 4
1721
1722 =item [0]
1723
1724 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1725 including fore and aft delimiters),
1726
1727 =item [1]
1728
1729 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1730
1731 =item [2]
1732
1733 "" (i.e. the prefix substring -- trivial in this case),
1734
1735 =item [3]
1736
1737 "<<" (i.e. the "name" of the quotelike operator)
1738
1739 =item [4]
1740
1741 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1742
1743 =item [5]
1744
1745 "This is the message.\n" (i.e. the text of the here document),
1746
1747 =item [6]
1748
1749 "EOMSG" (i.e. the right delimiter of the here document),
1750
1751 =item [7..10]
1752
1753 "" (a here document has no second left delimiter, second text, second right
1754 delimiter, or trailing modifiers).
1755
1756 =back
1757
1758 However, the matching position of the input variable would be set to
1759 "exit;" (i.e. I<after> the closing delimiter of the here document),
1760 which would cause the earlier " || die;\nexit;" to be skipped in any
1761 sequence of code fragment extractions.
1762
1763 To avoid this problem, when it encounters a here document while
1764 extracting from a modifiable string, C<extract_quotelike> silently
1765 rearranges the string to an equivalent piece of Perl:
1766
1767         <<'EOMSG'
1768         This is the message.
1769         EOMSG
1770         || die;
1771         exit;
1772
1773 in which the here document I<is> contiguous. It still leaves the
1774 matching position after the here document, but now the rest of the line
1775 on which the here document starts is not skipped.
1776
1777 To prevent <extract_quotelike> from mucking about with the input in this way
1778 (this is the only case where a list-context C<extract_quotelike> does so),
1779 you can pass the input variable as an interpolated literal:
1780
1781         $quotelike = extract_quotelike("$var");
1782
1783
1784 =head2 C<extract_codeblock>
1785
1786 C<extract_codeblock> attempts to recognize and extract a balanced
1787 bracket delimited substring that may contain unbalanced brackets
1788 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1789 is like a combination of C<"extract_bracketed"> and
1790 C<"extract_quotelike">.
1791
1792 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1793 a text to process, a set of delimiter brackets to look for, and a prefix to
1794 match first. It also takes an optional fourth parameter, which allows the
1795 outermost delimiter brackets to be specified separately (see below).
1796
1797 Omitting the first argument (input text) means process C<$_> instead.
1798 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1799 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1800 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1801 value of the second argument is to be used for the outermost delimiters.
1802
1803 Once the prefix an the outermost opening delimiter bracket have been
1804 recognized, code blocks are extracted by stepping through the input text and
1805 trying the following alternatives in sequence:
1806
1807 =over 4
1808
1809 =item 1.
1810
1811 Try and match a closing delimiter bracket. If the bracket was the same
1812 species as the last opening bracket, return the substring to that
1813 point. If the bracket was mismatched, return an error.
1814
1815 =item 2.
1816
1817 Try to match a quote or quotelike operator. If found, call
1818 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1819 the error it returned. Otherwise go back to step 1.
1820
1821 =item 3.
1822
1823 Try to match an opening delimiter bracket. If found, call
1824 C<extract_codeblock> recursively to eat the embedded block. If the
1825 recursive call fails, return an error. Otherwise, go back to step 1.
1826
1827 =item 4.
1828
1829 Unconditionally match a bareword or any other single character, and
1830 then go back to step 1.
1831
1832 =back
1833
1834
1835 Examples:
1836
1837         # Find a while loop in the text
1838
1839                 if ($text =~ s/.*?while\s*\{/{/)
1840                 {
1841                         $loop = "while " . extract_codeblock($text);
1842                 }
1843
1844         # Remove the first round-bracketed list (which may include
1845         # round- or curly-bracketed code blocks or quotelike operators)
1846
1847                 extract_codeblock $text, "(){}", '[^(]*';
1848
1849
1850 The ability to specify a different outermost delimiter bracket is useful
1851 in some circumstances. For example, in the Parse::RecDescent module,
1852 parser actions which are to be performed only on a successful parse
1853 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1854
1855         sentence: subject verb object
1856                         <defer: {$::theVerb = $item{verb}} >
1857
1858 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1859 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1860
1861 A deferred action like this:
1862
1863                         <defer: {if ($count>10) {$count--}} >
1864
1865 will be incorrectly parsed as:
1866
1867                         <defer: {if ($count>
1868
1869 because the "less than" operator is interpreted as a closing delimiter.
1870
1871 But, by extracting the directive using
1872 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1873 the '>' character is only treated as a delimited at the outermost
1874 level of the code block, so the directive is parsed correctly.
1875
1876 =head2 C<extract_multiple>
1877
1878 The C<extract_multiple> subroutine takes a string to be processed and a
1879 list of extractors (subroutines or regular expressions) to apply to that string.
1880
1881 In an array context C<extract_multiple> returns an array of substrings
1882 of the original string, as extracted by the specified extractors.
1883 In a scalar context, C<extract_multiple> returns the first
1884 substring successfully extracted from the original string. In both
1885 scalar and void contexts the original string has the first successfully
1886 extracted substring removed from it. In all contexts
1887 C<extract_multiple> starts at the current C<pos> of the string, and
1888 sets that C<pos> appropriately after it matches.
1889
1890 Hence, the aim of a call to C<extract_multiple> in a list context
1891 is to split the processed string into as many non-overlapping fields as
1892 possible, by repeatedly applying each of the specified extractors
1893 to the remainder of the string. Thus C<extract_multiple> is
1894 a generalized form of Perl's C<split> subroutine.
1895
1896 The subroutine takes up to four optional arguments:
1897
1898 =over 4
1899
1900 =item 1.
1901
1902 A string to be processed (C<$_> if the string is omitted or C<undef>)
1903
1904 =item 2.
1905
1906 A reference to a list of subroutine references and/or qr// objects and/or
1907 literal strings and/or hash references, specifying the extractors
1908 to be used to split the string. If this argument is omitted (or
1909 C<undef>) the list:
1910
1911         [
1912                 sub { extract_variable($_[0], '') },
1913                 sub { extract_quotelike($_[0],'') },
1914                 sub { extract_codeblock($_[0],'{}','') },
1915         ]
1916
1917 is used.
1918
1919
1920 =item 3.
1921
1922 A number specifying the maximum number of fields to return. If this
1923 argument is omitted (or C<undef>), split continues as long as possible.
1924
1925 If the third argument is I<N>, then extraction continues until I<N> fields
1926 have been successfully extracted, or until the string has been completely
1927 processed.
1928
1929 Note that in scalar and void contexts the value of this argument is
1930 automatically reset to 1 (under C<-w>, a warning is issued if the argument
1931 has to be reset).
1932
1933 =item 4.
1934
1935 A value indicating whether unmatched substrings (see below) within the
1936 text should be skipped or returned as fields. If the value is true,
1937 such substrings are skipped. Otherwise, they are returned.
1938
1939 =back
1940
1941 The extraction process works by applying each extractor in
1942 sequence to the text string.
1943
1944 If the extractor is a subroutine it is called in a list context and is
1945 expected to return a list of a single element, namely the extracted
1946 text. It may optionally also return two further arguments: a string
1947 representing the text left after extraction (like $' for a pattern
1948 match), and a string representing any prefix skipped before the
1949 extraction (like $` in a pattern match). Note that this is designed
1950 to facilitate the use of other Text::Balanced subroutines with
1951 C<extract_multiple>. Note too that the value returned by an extractor
1952 subroutine need not bear any relationship to the corresponding substring
1953 of the original text (see examples below).
1954
1955 If the extractor is a precompiled regular expression or a string,
1956 it is matched against the text in a scalar context with a leading
1957 '\G' and the gc modifiers enabled. The extracted value is either
1958 $1 if that variable is defined after the match, or else the
1959 complete match (i.e. $&).
1960
1961 If the extractor is a hash reference, it must contain exactly one element.
1962 The value of that element is one of the
1963 above extractor types (subroutine reference, regular expression, or string).
1964 The key of that element is the name of a class into which the successful
1965 return value of the extractor will be blessed.
1966
1967 If an extractor returns a defined value, that value is immediately
1968 treated as the next extracted field and pushed onto the list of fields.
1969 If the extractor was specified in a hash reference, the field is also
1970 blessed into the appropriate class,
1971
1972 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
1973 assumed to have failed to extract.
1974 If none of the extractor subroutines succeeds, then one
1975 character is extracted from the start of the text and the extraction
1976 subroutines reapplied. Characters which are thus removed are accumulated and
1977 eventually become the next field (unless the fourth argument is true, in which
1978 case they are discarded).
1979
1980 For example, the following extracts substrings that are valid Perl variables:
1981
1982         @fields = extract_multiple($text,
1983                                    [ sub { extract_variable($_[0]) } ],
1984                                    undef, 1);
1985
1986 This example separates a text into fields which are quote delimited,
1987 curly bracketed, and anything else. The delimited and bracketed
1988 parts are also blessed to identify them (the "anything else" is unblessed):
1989
1990         @fields = extract_multiple($text,
1991                    [
1992                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
1993                         { Brack => sub { extract_bracketed($_[0],'{}') } },
1994                    ]);
1995
1996 This call extracts the next single substring that is a valid Perl quotelike
1997 operator (and removes it from $text):
1998
1999         $quotelike = extract_multiple($text,
2000                                       [
2001                                         sub { extract_quotelike($_[0]) },
2002                                       ], undef, 1);
2003
2004 Finally, here is yet another way to do comma-separated value parsing:
2005
2006         @fields = extract_multiple($csv_text,
2007                                   [
2008                                         sub { extract_delimited($_[0],q{'"}) },
2009                                         qr/([^,]+)(.*)/,
2010                                   ],
2011                                   undef,1);
2012
2013 The list in the second argument means:
2014 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2015 The undef third argument means:
2016 I<"...as many times as possible...">,
2017 and the true value in the fourth argument means
2018 I<"...discarding anything else that appears (i.e. the commas)">.
2019
2020 If you wanted the commas preserved as separate fields (i.e. like split
2021 does if your split pattern has capturing parentheses), you would
2022 just make the last parameter undefined (or remove it).
2023
2024
2025 =head2 C<gen_delimited_pat>
2026
2027 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2028    > builds a Friedl-style optimized regex that matches a string delimited
2029 by any one of the characters in the single argument. For example:
2030
2031         gen_delimited_pat(q{'"})
2032
2033 returns the regex:
2034
2035         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2036
2037 Note that the specified delimiters are automatically quotemeta'd.
2038
2039 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2040 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2041 (which might contain quoted strings):
2042
2043         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2044
2045         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2046
2047
2048 C<gen_delimited_pat> may also be called with an optional second argument,
2049 which specifies the "escape" character(s) to be used for each delimiter.
2050 For example to match a Pascal-style string (where ' is the delimiter
2051 and '' is a literal ' within the string):
2052
2053         gen_delimited_pat(q{'},q{'});
2054
2055 Different escape characters can be specified for different delimiters.
2056 For example, to specify that '/' is the escape for single quotes
2057 and '%' is the escape for double quotes:
2058
2059         gen_delimited_pat(q{'"},q{/%});
2060
2061 If more delimiters than escape chars are specified, the last escape char
2062 is used for the remaining delimiters.
2063 If no escape char is specified for a given specified delimiter, '\' is used.
2064
2065 Note that
2066 C<gen_delimited_pat> was previously called
2067 C<delimited_pat>. That name may still be used, but is now deprecated.
2068
2069
2070 =head1 DIAGNOSTICS
2071
2072 In a list context, all the functions return C<(undef,$original_text)>
2073 on failure. In a scalar context, failure is indicated by returning C<undef>
2074 (in this case the input text is not modified in any way).
2075
2076 In addition, on failure in I<any> context, the C<$@> variable is set.
2077 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2078 below.
2079 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2080 which the error was detected (although not necessarily where it occurred!)
2081 Printing C<$@> directly produces the error message, with the offset appended.
2082 On success, the C<$@> variable is guaranteed to be C<undef>.
2083
2084 The available diagnostics are:
2085
2086 =over 4
2087
2088 =item  C<Did not find a suitable bracket: "%s">
2089
2090 The delimiter provided to C<extract_bracketed> was not one of
2091 C<'()[]E<lt>E<gt>{}'>.
2092
2093 =item  C<Did not find prefix: /%s/>
2094
2095 A non-optional prefix was specified but wasn't found at the start of the text.
2096
2097 =item  C<Did not find opening bracket after prefix: "%s">
2098
2099 C<extract_bracketed> or C<extract_codeblock> was expecting a
2100 particular kind of bracket at the start of the text, and didn't find it.
2101
2102 =item  C<No quotelike operator found after prefix: "%s">
2103
2104 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2105 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2106 it was extracting.
2107
2108 =item  C<Unmatched closing bracket: "%c">
2109
2110 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2111 a closing bracket where none was expected.
2112
2113 =item  C<Unmatched opening bracket(s): "%s">
2114
2115 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2116 out of characters in the text before closing one or more levels of nested
2117 brackets.
2118
2119 =item C<Unmatched embedded quote (%s)>
2120
2121 C<extract_bracketed> attempted to match an embedded quoted substring, but
2122 failed to find a closing quote to match it.
2123
2124 =item C<Did not find closing delimiter to match '%s'>
2125
2126 C<extract_quotelike> was unable to find a closing delimiter to match the
2127 one that opened the quote-like operation.
2128
2129 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2130
2131 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2132 a valid bracket delimiter, but it was the wrong species. This usually
2133 indicates a nesting error, but may indicate incorrect quoting or escaping.
2134
2135 =item  C<No block delimiter found after quotelike "%s">
2136
2137 C<extract_quotelike> or C<extract_codeblock> found one of the
2138 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2139 without a suitable block after it.
2140
2141 =item C<Did not find leading dereferencer>
2142
2143 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2144 a variable, but didn't find any of them.
2145
2146 =item C<Bad identifier after dereferencer>
2147
2148 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2149 character was not followed by a legal Perl identifier.
2150
2151 =item C<Did not find expected opening bracket at %s>
2152
2153 C<extract_codeblock> failed to find any of the outermost opening brackets
2154 that were specified.
2155
2156 =item C<Improperly nested codeblock at %s>
2157
2158 A nested code block was found that started with a delimiter that was specified
2159 as being only to be used as an outermost bracket.
2160
2161 =item  C<Missing second block for quotelike "%s">
2162
2163 C<extract_codeblock> or C<extract_quotelike> found one of the
2164 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2165
2166 =item C<No match found for opening bracket>
2167
2168 C<extract_codeblock> failed to find a closing bracket to match the outermost
2169 opening bracket.
2170
2171 =item C<Did not find opening tag: /%s/>
2172
2173 C<extract_tagged> did not find a suitable opening tag (after any specified
2174 prefix was removed).
2175
2176 =item C<Unable to construct closing tag to match: /%s/>
2177
2178 C<extract_tagged> matched the specified opening tag and tried to
2179 modify the matched text to produce a matching closing tag (because
2180 none was specified). It failed to generate the closing tag, almost
2181 certainly because the opening tag did not start with a
2182 bracket of some kind.
2183
2184 =item C<Found invalid nested tag: %s>
2185
2186 C<extract_tagged> found a nested tag that appeared in the "reject" list
2187 (and the failure mode was not "MAX" or "PARA").
2188
2189 =item C<Found unbalanced nested tag: %s>
2190
2191 C<extract_tagged> found a nested opening tag that was not matched by a
2192 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2193
2194 =item C<Did not find closing tag>
2195
2196 C<extract_tagged> reached the end of the text without finding a closing tag
2197 to match the original opening tag (and the failure mode was not
2198 "MAX" or "PARA").
2199
2200
2201
2202
2203 =back
2204
2205
2206 =head1 AUTHOR
2207
2208 Damian Conway (damian@conway.org)
2209
2210
2211 =head1 BUGS AND IRRITATIONS
2212
2213 There are undoubtedly serious bugs lurking somewhere in this code, if
2214 only because parts of it give the impression of understanding a great deal
2215 more about Perl than they really do.
2216
2217 Bug reports and other feedback are most welcome.
2218
2219
2220 =head1 COPYRIGHT
2221
2222  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2223  This module is free software. It may be used, redistributed
2224      and/or modified under the same terms as Perl itself.