Re: [PATCH B::Deparse] fix string uninterpretation
[p5sagit/p5-mst-13.2.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4 2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
4633a7c4 6
7# List explicitly here the variables you want Configure to
8# generate. Metaconfig only looks for shell variables, so you
9# have to mention them as if they were shell variables, not
10# %Config entries. Thus you write
11# $startperl
12# to ensure Configure will look for $Config{startperl}.
13
14# This forces PL files to create target in same directory as PL file.
15# This is so that make depend always knows where to find PL derivatives.
8a5546a1 16$origdir = cwd;
44a8e56a 17chdir dirname($0);
18$file = basename($0, '.PL');
774d564b 19$file .= '.com' if $^O eq 'VMS';
4633a7c4 20
21open OUT,">$file" or die "Can't create $file: $!";
22
23print "Extracting $file (with variable substitutions)\n";
24
25# In this section, perl variables will be expanded during extraction.
26# You can use $Config{...} to use Configure variables.
27
28print OUT <<"!GROK!THIS!";
5f05dabc 29$Config{startperl}
30 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
31 if \$running_under_some_shell;
ed6d8ea1 32my $startperl;
33my $perlpath;
34(\$startperl = <<'/../') =~ s/\\s*\\z//;
35$Config{startperl}
36/../
37(\$perlpath = <<'/../') =~ s/\\s*\\z//;
38$Config{perlpath}
39/../
a687059c 40!GROK!THIS!
41
4633a7c4 42# In the following, perl variables are not expanded during extraction.
43
44print OUT <<'!NO!SUBS!';
a687059c 45
86a59229 46$0 =~ s/^.*?(\w+)$/$1/;
47
48# (p)sed - a stream editor
49# History: Aug 12 2000: Original version.
50
51use strict;
52use integer;
53use Symbol;
8d063cd8 54
d83e3bda 55=head1 NAME
56
86a59229 57sed - a stream editor
d83e3bda 58
59=head1 SYNOPSIS
60
86a59229 61 sed [-an] script [file ...]
62 sed [-an] [-e script] [-f script-file] [file ...]
d83e3bda 63
64=head1 DESCRIPTION
65
86a59229 66A stream editor reads the input stream consisting of the specified files
67(or standard input, if none are given), processes is line by line by
68applying a script consisting of edit commands, and writes resulting lines
69to standard output. The filename `C<->' may be used to read standard input.
70
71The edit script is composed from arguments of B<-e> options and
72script-files, in the given order. A single script argument may be specified
73as the first parameter.
74
75If this program is invoked with the name F<s2p>, it will act as a
76sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 77
86a59229 78B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 79
86a59229 80=head1 OPTIONS
d83e3bda 81
86a59229 82=over 4
d83e3bda 83
86a59229 84=item B<-a>
d83e3bda 85
86a59229 86A file specified as argument to the B<w> edit command is by default
87opened before input processing starts. Using B<-a>, opening of such
88files is delayed until the first line is actually written to the file.
89
90=item B<-e> I<script>
91
92The editing commands defined by I<script> are appended to the script.
93Multiple commands must be separated by newlines.
94
95=item B<-f> I<script-file>
96
97Editing commands from the specified I<script-file> are read and appended
98to the script.
d83e3bda 99
100=item B<-n>
101
86a59229 102By default, a line is written to standard output after the editing script
103has been applied to it. The B<-n> option suppresses automatic printing.
104
105=back
106
107=head1 COMMANDS
108
109B<sed> command syntax is defined as
110
111Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
112
113with whitespace being permitted before or after addresses, and between
114the function character and the argument. The I<address>es and the
115address inverter (C<!>) are used to restrict the application of a
116command to the selected line(s) of input.
117
118Each command must be on a line of its own, except where noted in
119the synopses below.
120
121The edit cycle performed on each input line consist of reading the line
122(without its trailing newline character) into the I<pattern space>,
123applying the applicable commands of the edit script, writing the final
124contents of the pattern space and a newline to the standard output.
125A I<hold space> is provided for saving the contents of the
126pattern space for later use.
127
128=head2 Addresses
129
130A sed address is either a line number or a pattern, which may be combined
131arbitrarily to construct ranges. Lines are numbered across all input files.
132
133Any address may be followed by an exclamation mark (`C<!>'), selecting
134all lines not matching that address.
d83e3bda 135
86a59229 136=over 4
d83e3bda 137
86a59229 138=item I<number>
139
140The line with the given number is selected.
141
142=item B<$>
143
144A dollar sign (C<$>) is the line number of the last line of the input stream.
145
146=item B</>I<regular expression>B</>
147
148A pattern address is a basic regular expression (see
149L<"Basic Regular Expressions">), between the delimiting character C</>.
150Any other character except C<\> or newline may be used to delimit a
151pattern address when the initial delimiter is prefixed with a
152backslash (`C<\>').
d83e3bda 153
154=back
155
86a59229 156If no address is given, the command selects every line.
d83e3bda 157
86a59229 158If one address is given, it selects the line (or lines) matching the
159address.
d83e3bda 160
86a59229 161Two addresses select a range that begins whenever the first address
162matches, and ends (including that line) when the second address matches.
163If the first (second) address is a matching pattern, the second
164address is not applied to the very same line to determine the end of
165the range. Likewise, if the second address is a matching pattern, the
166first address is not applied to the very same line to determine the
167begin of another range. If both addresses are line numbers,
168and the second line number is less than the first line number, then
169only the first line is selected.
d83e3bda 170
d83e3bda 171
86a59229 172=head2 Functions
d83e3bda 173
86a59229 174The maximum permitted number of addresses is indicated with each
175function synopsis below.
d83e3bda 176
86a59229 177The argument I<text> consists of one or more lines following the command.
178Embedded newlines in I<text> must be preceded with a backslash. Other
179backslashes in I<text> are deleted and the following character is taken
180literally.
d83e3bda 181
86a59229 182=over 4
d83e3bda 183
86a59229 184=cut
185
186my %ComTab;
187#--------------------------------------------------------------------------
188$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
189
190=item [1addr]B<a\> I<text>
191
192Write I<text> (which must start on the line following the command)
193to standard output immediately before reading the next line
194of input, either by executing the B<N> function or by beginning a new cycle.
195
196=cut
197
198#--------------------------------------------------------------------------
199$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
200
201=item [2addr]B<b> [I<label>]
202
203Branch to the B<:> function with the specified I<label>. If no label
204is given, branch to the end of the script.
205
206=cut
207
208#--------------------------------------------------------------------------
209$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
210{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
211-X-
212### continue OK => next CYCLE;
213
214=item [2addr]B<c\> I<text>
215
216The line, or range of lines, selected by the address is deleted.
217The I<text> (which must start on the line following the command)
218is written to standard output. With an address range, this occurs at
219the end of the range.
220
221=cut
222
223#--------------------------------------------------------------------------
224$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
225{ $doPrint = 0;
226 goto EOS;
227}
228-X-
229### continue OK => next CYCLE;
230
231=item [2addr]B<d>
232
233Deletes the pattern space and starts the next cycle.
234
235=cut
236
237#--------------------------------------------------------------------------
238$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
239{ s/^.*\n?//;
240 if(length($_)){ goto BOS } else { goto EOS }
241}
242-X-
243### continue OK => next CYCLE;
244
245=item [2addr]B<D>
246
247Deletes the pattern space through the first embedded newline or to the end.
248If the pattern space becomes empty, a new cycle is started, otherwise
249execution of the script is restarted.
250
251=cut
252
253#--------------------------------------------------------------------------
254$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
255
256=item [2addr]B<g>
257
258Replace the contents of the pattern space with the hold space.
259
260=cut
261
262#--------------------------------------------------------------------------
263$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
264
265=item [2addr]B<G>
266
267Append a newline and the contents of the hold space to the pattern space.
268
269=cut
270
271#--------------------------------------------------------------------------
272$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
273
274=item [2addr]B<h>
275
276Replace the contents of the hold space with the pattern space.
277
278=cut
279
280#--------------------------------------------------------------------------
281$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
282
283=item [2addr]B<H>
284
285Append a newline and the contents of the pattern space to the hold space.
286
287=cut
288
289#--------------------------------------------------------------------------
290$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
291
292=item [1addr]B<i\> I<text>
293
294Write the I<text> (which must start on the line following the command)
295to standard output.
296
297=cut
298
299#--------------------------------------------------------------------------
300$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
301
302=item [2addr]B<l>
303
304Print the contents of the pattern space: non-printable characters are
305shown in C-style escaped form; long lines are split and have a trailing
306`C<\>' at the point of the split; the true end of a line is marked with
307a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
308BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
309octal number for all other non-printable characters.
310
311=cut
312
313#--------------------------------------------------------------------------
314$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
315{ print $_, "\n" if $doPrint;
316 printQ if @Q;
317 $CondReg = 0;
318 last CYCLE unless getsARGV();
319 chomp();
320}
321-X-
322
323=item [2addr]B<n>
324
325If automatic printing is enabled, write the pattern space to the standard
326output. Replace the pattern space with the next line of input. If
327there is no more input, processing is terminated.
328
329=cut
330
331#--------------------------------------------------------------------------
332$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
333{ printQ if @Q;
334 $CondReg = 0;
335 last CYCLE unless getsARGV( $h );
336 chomp( $h );
337 $_ .= "\n$h";
338}
339-X-
340
341=item [2addr]B<N>
342
343Append a newline and the next line of input to the pattern space. If
344there is no more input, processing is terminated.
345
346=cut
347
348#--------------------------------------------------------------------------
349$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
350
351=item [2addr]B<p>
352
353Print the pattern space to the standard output. (Use the B<-n> option
354to suppress automatic printing at the end of a cycle if you want to
355avoid double printing of lines.)
356
357=cut
358
359#--------------------------------------------------------------------------
360$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
361{ if( /^(.*)/ ){ print $1, "\n"; } }
362-X-
363
364=item [2addr]B<P>
365
366Prints the pattern space through the first embedded newline or to the end.
367
368=cut
369
370#--------------------------------------------------------------------------
371$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
372{ print $_, "\n" if $doPrint;
373 last CYCLE;
374}
375-X-
376
377=item [1addr]B<q>
378
379Branch to the end of the script and quit without starting a new cycle.
380
381=cut
382
383#--------------------------------------------------------------------------
384$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
385### FIXME: lazy reading - big files???
386
387=item [1addr]B<r> I<file>
388
389Copy the contents of the I<file> to standard output immediately before
390the next attempt to read a line of input. Any error encountered while
391reading I<file> is silently ignored.
392
393=cut
394
395#--------------------------------------------------------------------------
396$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
397
398=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
399
400Substitute the I<replacement> string for the first substring in
401the pattern space that matches the I<regular expression>.
402Any character other than backslash or newline can be used instead of a
403slash to delimit the regular expression and the replacement.
404To use the delimiter as a literal character within the regular expression
405and the replacement, precede the character by a backslash (`C<\>').
406
407Literal newlines may be embedded in the replacement string by
408preceding a newline with a backslash.
409
410Within the replacement, an ampersand (`C<&>') is replaced by the string
411matching the regular expression. The strings `C<\1>' through `C<\9>' are
412replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
413To get a literal `C<&>' or `C<\>' in the replacement text, precede it
414by a backslash.
415
416The following I<flags> modify the behaviour of the B<s> command:
417
418=over 8
419
420=item B<g>
421
422The replacement is performed for all matching, non-overlapping substrings
423of the pattern space.
424
425=item B<1>..B<9>
426
427Replace only the n-th matching substring of the pattern space.
428
429=item B<p>
430
431If the substitution was made, print the new value of the pattern space.
432
433=item B<w> I<file>
434
435If the substitution was made, write the new value of the pattern space
436to the specified file.
437
438=back
439
440=cut
441
442#--------------------------------------------------------------------------
443$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
444
445=item [2addr]B<t> [I<label>]
446
447Branch to the B<:> function with the specified I<label> if any B<s>
448substitutions have been made since the most recent reading of an input line
449or execution of a B<t> function. If no label is given, branch to the end of
450the script.
451
452
453=cut
454
455#--------------------------------------------------------------------------
456$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
457
458=item [2addr]B<w> I<file>
459
460The contents of the pattern space are written to the I<file>.
461
462=cut
463
464#--------------------------------------------------------------------------
465$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
466
467=item [2addr]B<x>
468
469Swap the contents of the pattern space and the hold space.
470
471=cut
472
473#--------------------------------------------------------------------------
474$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
475=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
476
477In the pattern space, replace all characters occuring in I<string1> by the
478character at the corresponding position in I<string2>. It is possible
479to use any character (other than a backslash or newline) instead of a
480slash to delimit the strings. Within I<string1> and I<string2>, a
481backslash followed by any character other than a newline is that literal
482character, and a backslash followed by an `n' is replaced by a newline
483character.
484
485=cut
d83e3bda 486
86a59229 487#--------------------------------------------------------------------------
488$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
489
490=item [1addr]B<=>
491
492Prints the current line number on the standard output.
493
494=cut
495
496#--------------------------------------------------------------------------
497$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
498
499=item [0addr]B<:> [I<label>]
500
501The command specifies the position of the I<label>. It has no other effect.
502
503=cut
504
505#--------------------------------------------------------------------------
506$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
507$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
508# ';' to avoid warning on empty {}-block
509
510=item [2addr]B<{> [I<command>]
511
512=item [0addr]B<}>
513
514These two commands begin and end a command list. The first command may
515be given on the same line as the opening B<{> command. The commands
516within the list are jointly selected by the address(es) given on the
517B<{> command (but may still have individual addresses).
518
519=cut
520
521#--------------------------------------------------------------------------
522$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
523
524=item [0addr]B<#> [I<comment>]
525
526The entire line is ignored (treated as a comment). If, however, the first
527two characters in the script are `C<#n>', automatic printing of output is
528suppressed, as if the B<-n> option were given on the command line.
529
530=back
531
532=cut
533
534use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
535
536my $useDEBUG = exists( $ENV{PSEDDEBUG} );
537my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
538$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
539
540my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
541my $doOpenWrite = 1; # open w command output files at start (-a => 0)
542my $svOpenWrite = 0; # save $doOpenWrite
543my $doGenerate = $0 eq 's2p';
544
545# Collected and compiled script
546#
547my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code );
548
549##################
550# Compile Time
551#
552# Labels
553#
554# Error handling
555#
556sub Warn($;$){
557 my( $msg, $loc ) = @_;
558 $loc ||= '';
559 $loc .= ': ' if length( $loc );
560 warn( "$0: $loc$msg\n" );
561}
562
563$labNum = 0;
564sub newLabel(){
565 return 'L_'.++$labNum;
566}
567
568# safeHere: create safe here delimiter and modify opcode and argument
569#
570sub safeHere($$){
571 my( $codref, $argref ) = @_;
572 my $eod = 'EOD000';
573 while( $$argref =~ /^$eod$/m ){
574 $eod++;
575 }
576 $$codref =~ s/TheEnd/$eod/e;
577 $$argref .= "$eod\n";
578}
579
580# Emit: create address logic and emit command
581#
582sub Emit($$$$$$){
583 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
584 my $cond = '';
585 if( defined( $addr1 ) ){
586 if( defined( $addr2 ) ){
587 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
588 } else {
589 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
590 }
591 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
592 }
593
594 if( $opcode eq '' ){
595 $Code .= "$cond$arg\n";
596
597 } elsif( $opcode =~ s/-X-/$arg/e ){
598 $Code .= "$cond$opcode\n";
599
600 } elsif( $opcode =~ /TheEnd/ ){
601 safeHere( \$opcode, \$arg );
602 $Code .= "$cond$opcode$arg";
603
604 } else {
605 $Code .= "$cond$opcode\n";
606 }
607 0;
608}
609
610# Write (w command, w flag): store pathname
611#
612sub Write($$$$$$){
613 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
614 $wFiles{$path} = '';
615 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
616}
617
618
619# Label (: command): label definition
620#
621sub Label($$$$$$){
622 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
623 my $rc = 0;
624 $lab =~ s/\s+//;
625 if( length( $lab ) ){
626 my $h;
627 if( ! exists( $Label{$lab} ) ){
628 $h = $Label{$lab}{name} = newLabel();
629 } else {
630 $h = $Label{$lab}{name};
631 if( exists( $Label{$lab}{defined} ) ){
632 my $dl = $Label{$lab}{defined};
633 Warn( "duplicate label $lab (first defined at $dl)", $fl );
634 $rc = 1;
635 }
636 }
637 $Label{$lab}{defined} = $fl;
638 $Code .= "$h:;\n";
639 }
640 $rc;
641}
642
643# BeginBlock ({ command): push block start
644#
645sub BeginBlock($$$$$$){
646 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
647 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
648 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
649}
650
651# EndBlock (} command): check proper nesting
652#
653sub EndBlock($$$$$$){
654 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
655 my $rc;
656 my $jcom = pop( @BlockStack );
657 if( defined( $jcom ) ){
658 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
659 } else {
660 Warn( "unexpected `}'", $fl );
661 $rc = 1;
662 }
663 $rc;
664}
665
666# Branch (t, b commands): check or create label, substitute default
667#
668sub Branch($$$$$$){
669 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
670 $lab =~ s/\s+//; # no spaces at end
671 my $h;
672 if( length( $lab ) ){
673 if( ! exists( $Label{$lab} ) ){
674 $h = $Label{$lab}{name} = newLabel();
675 } else {
676 $h = $Label{$lab}{name};
677 }
678 push( @{$Label{$lab}{used}}, $fl );
679 } else {
680 $h = 'EOS';
681 }
682 $opcode =~ s/XXX/$h/e;
683 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
684}
685
686# Change (c command): is special due to range end watching
687#
688sub Change($$$$$$){
689 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
690 my $kwd = $negated ? 'unless' : 'if';
691 if( defined( $addr2 ) ){
692 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
693 if( ! $negated ){
694 $addr1 = '$icnt = ('.$addr1.')';
695 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
696 }
697 } else {
698 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
699 }
700 safeHere( \$opcode, \$arg );
701 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
702 0;
703}
704
705
706# Comment (# command): A no-op. Who would've thought that!
707#
708sub Comment($$$$$$){
709 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
710### $Code .= "# $arg\n";
711 0;
712}
713
714
715sub stripRegex($$){
716 my( $del, $sref ) = @_;
717 my $regex = $del;
718 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
719 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
720 my $sl = $2;
721 $regex .= $1.$sl.$del;
722 if( length( $sl ) % 2 == 0 ){
723 return $regex;
724 }
725 $regex .= $3;
726 }
727 undef();
728}
729
730# stripTrans: take a <del> terminated string from y command
731# honoring and cleaning up of \-escaped <del>'s
732#
733sub stripTrans($$){
734 my( $del, $sref ) = @_;
735 my $t = '';
736 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
737 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
738 my $sl = $2;
739 $t .= $1;
740 if( length( $sl ) % 2 == 0 ){
741 $t .= $sl;
742 $t =~ s/\\\\/\\/g;
743 return $t;
744 }
745 chop( $sl );
746 $t .= $sl.$del.$3;
747 }
748 undef();
749}
750
751# makey - construct Perl y/// from sed y///
752#
753sub makey($$$){
754 my( $fr, $to, $fl ) = @_;
755 my $error = 0;
756
757 # Ensure that any '-' is up front.
758 # Diagnose duplicate contradicting mappings
759 my %tr;
760 for( my $i = 0; $i < length($fr); $i++ ){
761 my $fc = substr($fr,$i,1);
762 my $tc = substr($to,$i,1);
763 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
764 Warn( "ambiguos translation for character `$fc' in `y' command",
765 $fl );
766 $error++;
767 }
768 $tr{$fc} = $tc;
769 }
770 $fr = $to = '';
771 if( exists( $tr{'-'} ) ){
772 ( $fr, $to ) = ( '-', $tr{'-'} );
773 delete( $tr{'-'} );
774 } else {
775 $fr = $to = '';
776 }
777 # might just as well sort it...
778 for my $fc ( sort keys( %tr ) ){
779 $fr .= $fc;
780 $to .= $tr{$fc};
781 }
782 # make embedded delimiters and newlines safe
783 $fr =~ s/([{}])/\$1/g;
784 $to =~ s/([{}])/\$1/g;
785 $fr =~ s/\n/\\n/g;
786 $to =~ s/\n/\\n/g;
787 return $error ? undef() : "{ y{$fr}{$to}; }";
788}
789
790######
791# makes - construct Perl s/// from sed s///
792#
793sub makes($$$$$$$){
794 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
795
796 # make embedded newlines safe
797 $regex =~ s/\n/\\n/g;
798 $subst =~ s/\n/\\n/g;
799
800 my $code;
801 # n-th occurrence
802 #
803 if( length( $nmatch ) ){
804 $code = <<TheEnd;
805{ \$n = $nmatch;
806 while( --\$n && ( \$s = m ${regex}g ) ){}
807 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
808 \$CondReg ||= \$s;
809TheEnd
810 } else {
811 $code = <<TheEnd;
812{ \$s = s ${regex}${subst}s${global};
813 \$CondReg ||= \$s;
814TheEnd
815 }
816 if( $print ){
817 $code .= ' print $_, "\n" if $s;'."\n";
818 }
819 if( defined( $path ) ){
820 $wFiles{$path} = '';
821 $code .= " _w( '$path' ) if \$s;\n";
822 }
823 $code .= "}";
824}
825
826=head1 BASIC REGULAR EXPRESSIONS
827
828A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
829of I<atoms>, for matching parts of a string, and I<bounds>, specifying
830repetitions of a preceding atom.
831
832=head2 Atoms
833
834The possible atoms of a BRE are: B<.>, matching any single character;
835B<^> and B<$>, matching the null string at the beginning or end
836of a string, respectively; a I<bracket expressions>, enclosed
837in B<[> and B<]> (see below); and any single character with no
838other significance (matching that character). A B<\> before one
839of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
840after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
841becomes an atom and establishes the target for a I<backreference>,
842consisting of the substring that actually matches the enclosed atoms.
843Finally, B<\> followed by one of the digits B<0> through B<9> is a
844backreference.
845
846A B<^> that is not first, or a B<$> that is not last does not have
847a special significance and need not be preceded by a backslash to
848become literal. The same is true for a B<]>, that does not terminate
849a bracket expression.
850
851An unescaped backslash cannot be last in a BRE.
852
853=head2 Bounds
854
855The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
856atom; B<\{>I<count>B<\}>, specifying that many repetitions;
857B<\{>I<minimum>B<,\}>, giving a lower limit; and
858B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
859bound.
860
861A bound appearing as the first item in a BRE is taken literally.
862
863=head2 Bracket Expressions
864
865A I<bracket expression> is a list of characters, character ranges
866and character classes enclosed in B<[> and B<]> and matches any
867single character from the represented set of characters.
868
869A character range is written as two characters separated by B<-> and
870represents all characters (according to the character collating sequence)
871that are not less than the first and not greater than the second.
872(Ranges are very collating-sequence-dependent, and portable programs
873should avoid relying on them.)
874
875A character class is one of the class names
876
877 alnum digit punct
878 alpha graph space
879 blank lower upper
880 cntrl print xdigit
881
882enclosed in B<[:> and B<:]> and represents the set of characters
883as defined in ctype(3).
884
885If the first character after B<[> is B<^>, the sense of matching is
886inverted.
887
888To include a literal `C<^>', place it anywhere else but first. To
889include a literal 'C<]>' place it first or immediately after an
890initial B<^>. To include a literal `C<->' make it the first (or
891second after B<^>) or last character, or the second endpoint of
892a range.
893
894The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
895match the null string at the beginning and end of a word respectively.
896(Note that neither is identical to Perl's `\b' atom.)
897
898=head2 Additional Atoms
899
900Since some sed implementations provide additional regular expression
901atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
902the following backslash escapes:
903
904=over 4
905
906=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
907
908=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
909
910=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
911
912=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
913
914=item B<\y> Match the empty string at a word boundary.
915
916=item B<\B> Match the empty string between any two either word or non-word characters.
917
918=back
919
920To enable this feature, the environment variable PSEDEXTBRE must be set
921to a string containing the requested characters, e.g.:
922C<PSEDEXTBRE='E<lt>E<gt>wW'>.
923
924=cut
925
926#####
927# bre2p - convert BRE to Perl RE
928#
929sub peek(\$$){
930 my( $pref, $ic ) = @_;
931 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
932}
933
934sub bre2p($$$){
935 my( $del, $pat, $fl ) = @_;
936 my $led = $del;
937 $led =~ tr/{([</})]>/;
938 $led = '' if $led eq $del;
939
940 $pat = substr( $pat, 1, length($pat) - 2 );
941 my $res = '';
942 my $bracklev = 0;
943 my $backref = 0;
944 my $parlev = 0;
945 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
946 my $c = substr( $pat, $ic, 1 );
947 if( $c eq '\\' ){
948 ### backslash escapes
949 my $nc = peek($pat,$ic);
950 if( $nc eq '' ){
951 Warn( "`\\' cannot be last in pattern", $fl );
952 return undef();
953 }
954 $ic++;
955 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
956 $res .= "\\$del";
957
958 } elsif( $nc =~ /([[.*\\n])/ ){
959 ## check for \-escaped magics and \n:
960 ## \[ \. \* \\ \n stay as they are
961 $res .= '\\'.$nc;
962
963 } elsif( $nc eq '(' ){ ## \( => (
964 $parlev++;
965 $res .= '(';
966
967 } elsif( $nc eq ')' ){ ## \) => )
968 $parlev--;
969 $backref++;
970 if( $parlev < 0 ){
971 Warn( "unmatched `\\)'", $fl );
972 return undef();
973 }
974 $res .= ')';
975
976 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
977 my $endpos = index( $pat, '\\}', $ic );
978 if( $endpos < 0 ){
979 Warn( "unmatched `\\{'", $fl );
980 return undef();
981 }
982 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
983 $ic = $endpos + 1;
984
985 if( $res =~ /^\^?$/ ){
986 $res .= "\\{$rep\}";
987 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
988 my $min = $1;
989 my $com = $2 || '';
990 my $max = $3;
991 if( length( $max ) ){
992 if( $max < $min ){
993 Warn( "maximum less than minimum in `\\{$rep\\}'",
994 $fl );
995 return undef();
996 }
997 } else {
998 $max = '';
999 }
1000 # simplify some
1001 if( $min == 0 && $max eq '1' ){
1002 $res .= '?';
1003 } elsif( $min == 1 && "$com$max" eq ',' ){
1004 $res .= '+';
1005 } elsif( $min == 0 && "$com$max" eq ',' ){
1006 $res .= '*';
1007 } else {
1008 $res .= "{$min$com$max}";
1009 }
1010 } else {
1011 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1012 return undef();
1013 }
1014
1015 } elsif( $nc =~ /^[1-9]$/ ){
1016 ## \1 .. \9 => \1 .. \9, but check for a following digit
1017 if( $nc > $backref ){
1018 Warn( "invalid backreference ($nc)", $fl );
1019 return undef();
1020 }
1021 $res .= "\\$nc";
1022 if( peek($pat,$ic) =~ /[0-9]/ ){
1023 $res .= '(?:)';
1024 }
1025
1026 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1027 ## extensions - at most <>wWyB - not in POSIX
1028 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1029 $res .= '\\b(?<=\\W)';
1030 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1031 $res .= '\\b(?=\\W)';
1032 } elsif( $nc eq 'y' ){ ## \y => \b
1033 $res .= '\\b';
1034 } else { ## \B, \w, \W remain the same
1035 $res .= "\\$nc";
1036 }
1037 } elsif( $nc eq $led ){
1038 ## \<closing bracketing-delimiter> - keep '\'
1039 $res .= "\\$nc";
1040
1041 } else { ## \ <char> => <char> ("as if `\' were not present")
1042 $res .= $nc;
1043 }
1044
1045 } elsif( $c eq '.' ){ ## . => .
1046 $res .= $c;
1047
1048 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1049 if( $res =~ /^\^?$/ ){
1050 $res .= '\\*';
1051 } elsif( substr( $res, -1, 1 ) ne '*' ){
1052 $res .= $c;
1053 }
1054
1055 } elsif( $c eq '[' ){
1056 ## parse []: [^...] [^]...] [-...]
1057 my $add = '[';
1058 if( peek($pat,$ic) eq '^' ){
1059 $ic++;
1060 $add .= '^';
1061 }
1062 my $nc = peek($pat,$ic);
1063 if( $nc eq ']' || $nc eq '-' ){
1064 $add .= $nc;
1065 $ic++;
1066 }
1067 # check that [ is not trailing
1068 if( $ic >= length( $pat ) - 1 ){
1069 Warn( "unmatched `['", $fl );
1070 return undef();
1071 }
1072 # look for [:...:] and x-y
1073 my $rstr = substr( $pat, $ic+1 );
1074 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1075 my $cnt = $1;
1076 $ic += length( $cnt );
1077 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1078 # try some simplifications
1079 my $red = $cnt;
1080 if( $red =~ s/0-9// ){
1081 $cnt = $red.'\d';
1082 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1083 $cnt = $red.'\w';
1084 }
1085 }
1086 $add .= $cnt;
1087
1088 # POSIX 1003.2 has this (optional) for begin/end word
1089 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1090 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1091
1092 }
1093
1094 ## may have a trailing `-' before `]'
1095 if( $ic < length($pat) - 1 &&
1096 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1097 $ic += length( $1 );
1098 $add .= $1;
1099 # another simplification
1100 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1101 $res .= $add;
1102 } else {
1103 Warn( "unmatched `['", $fl );
1104 return undef();
1105 }
3cb6de81 1106
86a59229 1107 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1108 $res .= "\\$c";
d83e3bda 1109
86a59229 1110 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1111 $res .= ']';
d83e3bda 1112
86a59229 1113 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1114 $res .= "\\$c";
d83e3bda 1115
86a59229 1116 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1117 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1118
86a59229 1119 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1120 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1121
86a59229 1122 } else {
1123 $res .= $c;
1124 }
8d063cd8 1125 }
0a12ae7d 1126
86a59229 1127 if( $parlev ){
1128 Warn( "unmatched `\\('", $fl );
1129 return undef();
8d063cd8 1130 }
0a12ae7d 1131
86a59229 1132 # final cleanup: eliminate raw HTs
1133 $res =~ s/\t/\\t/g;
1134 return $del . $res . ( $led ? $led : $del );
1135}
0a12ae7d 1136
86a59229 1137
1138#####
1139# sub2p - convert sed substitution to Perl substitution
1140#
1141sub sub2p($$$){
1142 my( $del, $subst, $fl ) = @_;
1143 my $led = $del;
1144 $led =~ tr/{([</})]>/;
1145 $led = '' if $led eq $del;
1146
1147 $subst = substr( $subst, 1, length($subst) - 2 );
1148 my $res = '';
1149
1150 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1151 my $c = substr( $subst, $ic, 1 );
1152 if( $c eq '\\' ){
1153 ### backslash escapes
1154 my $nc = peek($subst,$ic);
1155 if( $nc eq '' ){
1156 Warn( "`\\' cannot be last in substitution", $fl );
1157 return undef();
1158 }
1159 $ic++;
1160 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1161 $res .= '\\' . $nc;
1162 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1163 $res .= '${' . $nc . '}';
1164 } else { ## everything else (includes &): omit \
1165 $res .= $nc;
1166 }
1167 } elsif( $c eq '&' ){ ## & => $&
1168 $res .= '$&';
1169 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1170 $res .= '\\' . $c;
1171 } else {
1172 $res .= $c;
8d063cd8 1173 }
8d063cd8 1174 }
1175
86a59229 1176 # final cleanup: eliminate raw HTs
1177 $res =~ s/\t/\\t/g;
1178 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1179}
0a12ae7d 1180
86a59229 1181
1182sub Parse(){
1183 my $error = 0;
1184 my( $pdef, $pfil, $plin );
1185 for( my $icom = 0; $icom < @Commands; $icom++ ){
1186 my $cmd = $Commands[$icom];
1187 print "Parse:$cmd:\n" if $useDEBUG;
1188 $cmd =~ s/^\s+//;
1189 next unless length( $cmd );
1190 my $scom = $icom;
1191 if( exists( $Defined{$icom} ) ){
1192 $pdef = $Defined{$icom};
1193 if( $pdef =~ /^ #(\d+)/ ){
1194 $pfil = 'expression #';
1195 $plin = $1;
1196 } else {
1197 $pfil = "$pdef l.";
1198 $plin = 1;
1199 }
1200 } else {
1201 $plin++;
1202 }
1203 my $fl = "$pfil$plin";
1204
1205 # insert command as comment in gnerated code
1206 #
1207 $Code .= "# $cmd\n" if $doGenerate;
1208
1209 # The Address(es)
1210 #
1211 my( $negated, $naddr, $addr1, $addr2 );
1212 $naddr = 0;
1213 if( $cmd =~ s/^(\d+)\s*// ){
1214 $addr1 = "$1"; $naddr++;
1215 } elsif( $cmd =~ s/^\$\s*// ){
1216 $addr1 = 'eofARGV()'; $naddr++;
1217 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1218 my $del = $1;
1219 my $regex = stripRegex( $del, \$cmd );
1220 if( defined( $regex ) ){
1221 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1222 $naddr++;
1223 } else {
1224 Warn( "malformed regex, 1st address", $fl );
1225 $error++;
1226 next;
1227 }
1228 }
1229 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1230 if( $cmd =~ s/^(\d+)\s*// ){
1231 $addr2 = "$1"; $naddr++;
1232 } elsif( $cmd =~ s/^\$\s*// ){
1233 $addr2 = 'eofARGV()'; $naddr++;
1234 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1235 my $del = $1;
1236 my $regex = stripRegex( $del, \$cmd );
1237 if( defined( $regex ) ){
1238 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1239 $naddr++;
1240 } else {
1241 Warn( "malformed regex, 2nd address", $fl );
1242 $error++;
1243 next;
1244 }
1245 } else {
1246 Warn( "invalid address after `,'", $fl );
1247 $error++;
1248 next;
1249 }
1250 }
1251
1252 # address modifier `!'
1253 #
1254 $negated = $cmd =~ s/^!\s*//;
1255 if( defined( $addr1 ) ){
1256 print "Parse: addr1=$addr1" if $useDEBUG;
1257 if( defined( $addr2 ) ){
1258 print ", addr2=$addr2 " if $useDEBUG;
1259 # both numeric and addr1 > addr2 => eliminate addr2
1260 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1261 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1262 }
9ef589d8 1263 }
86a59229 1264 print 'negated' if $useDEBUG && $negated;
1265 print " command:$cmd\n" if $useDEBUG;
1266
1267 # The Command
1268 #
1269 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1270 my $h = substr( $cmd, 0, 1 );
1271 Warn( "unknown command `$h'", $fl );
1272 $error++;
8d063cd8 1273 next;
1274 }
86a59229 1275 my $key = $1;
8d063cd8 1276
86a59229 1277 my $tabref = $ComTab{$key};
1278 if( $naddr > $tabref->[0] ){
1279 Warn( "excess address(es)", $fl );
1280 $error++;
8d063cd8 1281 next;
1282 }
1283
86a59229 1284 my $arg = '';
1285 if( $tabref->[1] eq 'str' ){
1286 # take remainder - don't care if it is empty
1287 $arg = $cmd;
1288 $cmd = '';
1289
1290 } elsif( $tabref->[1] eq 'txt' ){
1291 # multi-line text
1292 my $goon = $cmd =~ /(.*)\\$/;
1293 if( length( $1 ) ){
1294 Warn( "extra characters after command ($cmd)", $fl );
1295 $error++;
8d063cd8 1296 }
86a59229 1297 while( $goon ){
1298 $icom++;
1299 if( $icom > $#Commands ){
1300 Warn( "unexpected end of script", $fl );
1301 $error++;
1302 last;
1303 }
1304 $cmd = $Commands[$icom];
1305 $Code .= "# $cmd\n" if $doGenerate;
1306 $goon = $cmd =~ s/\\$//;
1307 $cmd =~ s/\\(.)/$1/g;
1308 $arg .= "\n" if length( $arg );
1309 $arg .= $cmd;
8d063cd8 1310 }
86a59229 1311 $arg .= "\n" if length( $arg );
1312 $cmd = '';
8d063cd8 1313
86a59229 1314 } elsif( $tabref->[1] eq 'sub' ){
1315 # s///
1316 if( ! length( $cmd ) ){
1317 Warn( "`s' command requires argument", $fl );
1318 $error++;
1319 next;
1320 }
1321 if( $cmd =~ s{^([^\\\n])}{} ){
1322 my $del = $1;
1323 my $regex = stripRegex( $del, \$cmd );
1324 if( ! defined( $regex ) ){
1325 Warn( "malformed regular expression", $fl );
1326 $error++;
1327 next;
a687059c 1328 }
86a59229 1329 $regex = bre2p( $del, $regex, $fl );
1330
1331 # a trailing \ indicates embedded NL (in replacement string)
1332 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1333 $icom++;
1334 if( $icom > $#Commands ){
1335 Warn( "unexpected end of script", $fl );
1336 $error++;
1337 last;
8d063cd8 1338 }
86a59229 1339 $cmd .= $Commands[$icom];
1340 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1341 }
86a59229 1342
1343 my $subst = stripRegex( $del, \$cmd );
1344 if( ! defined( $regex ) ){
1345 Warn( "malformed substitution expression", $fl );
1346 $error++;
0a12ae7d 1347 next;
1348 }
86a59229 1349 $subst = sub2p( $del, $subst, $fl );
1350
1351 # parse s/// modifier: g|p|0-9|w <file>
1352 my( $global, $nmatch, $print, $write ) =
1353 ( '', '', 0, undef );
1354 while( $cmd =~ s/^([gp0-9])// ){
1355 $1 eq 'g' ? ( $global = 'g' ) :
1356 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1357 }
1358 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1359 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1360 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1361 Warn( "conflicting flags `$global$nmatch'", $fl );
1362 $error++;
0a12ae7d 1363 next;
1364 }
86a59229 1365
1366 $arg = makes( $regex, $subst,
1367 $write, $global, $print, $nmatch, $fl );
1368 if( ! defined( $arg ) ){
1369 $error++;
8d063cd8 1370 next;
1371 }
86a59229 1372
1373 } else {
1374 Warn( "improper delimiter in s command", $fl );
1375 $error++;
1376 next;
1377 }
1378
1379 } elsif( $tabref->[1] eq 'tra' ){
1380 # y///
1381 # a trailing \ indicates embedded newline
1382 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1383 $icom++;
1384 if( $icom > $#Commands ){
1385 Warn( "unexpected end of script", $fl );
1386 $error++;
1387 last;
1388 }
1389 $cmd .= $Commands[$icom];
1390 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1391 }
1392 if( ! length( $cmd ) ){
1393 Warn( "`y' command requires argument", $fl );
1394 $error++;
1395 next;
1396 }
1397 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1398 if( $d eq '\\' ){
1399 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1400 $error++;
1401 next;
1402 }
1403 my $fr = stripTrans( $d, \$cmd );
1404 if( ! defined( $fr ) || ! length( $cmd ) ){
1405 Warn( "malformed `y' command argument", $fl );
1406 $error++;
1407 next;
1408 }
1409 my $to = stripTrans( $d, \$cmd );
1410 if( ! defined( $to ) ){
1411 Warn( "malformed `y' command argument", $fl );
1412 $error++;
1413 next;
1414 }
1415 if( length($fr) != length($to) ){
1416 Warn( "string lengths in `y' command differ", $fl );
1417 $error++;
1418 next;
1419 }
1420 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1421 $error++;
1422 next;
8d063cd8 1423 }
8d063cd8 1424
8d063cd8 1425 }
1426
86a59229 1427 # $cmd must be now empty - exception is {
1428 if( $cmd !~ /^\s*$/ ){
1429 if( $key eq '{' ){
1430 # dirty hack to process command on '{' line
1431 $Commands[$icom--] = $cmd;
1432 } else {
1433 Warn( "extra characters after command ($cmd)", $fl );
1434 $error++;
1435 next;
1436 }
8d063cd8 1437 }
1438
86a59229 1439 # Make Code
1440 #
1441 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1442 $tabref->[3], $arg, $fl ) ){
1443 $error++;
8d063cd8 1444 }
86a59229 1445 }
8d063cd8 1446
86a59229 1447 while( @BlockStack ){
1448 my $bl = pop( @BlockStack );
1449 Warn( "start of unterminated `{'", $bl );
1450 $error++;
1451 }
8d063cd8 1452
86a59229 1453 for my $lab ( keys( %Label ) ){
1454 if( ! exists( $Label{$lab}{defined} ) ){
1455 for my $used ( @{$Label{$lab}{used}} ){
1456 Warn( "undefined label `$lab'", $used );
1457 $error++;
1458 }
8d063cd8 1459 }
86a59229 1460 }
8d063cd8 1461
86a59229 1462 exit( 1 ) if $error;
1463}
8d063cd8 1464
8d063cd8 1465
86a59229 1466##############
1467#### MAIN ####
1468##############
8d063cd8 1469
86a59229 1470sub usage(){
1471 print STDERR "Usage: sed [-an] command [file...]\n";
1472 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1473}
8d063cd8 1474
86a59229 1475###################
1476# Here we go again...
1477#
1478my $expr = 0;
1479while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1480 my $opt = $1;
1481 my $arg = $2;
1482 shift( @ARGV );
1483 if( $opt eq 'e' ){
1484 if( length( $arg ) ){
1485 push( @Commands, split( "\n", $arg ) );
1486 } elsif( @ARGV ){
1487 push( @Commands, shift( @ARGV ) );
1488 } else {
1489 Warn( "option -e requires an argument" );
1490 usage();
1491 exit( 1 );
1492 }
1493 $expr++;
1494 $Defined{$#Commands} = " #$expr";
1495 next;
1496 }
1497 if( $opt eq 'f' ){
1498 my $path;
1499 if( length( $arg ) ){
1500 $path = $arg;
1501 } elsif( @ARGV ){
1502 $path = shift( @ARGV );
1503 } else {
1504 Warn( "option -f requires an argument" );
1505 usage();
1506 exit( 1 );
1507 }
1508 my $fst = $#Commands + 1;
1509 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1510 my $cmd;
1511 while( defined( $cmd = <SCRIPT> ) ){
1512 chomp( $cmd );
1513 push( @Commands, $cmd );
1514 }
1515 close( SCRIPT );
1516 if( $#Commands >= $fst ){
1517 $Defined{$fst} = "$path";
8d063cd8 1518 }
86a59229 1519 next;
1520 }
1521 if( $opt eq '-' && $arg eq '' ){
1522 last;
1523 }
1524 if( $opt eq 'h' || $opt eq '?' ){
1525 usage();
1526 exit( 0 );
1527 }
1528 if( $opt eq 'n' ){
1529 $doAutoPrint = 0;
1530 } elsif( $opt eq 'a' ){
1531 $doOpenWrite = 0;
1532 } else {
1533 Warn( "illegal option `$opt'" );
1534 usage();
1535 exit( 1 );
1536 }
1537 if( length( $arg ) ){
1538 unshift( @ARGV, "-$arg" );
1539 }
1540}
8d063cd8 1541
86a59229 1542# A singleton command may be the 1st argument when there are no options.
1543#
1544if( @Commands == 0 ){
1545 if( @ARGV == 0 ){
1546 Warn( "no script command given" );
1547 usage();
1548 exit( 1 );
1549 }
1550 push( @Commands, split( "\n", shift( @ARGV ) ) );
1551 $Defined{0} = ' #1';
1552}
8d063cd8 1553
86a59229 1554print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1555
86a59229 1556# generate leading code
1557#
1558 $Code = <<'[TheEnd]';
1559
1560sub openARGV(){
1561 unshift( @ARGV, '-' ) unless @ARGV;
1562 my $file = shift( @ARGV );
1563 open( ARG, "<$file" )
1564 || die( "$0: can't open $file for reading ($!)\n" );
1565 $isEOF = 0;
1566}
8d063cd8 1567
86a59229 1568sub getsARGV(;\$){
1569 my $argref = @_ ? shift() : \$_;
1570 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1571 close( ARG );
1572 return 0 unless @ARGV;
1573 my $file = shift( @ARGV );
1574 open( ARG, "<$file" )
1575 || die( "$0: can't open $file for reading ($!)\n" );
1576 $isEOF = 0;
1577 }
1578 1;
1579}
8d063cd8 1580
86a59229 1581sub eofARGV(){
1582 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1583}
8d063cd8 1584
86a59229 1585sub makeHandle($){
1586 my( $path ) = @_;
1587 my $handle;
1588 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1589 $handle = $wFiles{$path} = gensym();
1590 if( $doOpenWrite ){
1591 if( ! open( $handle, ">$path" ) ){
1592 die( "$0: can't open $path for writing: ($!)\n" );
1593 }
9ef589d8 1594 }
86a59229 1595 } else {
1596 $handle = $wFiles{$path};
1597 }
1598 return $handle;
1599}
9ef589d8 1600
86a59229 1601sub _r($){
1602 my $path = shift();
1603 push( @Q, \$path );
1604}
8d063cd8 1605
86a59229 1606sub _l(){
1607 my $h = $_;
1608 my $mcpl = 70;
1609 $h =~ s/\\/\\\\/g;
1610 if( $h =~ /[^[:print:]]/ ){
1611 $h =~ s/\a/\\a/g;
1612 $h =~ s/\f/\\f/g;
1613 $h =~ s/\n/\\n/g;
1614 $h =~ s/\t/\\t/g;
1615 $h =~ s/\r/\\r/g;
1616 $h =~ s/\e/\\e/g;
1617 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1618 }
1619 while( length( $h ) > $mcpl ){
1620 my $l = substr( $h, 0, $mcpl-1 );
1621 $h = substr( $h, $mcpl );
1622 # remove incomplete \-escape from end of line
1623 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1624 $h = $1 . $h;
8d063cd8 1625 }
86a59229 1626 print $l, "\\\n";
8d063cd8 1627 }
86a59229 1628 print "$h\$\n";
8d063cd8 1629}
1630
86a59229 1631sub _w($){
1632 my $path = shift();
1633 my $handle = $wFiles{$path};
1634 if( ! $doOpenWrite &&
1635 ! defined( fileno( $handle ) ) ){
1636 open( $handle, ">$path" )
1637 || die( "$0: $path: cannot open ($!)\n" );
1638 }
1639 print $handle $_, "\n";
1640}
a687059c 1641
86a59229 1642# condition register test/reset
1643#
1644sub _t(){
1645 my $res = $CondReg;
1646 $CondReg = 0;
1647 $res;
1648}
0a12ae7d 1649
86a59229 1650# printQ
1651#
1652sub printQ(){
1653 for my $q ( @Q ){
1654 if( ref( $q ) ){
1655 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1656 open( $wFiles{$$q}, ">>$$q" );
1657 }
1658 if( open( RF, "<$$q" ) ){
1659 my $line;
1660 while( defined( $line = <RF> ) ){
1661 print $line;
1662 }
1663 close( RF );
1664 }
1665 } else {
1666 print $q;
a687059c 1667 }
86a59229 1668 }
1669 undef( @Q );
1670}
1671
1672sub Run(){
1673 my( $h, $icnt, $s, $n );
1674 # hack (not unbreakable :-/) to avoid // matching an empty string
1675 my $z = "\000"; $z =~ /$z/;
1676 # Initialize.
1677 openARGV();
1678 $Hold = '';
1679 $CondReg = 0;
1680 $doPrint = $doAutoPrint;
1681CYCLE:
1682 while( getsARGV() ){
1683 chomp();
1684 $CondReg = 0; # cleared on t
1685BOS:;
1686[TheEnd]
1687
1688 # parse - avoid opening files when doing s2p
1689 #
1690 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1691 if $doGenerate;
1692 Parse();
1693 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1694 if $doGenerate;
1695
1696 # append trailing code
1697 #
1698 $Code .= <<'[TheEnd]';
1699EOS: if( $doPrint ){
1700 print $_, "\n";
1701 } else {
1702 $doPrint = $doAutoPrint;
a687059c 1703 }
86a59229 1704 printQ() if @Q;
a687059c 1705 }
86a59229 1706
1707 exit( 0 );
a687059c 1708}
86a59229 1709[TheEnd]
1710
1711# magic "#n" - same as -n option
1712#
1713$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1714
86a59229 1715# eval code - check for errors
1716#
1717print "Code:\n$Code" if $useDEBUG;
1718eval $Code;
1719if( $@ ){
1720 print "Code:\n$Code";
1721 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8 1722}
1723
86a59229 1724if( $doGenerate ){
1725
1726 # write full Perl program
1727 #
1728
1729 # bang line, declarations
1730 print <<TheEnd;
1731#!$perlpath -w
1732eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1733 if 0;
1734\$0 =~ s/^.*?(\\w+)\$/\$1/;
1735
1736use strict;
1737use Symbol;
1738use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1739 \$doAutoPrint \$doOpenWrite \$doPrint };
1740\$doAutoPrint = $doAutoPrint;
1741\$doOpenWrite = $doOpenWrite;
1742TheEnd
1743
1744 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1745 if( $wf ne "''" ){
1746 print <<TheEnd;
1747sub makeHandle(\$);
1748for my \$p ( $wf ){
1749 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1750}
86a59229 1751TheEnd
1752 }
9ef589d8 1753
86a59229 1754 print $Code;
1755 print "&Run()\n";
1756 exit( 0 );
1aa91729 1757
86a59229 1758} else {
1759
1760 # execute: make handles (and optionally open) all w files; run!
1aa91729 1761
86a59229 1762 for my $p ( keys( %wFiles ) ){
1763 exit( 1 ) unless makeHandle( $p );
1764 }
1765 &Run();
1aa91729 1766}
86a59229 1767
1768
1769=head1 ENVIRONMENT
1770
1771The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1772See L<"Additional Atoms">.
1773
1774=head1 DIAGNOSTICS
1775
1776=over 4
1777
1778=item ambiguos translation for character `%s' in `y' command
1779
1780The indicated character appears twice, with different translations.
1781
1782=item `[' cannot be last in pattern
1783
1784A `[' in a BRE indicates the beginning of a I<bracket expression>.
1785
1786=item `\' cannot be last in pattern
1787
1788A `\' in a BRE is used to make the subsequent character literal.
1789
1790=item `\' cannot be last in substitution
1791
1792A `\' in a subsitution string is used to make the subsequent character literal.
1793
1794=item conflicting flags `%s'
1795
1796In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1797multiple n-th occurrence flags are specified. Note that only the digits
1798`1' through `9' are permitted.
1799
1800=item duplicate label %s (first defined at %s)
1801
1802=item excess address(es)
1803
1804The command has more than the permitted number of addresses.
1805
1806=item extra characters after command (%s)
1807
1808=item illegal option `%s'
1809
1810=item improper delimiter in s command
1811
1812The BRE and substitution may not be delimited with `\' or newline.
1813
1814=item invalid address after `,'
1815
1816=item invalid backreference (%s)
1817
1818The specified backreference number exceeds the number of backreferences
1819in the BRE.
1820
1821=item invalid repeat clause `\{%s\}'
1822
1823The repeat clause does not contain a valid integer value, or pair of
1824values.
1825
1826=item malformed regex, 1st address
1827
1828=item malformed regex, 2nd address
1829
1830=item malformed regular expression
1831
1832=item malformed substitution expression
1833
1834=item malformed `y' command argument
1835
1836The first or second string of a B<y> command is syntactically incorrect.
1837
1838=item maximum less than minimum in `\{%s\}'
1839
1840=item no script command given
1841
1842There must be at least one B<-e> or one B<-f> option specifying a
1843script or script file.
1844
1845=item `\' not valid as delimiter in `y' command
1846
1847=item option -e requires an argument
1848
1849=item option -f requires an argument
1850
1851=item `s' command requires argument
1852
1853=item start of unterminated `{'
1854
1855=item string lengths in `y' command differ
1856
1857The translation table strings in a B<y> commanf must have equal lengths.
1858
1859=item undefined label `%s'
1860
1861=item unexpected `}'
1862
1863A B<}> command without a preceding B<{> command was encountered.
1864
1865=item unexpected end of script
1866
1867The end of the script was reached although a text line after a
1868B<a>, B<c> or B<i> command indicated another line.
1869
1870=item unknown command `%s'
1871
1872=item unterminated `['
1873
1874A BRE contains an unterminated bracket expression.
1875
1876=item unterminated `\('
1877
1878A BRE contains an unterminated backreference.
1879
1880=item `\{' without closing `\}'
1881
1882A BRE contains an unterminated bounds specification.
1883
1884=item `\)' without preceding `\('
1885
1886=item `y' command requires argument
1887
1888=back
1889
1890=head1 EXAMPLE
1891
1892The basic material for the preceding section was generated by running
1893the sed script
1894
1895 #no autoprint
1896 s/^.*Warn( *"\([^"]*\)".*$/\1/
1897 t process
1898 b
1899 :process
1900 s/$!/%s/g
1901 s/$[_[:alnum:]]\{1,\}/%s/g
1902 s/\\\\/\\/g
1903 s/^/=item /
1904 p
1905
1906on the program's own text, and piping the output into C<sort -u>.
1907
1908
1909=head1 SED SCRIPT TRANSLATION
1910
1911If this program is invoked with the name F<s2p> it will act as a
1912sed-to-Perl translator. After option processing (all other
1913arguments are ignored), a Perl program is printed on standard
1914output, which will process the input stream (as read from all
1915arguments) in the way defined by the sed script and the option setting
1916used for the translation.
1917
1918=head1 SEE ALSO
1919
1920perl(1), re_format(7)
1921
1922=head1 BUGS
1923
1924The B<l> command will show escape characters (ESC) as `C<\e>', but
1925a vertical tab (VT) in octal.
1926
1927Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
1928
1929The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
1930is "the last pattern used, at run time". This deviates from the Perl
1931interpretation, which will re-use the "last last successfully executed
1932regular expression". Since keeping track of pattern usage would create
1933terribly cluttered code, and differences would only appear in obscure
1934context (where other B<sed> implementations appear to deviate, too),
1935the Perl semantics was adopted. Note that common usage of this feature,
1936such as in C</abc/s//xyz/>, will work as expected.
1937
1938Collating elements (of bracket expressions in BREs) are not implemented.
1939
1940=head1 STANDARDS
1941
1942This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
1943definition of B<sed>, and is compatible with the I<OpenBSD>
1944implementation, except where otherwise noted (see L<"BUGS">).
1945
1946=head1 AUTHOR
1947
1948This Perl implementation of I<sed> was written by Wolfgang Laun,
1949I<Wolfgang.Laun@alcatel.at>.
1950
1951=head1 COPYRIGHT and LICENSE
1952
1953This program is free and open software. You may use, modify,
1954distribute, and sell this program (and any modified variants) in any
1955way you wish, provided you do not restrict others from doing the same.
1956
1957=cut
1958
a687059c 1959!NO!SUBS!
4633a7c4 1960
1961close OUT or die "Can't close $file: $!";
1962chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
1963exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 1964chdir $origdir;