Update Changes.
[p5sagit/p5-mst-13.2.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4 2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
4633a7c4 6
7# List explicitly here the variables you want Configure to
8# generate. Metaconfig only looks for shell variables, so you
9# have to mention them as if they were shell variables, not
10# %Config entries. Thus you write
11# $startperl
12# to ensure Configure will look for $Config{startperl}.
13
14# This forces PL files to create target in same directory as PL file.
15# This is so that make depend always knows where to find PL derivatives.
8a5546a1 16$origdir = cwd;
44a8e56a 17chdir dirname($0);
18$file = basename($0, '.PL');
774d564b 19$file .= '.com' if $^O eq 'VMS';
4633a7c4 20
21open OUT,">$file" or die "Can't create $file: $!";
22
23print "Extracting $file (with variable substitutions)\n";
24
25# In this section, perl variables will be expanded during extraction.
26# You can use $Config{...} to use Configure variables.
27
28print OUT <<"!GROK!THIS!";
5f05dabc 29$Config{startperl}
30 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
31 if \$running_under_some_shell;
ed6d8ea1 32my $startperl;
33my $perlpath;
34(\$startperl = <<'/../') =~ s/\\s*\\z//;
35$Config{startperl}
36/../
37(\$perlpath = <<'/../') =~ s/\\s*\\z//;
38$Config{perlpath}
39/../
a687059c 40!GROK!THIS!
41
4633a7c4 42# In the following, perl variables are not expanded during extraction.
43
44print OUT <<'!NO!SUBS!';
a687059c 45
86a59229 46$0 =~ s/^.*?(\w+)$/$1/;
47
48# (p)sed - a stream editor
49# History: Aug 12 2000: Original version.
50
51use strict;
52use integer;
53use Symbol;
8d063cd8 54
d83e3bda 55=head1 NAME
56
ba34211e 57psed - a stream editor
d83e3bda 58
59=head1 SYNOPSIS
60
ba34211e 61 psed [-an] script [file ...]
62 psed [-an] [-e script] [-f script-file] [file ...]
63
64 s2p [-an] [-e script] [-f script-file]
d83e3bda 65
66=head1 DESCRIPTION
67
86a59229 68A stream editor reads the input stream consisting of the specified files
69(or standard input, if none are given), processes is line by line by
70applying a script consisting of edit commands, and writes resulting lines
71to standard output. The filename `C<->' may be used to read standard input.
72
73The edit script is composed from arguments of B<-e> options and
74script-files, in the given order. A single script argument may be specified
75as the first parameter.
76
77If this program is invoked with the name F<s2p>, it will act as a
78sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 79
86a59229 80B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 81
86a59229 82=head1 OPTIONS
d83e3bda 83
86a59229 84=over 4
d83e3bda 85
86a59229 86=item B<-a>
d83e3bda 87
86a59229 88A file specified as argument to the B<w> edit command is by default
89opened before input processing starts. Using B<-a>, opening of such
90files is delayed until the first line is actually written to the file.
91
92=item B<-e> I<script>
93
94The editing commands defined by I<script> are appended to the script.
95Multiple commands must be separated by newlines.
96
97=item B<-f> I<script-file>
98
99Editing commands from the specified I<script-file> are read and appended
100to the script.
d83e3bda 101
102=item B<-n>
103
86a59229 104By default, a line is written to standard output after the editing script
105has been applied to it. The B<-n> option suppresses automatic printing.
106
107=back
108
109=head1 COMMANDS
110
111B<sed> command syntax is defined as
112
113Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
114
115with whitespace being permitted before or after addresses, and between
116the function character and the argument. The I<address>es and the
117address inverter (C<!>) are used to restrict the application of a
118command to the selected line(s) of input.
119
120Each command must be on a line of its own, except where noted in
121the synopses below.
122
123The edit cycle performed on each input line consist of reading the line
124(without its trailing newline character) into the I<pattern space>,
125applying the applicable commands of the edit script, writing the final
126contents of the pattern space and a newline to the standard output.
127A I<hold space> is provided for saving the contents of the
128pattern space for later use.
129
130=head2 Addresses
131
132A sed address is either a line number or a pattern, which may be combined
133arbitrarily to construct ranges. Lines are numbered across all input files.
134
135Any address may be followed by an exclamation mark (`C<!>'), selecting
136all lines not matching that address.
d83e3bda 137
86a59229 138=over 4
d83e3bda 139
86a59229 140=item I<number>
141
142The line with the given number is selected.
143
144=item B<$>
145
146A dollar sign (C<$>) is the line number of the last line of the input stream.
147
148=item B</>I<regular expression>B</>
149
150A pattern address is a basic regular expression (see
151L<"Basic Regular Expressions">), between the delimiting character C</>.
152Any other character except C<\> or newline may be used to delimit a
153pattern address when the initial delimiter is prefixed with a
154backslash (`C<\>').
d83e3bda 155
156=back
157
86a59229 158If no address is given, the command selects every line.
d83e3bda 159
86a59229 160If one address is given, it selects the line (or lines) matching the
161address.
d83e3bda 162
86a59229 163Two addresses select a range that begins whenever the first address
164matches, and ends (including that line) when the second address matches.
165If the first (second) address is a matching pattern, the second
166address is not applied to the very same line to determine the end of
167the range. Likewise, if the second address is a matching pattern, the
168first address is not applied to the very same line to determine the
169begin of another range. If both addresses are line numbers,
170and the second line number is less than the first line number, then
171only the first line is selected.
d83e3bda 172
d83e3bda 173
86a59229 174=head2 Functions
d83e3bda 175
86a59229 176The maximum permitted number of addresses is indicated with each
177function synopsis below.
d83e3bda 178
86a59229 179The argument I<text> consists of one or more lines following the command.
180Embedded newlines in I<text> must be preceded with a backslash. Other
181backslashes in I<text> are deleted and the following character is taken
182literally.
d83e3bda 183
86a59229 184=over 4
d83e3bda 185
86a59229 186=cut
187
188my %ComTab;
189#--------------------------------------------------------------------------
190$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
191
192=item [1addr]B<a\> I<text>
193
194Write I<text> (which must start on the line following the command)
195to standard output immediately before reading the next line
196of input, either by executing the B<N> function or by beginning a new cycle.
197
198=cut
199
200#--------------------------------------------------------------------------
201$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
202
203=item [2addr]B<b> [I<label>]
204
205Branch to the B<:> function with the specified I<label>. If no label
206is given, branch to the end of the script.
207
208=cut
209
210#--------------------------------------------------------------------------
211$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
212{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
213-X-
214### continue OK => next CYCLE;
215
216=item [2addr]B<c\> I<text>
217
218The line, or range of lines, selected by the address is deleted.
219The I<text> (which must start on the line following the command)
220is written to standard output. With an address range, this occurs at
221the end of the range.
222
223=cut
224
225#--------------------------------------------------------------------------
226$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
227{ $doPrint = 0;
228 goto EOS;
229}
230-X-
231### continue OK => next CYCLE;
232
233=item [2addr]B<d>
234
235Deletes the pattern space and starts the next cycle.
236
237=cut
238
239#--------------------------------------------------------------------------
240$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
241{ s/^.*\n?//;
242 if(length($_)){ goto BOS } else { goto EOS }
243}
244-X-
245### continue OK => next CYCLE;
246
247=item [2addr]B<D>
248
249Deletes the pattern space through the first embedded newline or to the end.
250If the pattern space becomes empty, a new cycle is started, otherwise
251execution of the script is restarted.
252
253=cut
254
255#--------------------------------------------------------------------------
256$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
257
258=item [2addr]B<g>
259
260Replace the contents of the pattern space with the hold space.
261
262=cut
263
264#--------------------------------------------------------------------------
265$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
266
267=item [2addr]B<G>
268
269Append a newline and the contents of the hold space to the pattern space.
270
271=cut
272
273#--------------------------------------------------------------------------
274$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
275
276=item [2addr]B<h>
277
278Replace the contents of the hold space with the pattern space.
279
280=cut
281
282#--------------------------------------------------------------------------
283$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
284
285=item [2addr]B<H>
286
287Append a newline and the contents of the pattern space to the hold space.
288
289=cut
290
291#--------------------------------------------------------------------------
292$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
293
294=item [1addr]B<i\> I<text>
295
296Write the I<text> (which must start on the line following the command)
297to standard output.
298
299=cut
300
301#--------------------------------------------------------------------------
302$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
303
304=item [2addr]B<l>
305
306Print the contents of the pattern space: non-printable characters are
307shown in C-style escaped form; long lines are split and have a trailing
308`C<\>' at the point of the split; the true end of a line is marked with
309a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
310BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
311octal number for all other non-printable characters.
312
313=cut
314
315#--------------------------------------------------------------------------
316$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
317{ print $_, "\n" if $doPrint;
318 printQ if @Q;
319 $CondReg = 0;
320 last CYCLE unless getsARGV();
321 chomp();
322}
323-X-
324
325=item [2addr]B<n>
326
327If automatic printing is enabled, write the pattern space to the standard
328output. Replace the pattern space with the next line of input. If
329there is no more input, processing is terminated.
330
331=cut
332
333#--------------------------------------------------------------------------
334$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
335{ printQ if @Q;
336 $CondReg = 0;
337 last CYCLE unless getsARGV( $h );
338 chomp( $h );
339 $_ .= "\n$h";
340}
341-X-
342
343=item [2addr]B<N>
344
345Append a newline and the next line of input to the pattern space. If
346there is no more input, processing is terminated.
347
348=cut
349
350#--------------------------------------------------------------------------
351$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
352
353=item [2addr]B<p>
354
355Print the pattern space to the standard output. (Use the B<-n> option
356to suppress automatic printing at the end of a cycle if you want to
357avoid double printing of lines.)
358
359=cut
360
361#--------------------------------------------------------------------------
362$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
363{ if( /^(.*)/ ){ print $1, "\n"; } }
364-X-
365
366=item [2addr]B<P>
367
368Prints the pattern space through the first embedded newline or to the end.
369
370=cut
371
372#--------------------------------------------------------------------------
373$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
374{ print $_, "\n" if $doPrint;
375 last CYCLE;
376}
377-X-
378
379=item [1addr]B<q>
380
381Branch to the end of the script and quit without starting a new cycle.
382
383=cut
384
385#--------------------------------------------------------------------------
386$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
387### FIXME: lazy reading - big files???
388
389=item [1addr]B<r> I<file>
390
391Copy the contents of the I<file> to standard output immediately before
392the next attempt to read a line of input. Any error encountered while
393reading I<file> is silently ignored.
394
395=cut
396
397#--------------------------------------------------------------------------
398$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
399
400=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
401
402Substitute the I<replacement> string for the first substring in
403the pattern space that matches the I<regular expression>.
404Any character other than backslash or newline can be used instead of a
405slash to delimit the regular expression and the replacement.
406To use the delimiter as a literal character within the regular expression
407and the replacement, precede the character by a backslash (`C<\>').
408
409Literal newlines may be embedded in the replacement string by
410preceding a newline with a backslash.
411
412Within the replacement, an ampersand (`C<&>') is replaced by the string
413matching the regular expression. The strings `C<\1>' through `C<\9>' are
414replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
415To get a literal `C<&>' or `C<\>' in the replacement text, precede it
416by a backslash.
417
418The following I<flags> modify the behaviour of the B<s> command:
419
420=over 8
421
422=item B<g>
423
424The replacement is performed for all matching, non-overlapping substrings
425of the pattern space.
426
427=item B<1>..B<9>
428
429Replace only the n-th matching substring of the pattern space.
430
431=item B<p>
432
433If the substitution was made, print the new value of the pattern space.
434
435=item B<w> I<file>
436
437If the substitution was made, write the new value of the pattern space
438to the specified file.
439
440=back
441
442=cut
443
444#--------------------------------------------------------------------------
445$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
446
447=item [2addr]B<t> [I<label>]
448
449Branch to the B<:> function with the specified I<label> if any B<s>
450substitutions have been made since the most recent reading of an input line
451or execution of a B<t> function. If no label is given, branch to the end of
452the script.
453
454
455=cut
456
457#--------------------------------------------------------------------------
458$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
459
460=item [2addr]B<w> I<file>
461
462The contents of the pattern space are written to the I<file>.
463
464=cut
465
466#--------------------------------------------------------------------------
467$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
468
469=item [2addr]B<x>
470
471Swap the contents of the pattern space and the hold space.
472
473=cut
474
475#--------------------------------------------------------------------------
476$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
477=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
478
479In the pattern space, replace all characters occuring in I<string1> by the
480character at the corresponding position in I<string2>. It is possible
481to use any character (other than a backslash or newline) instead of a
482slash to delimit the strings. Within I<string1> and I<string2>, a
483backslash followed by any character other than a newline is that literal
484character, and a backslash followed by an `n' is replaced by a newline
485character.
486
487=cut
d83e3bda 488
86a59229 489#--------------------------------------------------------------------------
490$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
491
492=item [1addr]B<=>
493
494Prints the current line number on the standard output.
495
496=cut
497
498#--------------------------------------------------------------------------
499$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
500
501=item [0addr]B<:> [I<label>]
502
503The command specifies the position of the I<label>. It has no other effect.
504
505=cut
506
507#--------------------------------------------------------------------------
508$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
509$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
510# ';' to avoid warning on empty {}-block
511
512=item [2addr]B<{> [I<command>]
513
514=item [0addr]B<}>
515
516These two commands begin and end a command list. The first command may
517be given on the same line as the opening B<{> command. The commands
518within the list are jointly selected by the address(es) given on the
519B<{> command (but may still have individual addresses).
520
521=cut
522
523#--------------------------------------------------------------------------
524$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
525
526=item [0addr]B<#> [I<comment>]
527
528The entire line is ignored (treated as a comment). If, however, the first
529two characters in the script are `C<#n>', automatic printing of output is
530suppressed, as if the B<-n> option were given on the command line.
531
532=back
533
534=cut
535
536use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
537
538my $useDEBUG = exists( $ENV{PSEDDEBUG} );
539my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
540$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
541
542my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
543my $doOpenWrite = 1; # open w command output files at start (-a => 0)
544my $svOpenWrite = 0; # save $doOpenWrite
545my $doGenerate = $0 eq 's2p';
546
547# Collected and compiled script
548#
549my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code );
550
551##################
552# Compile Time
553#
554# Labels
555#
556# Error handling
557#
558sub Warn($;$){
559 my( $msg, $loc ) = @_;
560 $loc ||= '';
561 $loc .= ': ' if length( $loc );
562 warn( "$0: $loc$msg\n" );
563}
564
565$labNum = 0;
566sub newLabel(){
567 return 'L_'.++$labNum;
568}
569
570# safeHere: create safe here delimiter and modify opcode and argument
571#
572sub safeHere($$){
573 my( $codref, $argref ) = @_;
574 my $eod = 'EOD000';
575 while( $$argref =~ /^$eod$/m ){
576 $eod++;
577 }
578 $$codref =~ s/TheEnd/$eod/e;
579 $$argref .= "$eod\n";
580}
581
582# Emit: create address logic and emit command
583#
584sub Emit($$$$$$){
585 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
586 my $cond = '';
587 if( defined( $addr1 ) ){
588 if( defined( $addr2 ) ){
589 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
590 } else {
591 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
592 }
593 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
594 }
595
596 if( $opcode eq '' ){
597 $Code .= "$cond$arg\n";
598
599 } elsif( $opcode =~ s/-X-/$arg/e ){
600 $Code .= "$cond$opcode\n";
601
602 } elsif( $opcode =~ /TheEnd/ ){
603 safeHere( \$opcode, \$arg );
604 $Code .= "$cond$opcode$arg";
605
606 } else {
607 $Code .= "$cond$opcode\n";
608 }
609 0;
610}
611
612# Write (w command, w flag): store pathname
613#
614sub Write($$$$$$){
615 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
616 $wFiles{$path} = '';
617 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
618}
619
620
621# Label (: command): label definition
622#
623sub Label($$$$$$){
624 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
625 my $rc = 0;
626 $lab =~ s/\s+//;
627 if( length( $lab ) ){
628 my $h;
629 if( ! exists( $Label{$lab} ) ){
630 $h = $Label{$lab}{name} = newLabel();
631 } else {
632 $h = $Label{$lab}{name};
633 if( exists( $Label{$lab}{defined} ) ){
634 my $dl = $Label{$lab}{defined};
635 Warn( "duplicate label $lab (first defined at $dl)", $fl );
636 $rc = 1;
637 }
638 }
639 $Label{$lab}{defined} = $fl;
640 $Code .= "$h:;\n";
641 }
642 $rc;
643}
644
645# BeginBlock ({ command): push block start
646#
647sub BeginBlock($$$$$$){
648 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
649 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
650 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
651}
652
653# EndBlock (} command): check proper nesting
654#
655sub EndBlock($$$$$$){
656 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
657 my $rc;
658 my $jcom = pop( @BlockStack );
659 if( defined( $jcom ) ){
660 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
661 } else {
662 Warn( "unexpected `}'", $fl );
663 $rc = 1;
664 }
665 $rc;
666}
667
668# Branch (t, b commands): check or create label, substitute default
669#
670sub Branch($$$$$$){
671 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
672 $lab =~ s/\s+//; # no spaces at end
673 my $h;
674 if( length( $lab ) ){
675 if( ! exists( $Label{$lab} ) ){
676 $h = $Label{$lab}{name} = newLabel();
677 } else {
678 $h = $Label{$lab}{name};
679 }
680 push( @{$Label{$lab}{used}}, $fl );
681 } else {
682 $h = 'EOS';
683 }
684 $opcode =~ s/XXX/$h/e;
685 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
686}
687
688# Change (c command): is special due to range end watching
689#
690sub Change($$$$$$){
691 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
692 my $kwd = $negated ? 'unless' : 'if';
693 if( defined( $addr2 ) ){
694 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
695 if( ! $negated ){
696 $addr1 = '$icnt = ('.$addr1.')';
697 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
698 }
699 } else {
700 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
701 }
702 safeHere( \$opcode, \$arg );
703 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
704 0;
705}
706
707
708# Comment (# command): A no-op. Who would've thought that!
709#
710sub Comment($$$$$$){
711 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
712### $Code .= "# $arg\n";
713 0;
714}
715
716
717sub stripRegex($$){
718 my( $del, $sref ) = @_;
719 my $regex = $del;
720 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
721 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
722 my $sl = $2;
723 $regex .= $1.$sl.$del;
724 if( length( $sl ) % 2 == 0 ){
725 return $regex;
726 }
727 $regex .= $3;
728 }
729 undef();
730}
731
732# stripTrans: take a <del> terminated string from y command
733# honoring and cleaning up of \-escaped <del>'s
734#
735sub stripTrans($$){
736 my( $del, $sref ) = @_;
737 my $t = '';
738 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
739 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
740 my $sl = $2;
741 $t .= $1;
742 if( length( $sl ) % 2 == 0 ){
743 $t .= $sl;
744 $t =~ s/\\\\/\\/g;
745 return $t;
746 }
747 chop( $sl );
748 $t .= $sl.$del.$3;
749 }
750 undef();
751}
752
753# makey - construct Perl y/// from sed y///
754#
755sub makey($$$){
756 my( $fr, $to, $fl ) = @_;
757 my $error = 0;
758
759 # Ensure that any '-' is up front.
760 # Diagnose duplicate contradicting mappings
761 my %tr;
762 for( my $i = 0; $i < length($fr); $i++ ){
763 my $fc = substr($fr,$i,1);
764 my $tc = substr($to,$i,1);
765 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
766 Warn( "ambiguos translation for character `$fc' in `y' command",
767 $fl );
768 $error++;
769 }
770 $tr{$fc} = $tc;
771 }
772 $fr = $to = '';
773 if( exists( $tr{'-'} ) ){
774 ( $fr, $to ) = ( '-', $tr{'-'} );
775 delete( $tr{'-'} );
776 } else {
777 $fr = $to = '';
778 }
779 # might just as well sort it...
780 for my $fc ( sort keys( %tr ) ){
781 $fr .= $fc;
782 $to .= $tr{$fc};
783 }
784 # make embedded delimiters and newlines safe
785 $fr =~ s/([{}])/\$1/g;
786 $to =~ s/([{}])/\$1/g;
787 $fr =~ s/\n/\\n/g;
788 $to =~ s/\n/\\n/g;
789 return $error ? undef() : "{ y{$fr}{$to}; }";
790}
791
792######
793# makes - construct Perl s/// from sed s///
794#
795sub makes($$$$$$$){
796 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
797
798 # make embedded newlines safe
799 $regex =~ s/\n/\\n/g;
800 $subst =~ s/\n/\\n/g;
801
802 my $code;
803 # n-th occurrence
804 #
805 if( length( $nmatch ) ){
806 $code = <<TheEnd;
807{ \$n = $nmatch;
808 while( --\$n && ( \$s = m ${regex}g ) ){}
809 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
810 \$CondReg ||= \$s;
811TheEnd
812 } else {
813 $code = <<TheEnd;
814{ \$s = s ${regex}${subst}s${global};
815 \$CondReg ||= \$s;
816TheEnd
817 }
818 if( $print ){
819 $code .= ' print $_, "\n" if $s;'."\n";
820 }
821 if( defined( $path ) ){
822 $wFiles{$path} = '';
823 $code .= " _w( '$path' ) if \$s;\n";
824 }
825 $code .= "}";
826}
827
828=head1 BASIC REGULAR EXPRESSIONS
829
830A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
831of I<atoms>, for matching parts of a string, and I<bounds>, specifying
832repetitions of a preceding atom.
833
834=head2 Atoms
835
836The possible atoms of a BRE are: B<.>, matching any single character;
837B<^> and B<$>, matching the null string at the beginning or end
838of a string, respectively; a I<bracket expressions>, enclosed
839in B<[> and B<]> (see below); and any single character with no
840other significance (matching that character). A B<\> before one
841of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
842after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
843becomes an atom and establishes the target for a I<backreference>,
844consisting of the substring that actually matches the enclosed atoms.
845Finally, B<\> followed by one of the digits B<0> through B<9> is a
846backreference.
847
848A B<^> that is not first, or a B<$> that is not last does not have
849a special significance and need not be preceded by a backslash to
850become literal. The same is true for a B<]>, that does not terminate
851a bracket expression.
852
853An unescaped backslash cannot be last in a BRE.
854
855=head2 Bounds
856
857The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
858atom; B<\{>I<count>B<\}>, specifying that many repetitions;
859B<\{>I<minimum>B<,\}>, giving a lower limit; and
860B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
861bound.
862
863A bound appearing as the first item in a BRE is taken literally.
864
865=head2 Bracket Expressions
866
867A I<bracket expression> is a list of characters, character ranges
868and character classes enclosed in B<[> and B<]> and matches any
869single character from the represented set of characters.
870
871A character range is written as two characters separated by B<-> and
872represents all characters (according to the character collating sequence)
873that are not less than the first and not greater than the second.
874(Ranges are very collating-sequence-dependent, and portable programs
875should avoid relying on them.)
876
877A character class is one of the class names
878
879 alnum digit punct
880 alpha graph space
881 blank lower upper
882 cntrl print xdigit
883
884enclosed in B<[:> and B<:]> and represents the set of characters
885as defined in ctype(3).
886
887If the first character after B<[> is B<^>, the sense of matching is
888inverted.
889
890To include a literal `C<^>', place it anywhere else but first. To
891include a literal 'C<]>' place it first or immediately after an
892initial B<^>. To include a literal `C<->' make it the first (or
893second after B<^>) or last character, or the second endpoint of
894a range.
895
896The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
897match the null string at the beginning and end of a word respectively.
898(Note that neither is identical to Perl's `\b' atom.)
899
900=head2 Additional Atoms
901
902Since some sed implementations provide additional regular expression
903atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
904the following backslash escapes:
905
906=over 4
907
908=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
909
910=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
911
912=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
913
914=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
915
916=item B<\y> Match the empty string at a word boundary.
917
918=item B<\B> Match the empty string between any two either word or non-word characters.
919
920=back
921
922To enable this feature, the environment variable PSEDEXTBRE must be set
923to a string containing the requested characters, e.g.:
924C<PSEDEXTBRE='E<lt>E<gt>wW'>.
925
926=cut
927
928#####
929# bre2p - convert BRE to Perl RE
930#
931sub peek(\$$){
932 my( $pref, $ic ) = @_;
933 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
934}
935
936sub bre2p($$$){
937 my( $del, $pat, $fl ) = @_;
938 my $led = $del;
939 $led =~ tr/{([</})]>/;
940 $led = '' if $led eq $del;
941
942 $pat = substr( $pat, 1, length($pat) - 2 );
943 my $res = '';
944 my $bracklev = 0;
945 my $backref = 0;
946 my $parlev = 0;
947 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
948 my $c = substr( $pat, $ic, 1 );
949 if( $c eq '\\' ){
950 ### backslash escapes
951 my $nc = peek($pat,$ic);
952 if( $nc eq '' ){
953 Warn( "`\\' cannot be last in pattern", $fl );
954 return undef();
955 }
956 $ic++;
957 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
958 $res .= "\\$del";
959
960 } elsif( $nc =~ /([[.*\\n])/ ){
961 ## check for \-escaped magics and \n:
962 ## \[ \. \* \\ \n stay as they are
963 $res .= '\\'.$nc;
964
965 } elsif( $nc eq '(' ){ ## \( => (
966 $parlev++;
967 $res .= '(';
968
969 } elsif( $nc eq ')' ){ ## \) => )
970 $parlev--;
971 $backref++;
972 if( $parlev < 0 ){
973 Warn( "unmatched `\\)'", $fl );
974 return undef();
975 }
976 $res .= ')';
977
978 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
979 my $endpos = index( $pat, '\\}', $ic );
980 if( $endpos < 0 ){
981 Warn( "unmatched `\\{'", $fl );
982 return undef();
983 }
984 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
985 $ic = $endpos + 1;
986
987 if( $res =~ /^\^?$/ ){
988 $res .= "\\{$rep\}";
989 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
990 my $min = $1;
991 my $com = $2 || '';
992 my $max = $3;
993 if( length( $max ) ){
994 if( $max < $min ){
995 Warn( "maximum less than minimum in `\\{$rep\\}'",
996 $fl );
997 return undef();
998 }
999 } else {
1000 $max = '';
1001 }
1002 # simplify some
1003 if( $min == 0 && $max eq '1' ){
1004 $res .= '?';
1005 } elsif( $min == 1 && "$com$max" eq ',' ){
1006 $res .= '+';
1007 } elsif( $min == 0 && "$com$max" eq ',' ){
1008 $res .= '*';
1009 } else {
1010 $res .= "{$min$com$max}";
1011 }
1012 } else {
1013 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1014 return undef();
1015 }
1016
1017 } elsif( $nc =~ /^[1-9]$/ ){
1018 ## \1 .. \9 => \1 .. \9, but check for a following digit
1019 if( $nc > $backref ){
1020 Warn( "invalid backreference ($nc)", $fl );
1021 return undef();
1022 }
1023 $res .= "\\$nc";
1024 if( peek($pat,$ic) =~ /[0-9]/ ){
1025 $res .= '(?:)';
1026 }
1027
1028 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1029 ## extensions - at most <>wWyB - not in POSIX
1030 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1031 $res .= '\\b(?<=\\W)';
1032 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1033 $res .= '\\b(?=\\W)';
1034 } elsif( $nc eq 'y' ){ ## \y => \b
1035 $res .= '\\b';
1036 } else { ## \B, \w, \W remain the same
1037 $res .= "\\$nc";
1038 }
1039 } elsif( $nc eq $led ){
1040 ## \<closing bracketing-delimiter> - keep '\'
1041 $res .= "\\$nc";
1042
1043 } else { ## \ <char> => <char> ("as if `\' were not present")
1044 $res .= $nc;
1045 }
1046
1047 } elsif( $c eq '.' ){ ## . => .
1048 $res .= $c;
1049
1050 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1051 if( $res =~ /^\^?$/ ){
1052 $res .= '\\*';
1053 } elsif( substr( $res, -1, 1 ) ne '*' ){
1054 $res .= $c;
1055 }
1056
1057 } elsif( $c eq '[' ){
1058 ## parse []: [^...] [^]...] [-...]
1059 my $add = '[';
1060 if( peek($pat,$ic) eq '^' ){
1061 $ic++;
1062 $add .= '^';
1063 }
1064 my $nc = peek($pat,$ic);
1065 if( $nc eq ']' || $nc eq '-' ){
1066 $add .= $nc;
1067 $ic++;
1068 }
1069 # check that [ is not trailing
1070 if( $ic >= length( $pat ) - 1 ){
1071 Warn( "unmatched `['", $fl );
1072 return undef();
1073 }
1074 # look for [:...:] and x-y
1075 my $rstr = substr( $pat, $ic+1 );
1076 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1077 my $cnt = $1;
1078 $ic += length( $cnt );
1079 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1080 # try some simplifications
1081 my $red = $cnt;
1082 if( $red =~ s/0-9// ){
1083 $cnt = $red.'\d';
1084 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1085 $cnt = $red.'\w';
1086 }
1087 }
1088 $add .= $cnt;
1089
1090 # POSIX 1003.2 has this (optional) for begin/end word
1091 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1092 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1093
1094 }
1095
1096 ## may have a trailing `-' before `]'
1097 if( $ic < length($pat) - 1 &&
1098 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1099 $ic += length( $1 );
1100 $add .= $1;
1101 # another simplification
1102 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1103 $res .= $add;
1104 } else {
1105 Warn( "unmatched `['", $fl );
1106 return undef();
1107 }
3cb6de81 1108
86a59229 1109 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1110 $res .= "\\$c";
d83e3bda 1111
86a59229 1112 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1113 $res .= ']';
d83e3bda 1114
86a59229 1115 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1116 $res .= "\\$c";
d83e3bda 1117
86a59229 1118 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1119 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1120
86a59229 1121 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1122 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1123
86a59229 1124 } else {
1125 $res .= $c;
1126 }
8d063cd8 1127 }
0a12ae7d 1128
86a59229 1129 if( $parlev ){
1130 Warn( "unmatched `\\('", $fl );
1131 return undef();
8d063cd8 1132 }
0a12ae7d 1133
86a59229 1134 # final cleanup: eliminate raw HTs
1135 $res =~ s/\t/\\t/g;
1136 return $del . $res . ( $led ? $led : $del );
1137}
0a12ae7d 1138
86a59229 1139
1140#####
1141# sub2p - convert sed substitution to Perl substitution
1142#
1143sub sub2p($$$){
1144 my( $del, $subst, $fl ) = @_;
1145 my $led = $del;
1146 $led =~ tr/{([</})]>/;
1147 $led = '' if $led eq $del;
1148
1149 $subst = substr( $subst, 1, length($subst) - 2 );
1150 my $res = '';
1151
1152 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1153 my $c = substr( $subst, $ic, 1 );
1154 if( $c eq '\\' ){
1155 ### backslash escapes
1156 my $nc = peek($subst,$ic);
1157 if( $nc eq '' ){
1158 Warn( "`\\' cannot be last in substitution", $fl );
1159 return undef();
1160 }
1161 $ic++;
1162 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1163 $res .= '\\' . $nc;
1164 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1165 $res .= '${' . $nc . '}';
1166 } else { ## everything else (includes &): omit \
1167 $res .= $nc;
1168 }
1169 } elsif( $c eq '&' ){ ## & => $&
1170 $res .= '$&';
1171 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1172 $res .= '\\' . $c;
1173 } else {
1174 $res .= $c;
8d063cd8 1175 }
8d063cd8 1176 }
1177
86a59229 1178 # final cleanup: eliminate raw HTs
1179 $res =~ s/\t/\\t/g;
1180 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1181}
0a12ae7d 1182
86a59229 1183
1184sub Parse(){
1185 my $error = 0;
1186 my( $pdef, $pfil, $plin );
1187 for( my $icom = 0; $icom < @Commands; $icom++ ){
1188 my $cmd = $Commands[$icom];
1189 print "Parse:$cmd:\n" if $useDEBUG;
1190 $cmd =~ s/^\s+//;
1191 next unless length( $cmd );
1192 my $scom = $icom;
1193 if( exists( $Defined{$icom} ) ){
1194 $pdef = $Defined{$icom};
1195 if( $pdef =~ /^ #(\d+)/ ){
1196 $pfil = 'expression #';
1197 $plin = $1;
1198 } else {
1199 $pfil = "$pdef l.";
1200 $plin = 1;
1201 }
1202 } else {
1203 $plin++;
1204 }
1205 my $fl = "$pfil$plin";
1206
1207 # insert command as comment in gnerated code
1208 #
1209 $Code .= "# $cmd\n" if $doGenerate;
1210
1211 # The Address(es)
1212 #
1213 my( $negated, $naddr, $addr1, $addr2 );
1214 $naddr = 0;
1215 if( $cmd =~ s/^(\d+)\s*// ){
1216 $addr1 = "$1"; $naddr++;
1217 } elsif( $cmd =~ s/^\$\s*// ){
1218 $addr1 = 'eofARGV()'; $naddr++;
1219 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1220 my $del = $1;
1221 my $regex = stripRegex( $del, \$cmd );
1222 if( defined( $regex ) ){
1223 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1224 $naddr++;
1225 } else {
1226 Warn( "malformed regex, 1st address", $fl );
1227 $error++;
1228 next;
1229 }
1230 }
1231 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1232 if( $cmd =~ s/^(\d+)\s*// ){
1233 $addr2 = "$1"; $naddr++;
1234 } elsif( $cmd =~ s/^\$\s*// ){
1235 $addr2 = 'eofARGV()'; $naddr++;
1236 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1237 my $del = $1;
1238 my $regex = stripRegex( $del, \$cmd );
1239 if( defined( $regex ) ){
1240 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1241 $naddr++;
1242 } else {
1243 Warn( "malformed regex, 2nd address", $fl );
1244 $error++;
1245 next;
1246 }
1247 } else {
1248 Warn( "invalid address after `,'", $fl );
1249 $error++;
1250 next;
1251 }
1252 }
1253
1254 # address modifier `!'
1255 #
1256 $negated = $cmd =~ s/^!\s*//;
1257 if( defined( $addr1 ) ){
1258 print "Parse: addr1=$addr1" if $useDEBUG;
1259 if( defined( $addr2 ) ){
1260 print ", addr2=$addr2 " if $useDEBUG;
1261 # both numeric and addr1 > addr2 => eliminate addr2
1262 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1263 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1264 }
9ef589d8 1265 }
86a59229 1266 print 'negated' if $useDEBUG && $negated;
1267 print " command:$cmd\n" if $useDEBUG;
1268
1269 # The Command
1270 #
1271 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1272 my $h = substr( $cmd, 0, 1 );
1273 Warn( "unknown command `$h'", $fl );
1274 $error++;
8d063cd8 1275 next;
1276 }
86a59229 1277 my $key = $1;
8d063cd8 1278
86a59229 1279 my $tabref = $ComTab{$key};
1280 if( $naddr > $tabref->[0] ){
1281 Warn( "excess address(es)", $fl );
1282 $error++;
8d063cd8 1283 next;
1284 }
1285
86a59229 1286 my $arg = '';
1287 if( $tabref->[1] eq 'str' ){
1288 # take remainder - don't care if it is empty
1289 $arg = $cmd;
1290 $cmd = '';
1291
1292 } elsif( $tabref->[1] eq 'txt' ){
1293 # multi-line text
1294 my $goon = $cmd =~ /(.*)\\$/;
1295 if( length( $1 ) ){
1296 Warn( "extra characters after command ($cmd)", $fl );
1297 $error++;
8d063cd8 1298 }
86a59229 1299 while( $goon ){
1300 $icom++;
1301 if( $icom > $#Commands ){
1302 Warn( "unexpected end of script", $fl );
1303 $error++;
1304 last;
1305 }
1306 $cmd = $Commands[$icom];
1307 $Code .= "# $cmd\n" if $doGenerate;
1308 $goon = $cmd =~ s/\\$//;
1309 $cmd =~ s/\\(.)/$1/g;
1310 $arg .= "\n" if length( $arg );
1311 $arg .= $cmd;
8d063cd8 1312 }
86a59229 1313 $arg .= "\n" if length( $arg );
1314 $cmd = '';
8d063cd8 1315
86a59229 1316 } elsif( $tabref->[1] eq 'sub' ){
1317 # s///
1318 if( ! length( $cmd ) ){
1319 Warn( "`s' command requires argument", $fl );
1320 $error++;
1321 next;
1322 }
1323 if( $cmd =~ s{^([^\\\n])}{} ){
1324 my $del = $1;
1325 my $regex = stripRegex( $del, \$cmd );
1326 if( ! defined( $regex ) ){
1327 Warn( "malformed regular expression", $fl );
1328 $error++;
1329 next;
a687059c 1330 }
86a59229 1331 $regex = bre2p( $del, $regex, $fl );
1332
1333 # a trailing \ indicates embedded NL (in replacement string)
1334 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1335 $icom++;
1336 if( $icom > $#Commands ){
1337 Warn( "unexpected end of script", $fl );
1338 $error++;
1339 last;
8d063cd8 1340 }
86a59229 1341 $cmd .= $Commands[$icom];
1342 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1343 }
86a59229 1344
1345 my $subst = stripRegex( $del, \$cmd );
1346 if( ! defined( $regex ) ){
1347 Warn( "malformed substitution expression", $fl );
1348 $error++;
0a12ae7d 1349 next;
1350 }
86a59229 1351 $subst = sub2p( $del, $subst, $fl );
1352
1353 # parse s/// modifier: g|p|0-9|w <file>
1354 my( $global, $nmatch, $print, $write ) =
1355 ( '', '', 0, undef );
1356 while( $cmd =~ s/^([gp0-9])// ){
1357 $1 eq 'g' ? ( $global = 'g' ) :
1358 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1359 }
1360 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1361 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1362 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1363 Warn( "conflicting flags `$global$nmatch'", $fl );
1364 $error++;
0a12ae7d 1365 next;
1366 }
86a59229 1367
1368 $arg = makes( $regex, $subst,
1369 $write, $global, $print, $nmatch, $fl );
1370 if( ! defined( $arg ) ){
1371 $error++;
8d063cd8 1372 next;
1373 }
86a59229 1374
1375 } else {
1376 Warn( "improper delimiter in s command", $fl );
1377 $error++;
1378 next;
1379 }
1380
1381 } elsif( $tabref->[1] eq 'tra' ){
1382 # y///
1383 # a trailing \ indicates embedded newline
1384 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1385 $icom++;
1386 if( $icom > $#Commands ){
1387 Warn( "unexpected end of script", $fl );
1388 $error++;
1389 last;
1390 }
1391 $cmd .= $Commands[$icom];
1392 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1393 }
1394 if( ! length( $cmd ) ){
1395 Warn( "`y' command requires argument", $fl );
1396 $error++;
1397 next;
1398 }
1399 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1400 if( $d eq '\\' ){
1401 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1402 $error++;
1403 next;
1404 }
1405 my $fr = stripTrans( $d, \$cmd );
1406 if( ! defined( $fr ) || ! length( $cmd ) ){
1407 Warn( "malformed `y' command argument", $fl );
1408 $error++;
1409 next;
1410 }
1411 my $to = stripTrans( $d, \$cmd );
1412 if( ! defined( $to ) ){
1413 Warn( "malformed `y' command argument", $fl );
1414 $error++;
1415 next;
1416 }
1417 if( length($fr) != length($to) ){
1418 Warn( "string lengths in `y' command differ", $fl );
1419 $error++;
1420 next;
1421 }
1422 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1423 $error++;
1424 next;
8d063cd8 1425 }
8d063cd8 1426
8d063cd8 1427 }
1428
86a59229 1429 # $cmd must be now empty - exception is {
1430 if( $cmd !~ /^\s*$/ ){
1431 if( $key eq '{' ){
1432 # dirty hack to process command on '{' line
1433 $Commands[$icom--] = $cmd;
1434 } else {
1435 Warn( "extra characters after command ($cmd)", $fl );
1436 $error++;
1437 next;
1438 }
8d063cd8 1439 }
1440
86a59229 1441 # Make Code
1442 #
1443 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1444 $tabref->[3], $arg, $fl ) ){
1445 $error++;
8d063cd8 1446 }
86a59229 1447 }
8d063cd8 1448
86a59229 1449 while( @BlockStack ){
1450 my $bl = pop( @BlockStack );
1451 Warn( "start of unterminated `{'", $bl );
1452 $error++;
1453 }
8d063cd8 1454
86a59229 1455 for my $lab ( keys( %Label ) ){
1456 if( ! exists( $Label{$lab}{defined} ) ){
1457 for my $used ( @{$Label{$lab}{used}} ){
1458 Warn( "undefined label `$lab'", $used );
1459 $error++;
1460 }
8d063cd8 1461 }
86a59229 1462 }
8d063cd8 1463
86a59229 1464 exit( 1 ) if $error;
1465}
8d063cd8 1466
8d063cd8 1467
86a59229 1468##############
1469#### MAIN ####
1470##############
8d063cd8 1471
86a59229 1472sub usage(){
1473 print STDERR "Usage: sed [-an] command [file...]\n";
1474 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1475}
8d063cd8 1476
86a59229 1477###################
1478# Here we go again...
1479#
1480my $expr = 0;
1481while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1482 my $opt = $1;
1483 my $arg = $2;
1484 shift( @ARGV );
1485 if( $opt eq 'e' ){
1486 if( length( $arg ) ){
1487 push( @Commands, split( "\n", $arg ) );
1488 } elsif( @ARGV ){
1489 push( @Commands, shift( @ARGV ) );
1490 } else {
1491 Warn( "option -e requires an argument" );
1492 usage();
1493 exit( 1 );
1494 }
1495 $expr++;
1496 $Defined{$#Commands} = " #$expr";
1497 next;
1498 }
1499 if( $opt eq 'f' ){
1500 my $path;
1501 if( length( $arg ) ){
1502 $path = $arg;
1503 } elsif( @ARGV ){
1504 $path = shift( @ARGV );
1505 } else {
1506 Warn( "option -f requires an argument" );
1507 usage();
1508 exit( 1 );
1509 }
1510 my $fst = $#Commands + 1;
1511 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1512 my $cmd;
1513 while( defined( $cmd = <SCRIPT> ) ){
1514 chomp( $cmd );
1515 push( @Commands, $cmd );
1516 }
1517 close( SCRIPT );
1518 if( $#Commands >= $fst ){
1519 $Defined{$fst} = "$path";
8d063cd8 1520 }
86a59229 1521 next;
1522 }
1523 if( $opt eq '-' && $arg eq '' ){
1524 last;
1525 }
1526 if( $opt eq 'h' || $opt eq '?' ){
1527 usage();
1528 exit( 0 );
1529 }
1530 if( $opt eq 'n' ){
1531 $doAutoPrint = 0;
1532 } elsif( $opt eq 'a' ){
1533 $doOpenWrite = 0;
1534 } else {
1535 Warn( "illegal option `$opt'" );
1536 usage();
1537 exit( 1 );
1538 }
1539 if( length( $arg ) ){
1540 unshift( @ARGV, "-$arg" );
1541 }
1542}
8d063cd8 1543
86a59229 1544# A singleton command may be the 1st argument when there are no options.
1545#
1546if( @Commands == 0 ){
1547 if( @ARGV == 0 ){
1548 Warn( "no script command given" );
1549 usage();
1550 exit( 1 );
1551 }
1552 push( @Commands, split( "\n", shift( @ARGV ) ) );
1553 $Defined{0} = ' #1';
1554}
8d063cd8 1555
86a59229 1556print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1557
86a59229 1558# generate leading code
1559#
1560 $Code = <<'[TheEnd]';
1561
1562sub openARGV(){
1563 unshift( @ARGV, '-' ) unless @ARGV;
1564 my $file = shift( @ARGV );
1565 open( ARG, "<$file" )
1566 || die( "$0: can't open $file for reading ($!)\n" );
1567 $isEOF = 0;
1568}
8d063cd8 1569
86a59229 1570sub getsARGV(;\$){
1571 my $argref = @_ ? shift() : \$_;
1572 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1573 close( ARG );
1574 return 0 unless @ARGV;
1575 my $file = shift( @ARGV );
1576 open( ARG, "<$file" )
1577 || die( "$0: can't open $file for reading ($!)\n" );
1578 $isEOF = 0;
1579 }
1580 1;
1581}
8d063cd8 1582
86a59229 1583sub eofARGV(){
1584 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1585}
8d063cd8 1586
86a59229 1587sub makeHandle($){
1588 my( $path ) = @_;
1589 my $handle;
1590 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1591 $handle = $wFiles{$path} = gensym();
1592 if( $doOpenWrite ){
1593 if( ! open( $handle, ">$path" ) ){
1594 die( "$0: can't open $path for writing: ($!)\n" );
1595 }
9ef589d8 1596 }
86a59229 1597 } else {
1598 $handle = $wFiles{$path};
1599 }
1600 return $handle;
1601}
9ef589d8 1602
86a59229 1603sub _r($){
1604 my $path = shift();
1605 push( @Q, \$path );
1606}
8d063cd8 1607
86a59229 1608sub _l(){
1609 my $h = $_;
1610 my $mcpl = 70;
1611 $h =~ s/\\/\\\\/g;
1612 if( $h =~ /[^[:print:]]/ ){
1613 $h =~ s/\a/\\a/g;
1614 $h =~ s/\f/\\f/g;
1615 $h =~ s/\n/\\n/g;
1616 $h =~ s/\t/\\t/g;
1617 $h =~ s/\r/\\r/g;
1618 $h =~ s/\e/\\e/g;
1619 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1620 }
1621 while( length( $h ) > $mcpl ){
1622 my $l = substr( $h, 0, $mcpl-1 );
1623 $h = substr( $h, $mcpl );
1624 # remove incomplete \-escape from end of line
1625 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1626 $h = $1 . $h;
8d063cd8 1627 }
86a59229 1628 print $l, "\\\n";
8d063cd8 1629 }
86a59229 1630 print "$h\$\n";
8d063cd8 1631}
1632
86a59229 1633sub _w($){
1634 my $path = shift();
1635 my $handle = $wFiles{$path};
1636 if( ! $doOpenWrite &&
1637 ! defined( fileno( $handle ) ) ){
1638 open( $handle, ">$path" )
1639 || die( "$0: $path: cannot open ($!)\n" );
1640 }
1641 print $handle $_, "\n";
1642}
a687059c 1643
86a59229 1644# condition register test/reset
1645#
1646sub _t(){
1647 my $res = $CondReg;
1648 $CondReg = 0;
1649 $res;
1650}
0a12ae7d 1651
86a59229 1652# printQ
1653#
1654sub printQ(){
1655 for my $q ( @Q ){
1656 if( ref( $q ) ){
1657 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1658 open( $wFiles{$$q}, ">>$$q" );
1659 }
1660 if( open( RF, "<$$q" ) ){
1661 my $line;
1662 while( defined( $line = <RF> ) ){
1663 print $line;
1664 }
1665 close( RF );
1666 }
1667 } else {
1668 print $q;
a687059c 1669 }
86a59229 1670 }
1671 undef( @Q );
1672}
1673
1674sub Run(){
1675 my( $h, $icnt, $s, $n );
1676 # hack (not unbreakable :-/) to avoid // matching an empty string
1677 my $z = "\000"; $z =~ /$z/;
1678 # Initialize.
1679 openARGV();
1680 $Hold = '';
1681 $CondReg = 0;
1682 $doPrint = $doAutoPrint;
1683CYCLE:
1684 while( getsARGV() ){
1685 chomp();
1686 $CondReg = 0; # cleared on t
1687BOS:;
1688[TheEnd]
1689
1690 # parse - avoid opening files when doing s2p
1691 #
1692 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1693 if $doGenerate;
1694 Parse();
1695 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1696 if $doGenerate;
1697
1698 # append trailing code
1699 #
1700 $Code .= <<'[TheEnd]';
1701EOS: if( $doPrint ){
1702 print $_, "\n";
1703 } else {
1704 $doPrint = $doAutoPrint;
a687059c 1705 }
86a59229 1706 printQ() if @Q;
a687059c 1707 }
86a59229 1708
1709 exit( 0 );
a687059c 1710}
86a59229 1711[TheEnd]
1712
1713# magic "#n" - same as -n option
1714#
1715$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1716
86a59229 1717# eval code - check for errors
1718#
1719print "Code:\n$Code" if $useDEBUG;
1720eval $Code;
1721if( $@ ){
1722 print "Code:\n$Code";
1723 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8 1724}
1725
86a59229 1726if( $doGenerate ){
1727
1728 # write full Perl program
1729 #
1730
1731 # bang line, declarations
1732 print <<TheEnd;
1733#!$perlpath -w
1734eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1735 if 0;
1736\$0 =~ s/^.*?(\\w+)\$/\$1/;
1737
1738use strict;
1739use Symbol;
1740use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1741 \$doAutoPrint \$doOpenWrite \$doPrint };
1742\$doAutoPrint = $doAutoPrint;
1743\$doOpenWrite = $doOpenWrite;
1744TheEnd
1745
1746 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1747 if( $wf ne "''" ){
1748 print <<TheEnd;
1749sub makeHandle(\$);
1750for my \$p ( $wf ){
1751 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1752}
86a59229 1753TheEnd
1754 }
9ef589d8 1755
86a59229 1756 print $Code;
1757 print "&Run()\n";
1758 exit( 0 );
1aa91729 1759
86a59229 1760} else {
1761
1762 # execute: make handles (and optionally open) all w files; run!
1aa91729 1763
86a59229 1764 for my $p ( keys( %wFiles ) ){
1765 exit( 1 ) unless makeHandle( $p );
1766 }
1767 &Run();
1aa91729 1768}
86a59229 1769
1770
1771=head1 ENVIRONMENT
1772
1773The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1774See L<"Additional Atoms">.
1775
1776=head1 DIAGNOSTICS
1777
1778=over 4
1779
1780=item ambiguos translation for character `%s' in `y' command
1781
1782The indicated character appears twice, with different translations.
1783
1784=item `[' cannot be last in pattern
1785
1786A `[' in a BRE indicates the beginning of a I<bracket expression>.
1787
1788=item `\' cannot be last in pattern
1789
1790A `\' in a BRE is used to make the subsequent character literal.
1791
1792=item `\' cannot be last in substitution
1793
1794A `\' in a subsitution string is used to make the subsequent character literal.
1795
1796=item conflicting flags `%s'
1797
1798In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1799multiple n-th occurrence flags are specified. Note that only the digits
1800`1' through `9' are permitted.
1801
1802=item duplicate label %s (first defined at %s)
1803
1804=item excess address(es)
1805
1806The command has more than the permitted number of addresses.
1807
1808=item extra characters after command (%s)
1809
1810=item illegal option `%s'
1811
1812=item improper delimiter in s command
1813
1814The BRE and substitution may not be delimited with `\' or newline.
1815
1816=item invalid address after `,'
1817
1818=item invalid backreference (%s)
1819
1820The specified backreference number exceeds the number of backreferences
1821in the BRE.
1822
1823=item invalid repeat clause `\{%s\}'
1824
1825The repeat clause does not contain a valid integer value, or pair of
1826values.
1827
1828=item malformed regex, 1st address
1829
1830=item malformed regex, 2nd address
1831
1832=item malformed regular expression
1833
1834=item malformed substitution expression
1835
1836=item malformed `y' command argument
1837
1838The first or second string of a B<y> command is syntactically incorrect.
1839
1840=item maximum less than minimum in `\{%s\}'
1841
1842=item no script command given
1843
1844There must be at least one B<-e> or one B<-f> option specifying a
1845script or script file.
1846
1847=item `\' not valid as delimiter in `y' command
1848
1849=item option -e requires an argument
1850
1851=item option -f requires an argument
1852
1853=item `s' command requires argument
1854
1855=item start of unterminated `{'
1856
1857=item string lengths in `y' command differ
1858
1859The translation table strings in a B<y> commanf must have equal lengths.
1860
1861=item undefined label `%s'
1862
1863=item unexpected `}'
1864
1865A B<}> command without a preceding B<{> command was encountered.
1866
1867=item unexpected end of script
1868
1869The end of the script was reached although a text line after a
1870B<a>, B<c> or B<i> command indicated another line.
1871
1872=item unknown command `%s'
1873
1874=item unterminated `['
1875
1876A BRE contains an unterminated bracket expression.
1877
1878=item unterminated `\('
1879
1880A BRE contains an unterminated backreference.
1881
1882=item `\{' without closing `\}'
1883
1884A BRE contains an unterminated bounds specification.
1885
1886=item `\)' without preceding `\('
1887
1888=item `y' command requires argument
1889
1890=back
1891
1892=head1 EXAMPLE
1893
1894The basic material for the preceding section was generated by running
1895the sed script
1896
1897 #no autoprint
1898 s/^.*Warn( *"\([^"]*\)".*$/\1/
1899 t process
1900 b
1901 :process
1902 s/$!/%s/g
1903 s/$[_[:alnum:]]\{1,\}/%s/g
1904 s/\\\\/\\/g
1905 s/^/=item /
1906 p
1907
1908on the program's own text, and piping the output into C<sort -u>.
1909
1910
1911=head1 SED SCRIPT TRANSLATION
1912
1913If this program is invoked with the name F<s2p> it will act as a
1914sed-to-Perl translator. After option processing (all other
1915arguments are ignored), a Perl program is printed on standard
1916output, which will process the input stream (as read from all
1917arguments) in the way defined by the sed script and the option setting
1918used for the translation.
1919
1920=head1 SEE ALSO
1921
1922perl(1), re_format(7)
1923
1924=head1 BUGS
1925
1926The B<l> command will show escape characters (ESC) as `C<\e>', but
1927a vertical tab (VT) in octal.
1928
1929Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
1930
1931The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
1932is "the last pattern used, at run time". This deviates from the Perl
1933interpretation, which will re-use the "last last successfully executed
1934regular expression". Since keeping track of pattern usage would create
1935terribly cluttered code, and differences would only appear in obscure
1936context (where other B<sed> implementations appear to deviate, too),
1937the Perl semantics was adopted. Note that common usage of this feature,
1938such as in C</abc/s//xyz/>, will work as expected.
1939
1940Collating elements (of bracket expressions in BREs) are not implemented.
1941
1942=head1 STANDARDS
1943
1944This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
1945definition of B<sed>, and is compatible with the I<OpenBSD>
1946implementation, except where otherwise noted (see L<"BUGS">).
1947
1948=head1 AUTHOR
1949
1950This Perl implementation of I<sed> was written by Wolfgang Laun,
1951I<Wolfgang.Laun@alcatel.at>.
1952
1953=head1 COPYRIGHT and LICENSE
1954
1955This program is free and open software. You may use, modify,
1956distribute, and sell this program (and any modified variants) in any
1957way you wish, provided you do not restrict others from doing the same.
1958
1959=cut
1960
a687059c 1961!NO!SUBS!
4633a7c4 1962
1963close OUT or die "Can't close $file: $!";
1964chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
d38b618a 1965unlink 'psed';
1966print "Linking s2p to psed.\n";
1967if (defined $Config{d_link}) {
1968 link 's2p', 'psed';
1969} else {
1970 unshift @INC, '../lib';
1971 require File::Copy;
1972 File::Copy::syscopy('s2p', 'psed');
1973}
4633a7c4 1974exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 1975chdir $origdir;