Provide infrastructure for PERL_ASYNC_CHECK() style safe signals.
[p5sagit/p5-mst-13.2.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4 2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
4633a7c4 6
7# List explicitly here the variables you want Configure to
8# generate. Metaconfig only looks for shell variables, so you
9# have to mention them as if they were shell variables, not
10# %Config entries. Thus you write
11# $startperl
12# to ensure Configure will look for $Config{startperl}.
13
14# This forces PL files to create target in same directory as PL file.
15# This is so that make depend always knows where to find PL derivatives.
8a5546a1 16$origdir = cwd;
44a8e56a 17chdir dirname($0);
18$file = basename($0, '.PL');
774d564b 19$file .= '.com' if $^O eq 'VMS';
4633a7c4 20
21open OUT,">$file" or die "Can't create $file: $!";
22
23print "Extracting $file (with variable substitutions)\n";
24
25# In this section, perl variables will be expanded during extraction.
26# You can use $Config{...} to use Configure variables.
27
28print OUT <<"!GROK!THIS!";
5f05dabc 29$Config{startperl}
30 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
31 if \$running_under_some_shell;
86a59229 32my \$startperl = "$Config{startperl}";
33my \$perlpath = "$Config{perlpath}";
a687059c 34!GROK!THIS!
35
4633a7c4 36# In the following, perl variables are not expanded during extraction.
37
38print OUT <<'!NO!SUBS!';
a687059c 39
86a59229 40$0 =~ s/^.*?(\w+)$/$1/;
41
42# (p)sed - a stream editor
43# History: Aug 12 2000: Original version.
44
45use strict;
46use integer;
47use Symbol;
8d063cd8 48
d83e3bda 49=head1 NAME
50
86a59229 51sed - a stream editor
d83e3bda 52
53=head1 SYNOPSIS
54
86a59229 55 sed [-an] script [file ...]
56 sed [-an] [-e script] [-f script-file] [file ...]
d83e3bda 57
58=head1 DESCRIPTION
59
86a59229 60A stream editor reads the input stream consisting of the specified files
61(or standard input, if none are given), processes is line by line by
62applying a script consisting of edit commands, and writes resulting lines
63to standard output. The filename `C<->' may be used to read standard input.
64
65The edit script is composed from arguments of B<-e> options and
66script-files, in the given order. A single script argument may be specified
67as the first parameter.
68
69If this program is invoked with the name F<s2p>, it will act as a
70sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 71
86a59229 72B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 73
86a59229 74=head1 OPTIONS
d83e3bda 75
86a59229 76=over 4
d83e3bda 77
86a59229 78=item B<-a>
d83e3bda 79
86a59229 80A file specified as argument to the B<w> edit command is by default
81opened before input processing starts. Using B<-a>, opening of such
82files is delayed until the first line is actually written to the file.
83
84=item B<-e> I<script>
85
86The editing commands defined by I<script> are appended to the script.
87Multiple commands must be separated by newlines.
88
89=item B<-f> I<script-file>
90
91Editing commands from the specified I<script-file> are read and appended
92to the script.
d83e3bda 93
94=item B<-n>
95
86a59229 96By default, a line is written to standard output after the editing script
97has been applied to it. The B<-n> option suppresses automatic printing.
98
99=back
100
101=head1 COMMANDS
102
103B<sed> command syntax is defined as
104
105Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
106
107with whitespace being permitted before or after addresses, and between
108the function character and the argument. The I<address>es and the
109address inverter (C<!>) are used to restrict the application of a
110command to the selected line(s) of input.
111
112Each command must be on a line of its own, except where noted in
113the synopses below.
114
115The edit cycle performed on each input line consist of reading the line
116(without its trailing newline character) into the I<pattern space>,
117applying the applicable commands of the edit script, writing the final
118contents of the pattern space and a newline to the standard output.
119A I<hold space> is provided for saving the contents of the
120pattern space for later use.
121
122=head2 Addresses
123
124A sed address is either a line number or a pattern, which may be combined
125arbitrarily to construct ranges. Lines are numbered across all input files.
126
127Any address may be followed by an exclamation mark (`C<!>'), selecting
128all lines not matching that address.
d83e3bda 129
86a59229 130=over 4
d83e3bda 131
86a59229 132=item I<number>
133
134The line with the given number is selected.
135
136=item B<$>
137
138A dollar sign (C<$>) is the line number of the last line of the input stream.
139
140=item B</>I<regular expression>B</>
141
142A pattern address is a basic regular expression (see
143L<"Basic Regular Expressions">), between the delimiting character C</>.
144Any other character except C<\> or newline may be used to delimit a
145pattern address when the initial delimiter is prefixed with a
146backslash (`C<\>').
d83e3bda 147
148=back
149
86a59229 150If no address is given, the command selects every line.
d83e3bda 151
86a59229 152If one address is given, it selects the line (or lines) matching the
153address.
d83e3bda 154
86a59229 155Two addresses select a range that begins whenever the first address
156matches, and ends (including that line) when the second address matches.
157If the first (second) address is a matching pattern, the second
158address is not applied to the very same line to determine the end of
159the range. Likewise, if the second address is a matching pattern, the
160first address is not applied to the very same line to determine the
161begin of another range. If both addresses are line numbers,
162and the second line number is less than the first line number, then
163only the first line is selected.
d83e3bda 164
d83e3bda 165
86a59229 166=head2 Functions
d83e3bda 167
86a59229 168The maximum permitted number of addresses is indicated with each
169function synopsis below.
d83e3bda 170
86a59229 171The argument I<text> consists of one or more lines following the command.
172Embedded newlines in I<text> must be preceded with a backslash. Other
173backslashes in I<text> are deleted and the following character is taken
174literally.
d83e3bda 175
86a59229 176=over 4
d83e3bda 177
86a59229 178=cut
179
180my %ComTab;
181#--------------------------------------------------------------------------
182$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
183
184=item [1addr]B<a\> I<text>
185
186Write I<text> (which must start on the line following the command)
187to standard output immediately before reading the next line
188of input, either by executing the B<N> function or by beginning a new cycle.
189
190=cut
191
192#--------------------------------------------------------------------------
193$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
194
195=item [2addr]B<b> [I<label>]
196
197Branch to the B<:> function with the specified I<label>. If no label
198is given, branch to the end of the script.
199
200=cut
201
202#--------------------------------------------------------------------------
203$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
204{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
205-X-
206### continue OK => next CYCLE;
207
208=item [2addr]B<c\> I<text>
209
210The line, or range of lines, selected by the address is deleted.
211The I<text> (which must start on the line following the command)
212is written to standard output. With an address range, this occurs at
213the end of the range.
214
215=cut
216
217#--------------------------------------------------------------------------
218$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
219{ $doPrint = 0;
220 goto EOS;
221}
222-X-
223### continue OK => next CYCLE;
224
225=item [2addr]B<d>
226
227Deletes the pattern space and starts the next cycle.
228
229=cut
230
231#--------------------------------------------------------------------------
232$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
233{ s/^.*\n?//;
234 if(length($_)){ goto BOS } else { goto EOS }
235}
236-X-
237### continue OK => next CYCLE;
238
239=item [2addr]B<D>
240
241Deletes the pattern space through the first embedded newline or to the end.
242If the pattern space becomes empty, a new cycle is started, otherwise
243execution of the script is restarted.
244
245=cut
246
247#--------------------------------------------------------------------------
248$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
249
250=item [2addr]B<g>
251
252Replace the contents of the pattern space with the hold space.
253
254=cut
255
256#--------------------------------------------------------------------------
257$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
258
259=item [2addr]B<G>
260
261Append a newline and the contents of the hold space to the pattern space.
262
263=cut
264
265#--------------------------------------------------------------------------
266$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
267
268=item [2addr]B<h>
269
270Replace the contents of the hold space with the pattern space.
271
272=cut
273
274#--------------------------------------------------------------------------
275$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
276
277=item [2addr]B<H>
278
279Append a newline and the contents of the pattern space to the hold space.
280
281=cut
282
283#--------------------------------------------------------------------------
284$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
285
286=item [1addr]B<i\> I<text>
287
288Write the I<text> (which must start on the line following the command)
289to standard output.
290
291=cut
292
293#--------------------------------------------------------------------------
294$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
295
296=item [2addr]B<l>
297
298Print the contents of the pattern space: non-printable characters are
299shown in C-style escaped form; long lines are split and have a trailing
300`C<\>' at the point of the split; the true end of a line is marked with
301a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
302BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
303octal number for all other non-printable characters.
304
305=cut
306
307#--------------------------------------------------------------------------
308$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
309{ print $_, "\n" if $doPrint;
310 printQ if @Q;
311 $CondReg = 0;
312 last CYCLE unless getsARGV();
313 chomp();
314}
315-X-
316
317=item [2addr]B<n>
318
319If automatic printing is enabled, write the pattern space to the standard
320output. Replace the pattern space with the next line of input. If
321there is no more input, processing is terminated.
322
323=cut
324
325#--------------------------------------------------------------------------
326$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
327{ printQ if @Q;
328 $CondReg = 0;
329 last CYCLE unless getsARGV( $h );
330 chomp( $h );
331 $_ .= "\n$h";
332}
333-X-
334
335=item [2addr]B<N>
336
337Append a newline and the next line of input to the pattern space. If
338there is no more input, processing is terminated.
339
340=cut
341
342#--------------------------------------------------------------------------
343$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
344
345=item [2addr]B<p>
346
347Print the pattern space to the standard output. (Use the B<-n> option
348to suppress automatic printing at the end of a cycle if you want to
349avoid double printing of lines.)
350
351=cut
352
353#--------------------------------------------------------------------------
354$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
355{ if( /^(.*)/ ){ print $1, "\n"; } }
356-X-
357
358=item [2addr]B<P>
359
360Prints the pattern space through the first embedded newline or to the end.
361
362=cut
363
364#--------------------------------------------------------------------------
365$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
366{ print $_, "\n" if $doPrint;
367 last CYCLE;
368}
369-X-
370
371=item [1addr]B<q>
372
373Branch to the end of the script and quit without starting a new cycle.
374
375=cut
376
377#--------------------------------------------------------------------------
378$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
379### FIXME: lazy reading - big files???
380
381=item [1addr]B<r> I<file>
382
383Copy the contents of the I<file> to standard output immediately before
384the next attempt to read a line of input. Any error encountered while
385reading I<file> is silently ignored.
386
387=cut
388
389#--------------------------------------------------------------------------
390$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
391
392=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
393
394Substitute the I<replacement> string for the first substring in
395the pattern space that matches the I<regular expression>.
396Any character other than backslash or newline can be used instead of a
397slash to delimit the regular expression and the replacement.
398To use the delimiter as a literal character within the regular expression
399and the replacement, precede the character by a backslash (`C<\>').
400
401Literal newlines may be embedded in the replacement string by
402preceding a newline with a backslash.
403
404Within the replacement, an ampersand (`C<&>') is replaced by the string
405matching the regular expression. The strings `C<\1>' through `C<\9>' are
406replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
407To get a literal `C<&>' or `C<\>' in the replacement text, precede it
408by a backslash.
409
410The following I<flags> modify the behaviour of the B<s> command:
411
412=over 8
413
414=item B<g>
415
416The replacement is performed for all matching, non-overlapping substrings
417of the pattern space.
418
419=item B<1>..B<9>
420
421Replace only the n-th matching substring of the pattern space.
422
423=item B<p>
424
425If the substitution was made, print the new value of the pattern space.
426
427=item B<w> I<file>
428
429If the substitution was made, write the new value of the pattern space
430to the specified file.
431
432=back
433
434=cut
435
436#--------------------------------------------------------------------------
437$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
438
439=item [2addr]B<t> [I<label>]
440
441Branch to the B<:> function with the specified I<label> if any B<s>
442substitutions have been made since the most recent reading of an input line
443or execution of a B<t> function. If no label is given, branch to the end of
444the script.
445
446
447=cut
448
449#--------------------------------------------------------------------------
450$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
451
452=item [2addr]B<w> I<file>
453
454The contents of the pattern space are written to the I<file>.
455
456=cut
457
458#--------------------------------------------------------------------------
459$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
460
461=item [2addr]B<x>
462
463Swap the contents of the pattern space and the hold space.
464
465=cut
466
467#--------------------------------------------------------------------------
468$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
469=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
470
471In the pattern space, replace all characters occuring in I<string1> by the
472character at the corresponding position in I<string2>. It is possible
473to use any character (other than a backslash or newline) instead of a
474slash to delimit the strings. Within I<string1> and I<string2>, a
475backslash followed by any character other than a newline is that literal
476character, and a backslash followed by an `n' is replaced by a newline
477character.
478
479=cut
d83e3bda 480
86a59229 481#--------------------------------------------------------------------------
482$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
483
484=item [1addr]B<=>
485
486Prints the current line number on the standard output.
487
488=cut
489
490#--------------------------------------------------------------------------
491$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
492
493=item [0addr]B<:> [I<label>]
494
495The command specifies the position of the I<label>. It has no other effect.
496
497=cut
498
499#--------------------------------------------------------------------------
500$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
501$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
502# ';' to avoid warning on empty {}-block
503
504=item [2addr]B<{> [I<command>]
505
506=item [0addr]B<}>
507
508These two commands begin and end a command list. The first command may
509be given on the same line as the opening B<{> command. The commands
510within the list are jointly selected by the address(es) given on the
511B<{> command (but may still have individual addresses).
512
513=cut
514
515#--------------------------------------------------------------------------
516$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
517
518=item [0addr]B<#> [I<comment>]
519
520The entire line is ignored (treated as a comment). If, however, the first
521two characters in the script are `C<#n>', automatic printing of output is
522suppressed, as if the B<-n> option were given on the command line.
523
524=back
525
526=cut
527
528use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
529
530my $useDEBUG = exists( $ENV{PSEDDEBUG} );
531my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
532$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
533
534my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
535my $doOpenWrite = 1; # open w command output files at start (-a => 0)
536my $svOpenWrite = 0; # save $doOpenWrite
537my $doGenerate = $0 eq 's2p';
538
539# Collected and compiled script
540#
541my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code );
542
543##################
544# Compile Time
545#
546# Labels
547#
548# Error handling
549#
550sub Warn($;$){
551 my( $msg, $loc ) = @_;
552 $loc ||= '';
553 $loc .= ': ' if length( $loc );
554 warn( "$0: $loc$msg\n" );
555}
556
557$labNum = 0;
558sub newLabel(){
559 return 'L_'.++$labNum;
560}
561
562# safeHere: create safe here delimiter and modify opcode and argument
563#
564sub safeHere($$){
565 my( $codref, $argref ) = @_;
566 my $eod = 'EOD000';
567 while( $$argref =~ /^$eod$/m ){
568 $eod++;
569 }
570 $$codref =~ s/TheEnd/$eod/e;
571 $$argref .= "$eod\n";
572}
573
574# Emit: create address logic and emit command
575#
576sub Emit($$$$$$){
577 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
578 my $cond = '';
579 if( defined( $addr1 ) ){
580 if( defined( $addr2 ) ){
581 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
582 } else {
583 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
584 }
585 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
586 }
587
588 if( $opcode eq '' ){
589 $Code .= "$cond$arg\n";
590
591 } elsif( $opcode =~ s/-X-/$arg/e ){
592 $Code .= "$cond$opcode\n";
593
594 } elsif( $opcode =~ /TheEnd/ ){
595 safeHere( \$opcode, \$arg );
596 $Code .= "$cond$opcode$arg";
597
598 } else {
599 $Code .= "$cond$opcode\n";
600 }
601 0;
602}
603
604# Write (w command, w flag): store pathname
605#
606sub Write($$$$$$){
607 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
608 $wFiles{$path} = '';
609 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
610}
611
612
613# Label (: command): label definition
614#
615sub Label($$$$$$){
616 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
617 my $rc = 0;
618 $lab =~ s/\s+//;
619 if( length( $lab ) ){
620 my $h;
621 if( ! exists( $Label{$lab} ) ){
622 $h = $Label{$lab}{name} = newLabel();
623 } else {
624 $h = $Label{$lab}{name};
625 if( exists( $Label{$lab}{defined} ) ){
626 my $dl = $Label{$lab}{defined};
627 Warn( "duplicate label $lab (first defined at $dl)", $fl );
628 $rc = 1;
629 }
630 }
631 $Label{$lab}{defined} = $fl;
632 $Code .= "$h:;\n";
633 }
634 $rc;
635}
636
637# BeginBlock ({ command): push block start
638#
639sub BeginBlock($$$$$$){
640 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
641 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
642 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
643}
644
645# EndBlock (} command): check proper nesting
646#
647sub EndBlock($$$$$$){
648 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
649 my $rc;
650 my $jcom = pop( @BlockStack );
651 if( defined( $jcom ) ){
652 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
653 } else {
654 Warn( "unexpected `}'", $fl );
655 $rc = 1;
656 }
657 $rc;
658}
659
660# Branch (t, b commands): check or create label, substitute default
661#
662sub Branch($$$$$$){
663 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
664 $lab =~ s/\s+//; # no spaces at end
665 my $h;
666 if( length( $lab ) ){
667 if( ! exists( $Label{$lab} ) ){
668 $h = $Label{$lab}{name} = newLabel();
669 } else {
670 $h = $Label{$lab}{name};
671 }
672 push( @{$Label{$lab}{used}}, $fl );
673 } else {
674 $h = 'EOS';
675 }
676 $opcode =~ s/XXX/$h/e;
677 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
678}
679
680# Change (c command): is special due to range end watching
681#
682sub Change($$$$$$){
683 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
684 my $kwd = $negated ? 'unless' : 'if';
685 if( defined( $addr2 ) ){
686 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
687 if( ! $negated ){
688 $addr1 = '$icnt = ('.$addr1.')';
689 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
690 }
691 } else {
692 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
693 }
694 safeHere( \$opcode, \$arg );
695 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
696 0;
697}
698
699
700# Comment (# command): A no-op. Who would've thought that!
701#
702sub Comment($$$$$$){
703 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
704### $Code .= "# $arg\n";
705 0;
706}
707
708
709sub stripRegex($$){
710 my( $del, $sref ) = @_;
711 my $regex = $del;
712 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
713 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
714 my $sl = $2;
715 $regex .= $1.$sl.$del;
716 if( length( $sl ) % 2 == 0 ){
717 return $regex;
718 }
719 $regex .= $3;
720 }
721 undef();
722}
723
724# stripTrans: take a <del> terminated string from y command
725# honoring and cleaning up of \-escaped <del>'s
726#
727sub stripTrans($$){
728 my( $del, $sref ) = @_;
729 my $t = '';
730 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
731 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
732 my $sl = $2;
733 $t .= $1;
734 if( length( $sl ) % 2 == 0 ){
735 $t .= $sl;
736 $t =~ s/\\\\/\\/g;
737 return $t;
738 }
739 chop( $sl );
740 $t .= $sl.$del.$3;
741 }
742 undef();
743}
744
745# makey - construct Perl y/// from sed y///
746#
747sub makey($$$){
748 my( $fr, $to, $fl ) = @_;
749 my $error = 0;
750
751 # Ensure that any '-' is up front.
752 # Diagnose duplicate contradicting mappings
753 my %tr;
754 for( my $i = 0; $i < length($fr); $i++ ){
755 my $fc = substr($fr,$i,1);
756 my $tc = substr($to,$i,1);
757 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
758 Warn( "ambiguos translation for character `$fc' in `y' command",
759 $fl );
760 $error++;
761 }
762 $tr{$fc} = $tc;
763 }
764 $fr = $to = '';
765 if( exists( $tr{'-'} ) ){
766 ( $fr, $to ) = ( '-', $tr{'-'} );
767 delete( $tr{'-'} );
768 } else {
769 $fr = $to = '';
770 }
771 # might just as well sort it...
772 for my $fc ( sort keys( %tr ) ){
773 $fr .= $fc;
774 $to .= $tr{$fc};
775 }
776 # make embedded delimiters and newlines safe
777 $fr =~ s/([{}])/\$1/g;
778 $to =~ s/([{}])/\$1/g;
779 $fr =~ s/\n/\\n/g;
780 $to =~ s/\n/\\n/g;
781 return $error ? undef() : "{ y{$fr}{$to}; }";
782}
783
784######
785# makes - construct Perl s/// from sed s///
786#
787sub makes($$$$$$$){
788 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
789
790 # make embedded newlines safe
791 $regex =~ s/\n/\\n/g;
792 $subst =~ s/\n/\\n/g;
793
794 my $code;
795 # n-th occurrence
796 #
797 if( length( $nmatch ) ){
798 $code = <<TheEnd;
799{ \$n = $nmatch;
800 while( --\$n && ( \$s = m ${regex}g ) ){}
801 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
802 \$CondReg ||= \$s;
803TheEnd
804 } else {
805 $code = <<TheEnd;
806{ \$s = s ${regex}${subst}s${global};
807 \$CondReg ||= \$s;
808TheEnd
809 }
810 if( $print ){
811 $code .= ' print $_, "\n" if $s;'."\n";
812 }
813 if( defined( $path ) ){
814 $wFiles{$path} = '';
815 $code .= " _w( '$path' ) if \$s;\n";
816 }
817 $code .= "}";
818}
819
820=head1 BASIC REGULAR EXPRESSIONS
821
822A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
823of I<atoms>, for matching parts of a string, and I<bounds>, specifying
824repetitions of a preceding atom.
825
826=head2 Atoms
827
828The possible atoms of a BRE are: B<.>, matching any single character;
829B<^> and B<$>, matching the null string at the beginning or end
830of a string, respectively; a I<bracket expressions>, enclosed
831in B<[> and B<]> (see below); and any single character with no
832other significance (matching that character). A B<\> before one
833of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
834after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
835becomes an atom and establishes the target for a I<backreference>,
836consisting of the substring that actually matches the enclosed atoms.
837Finally, B<\> followed by one of the digits B<0> through B<9> is a
838backreference.
839
840A B<^> that is not first, or a B<$> that is not last does not have
841a special significance and need not be preceded by a backslash to
842become literal. The same is true for a B<]>, that does not terminate
843a bracket expression.
844
845An unescaped backslash cannot be last in a BRE.
846
847=head2 Bounds
848
849The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
850atom; B<\{>I<count>B<\}>, specifying that many repetitions;
851B<\{>I<minimum>B<,\}>, giving a lower limit; and
852B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
853bound.
854
855A bound appearing as the first item in a BRE is taken literally.
856
857=head2 Bracket Expressions
858
859A I<bracket expression> is a list of characters, character ranges
860and character classes enclosed in B<[> and B<]> and matches any
861single character from the represented set of characters.
862
863A character range is written as two characters separated by B<-> and
864represents all characters (according to the character collating sequence)
865that are not less than the first and not greater than the second.
866(Ranges are very collating-sequence-dependent, and portable programs
867should avoid relying on them.)
868
869A character class is one of the class names
870
871 alnum digit punct
872 alpha graph space
873 blank lower upper
874 cntrl print xdigit
875
876enclosed in B<[:> and B<:]> and represents the set of characters
877as defined in ctype(3).
878
879If the first character after B<[> is B<^>, the sense of matching is
880inverted.
881
882To include a literal `C<^>', place it anywhere else but first. To
883include a literal 'C<]>' place it first or immediately after an
884initial B<^>. To include a literal `C<->' make it the first (or
885second after B<^>) or last character, or the second endpoint of
886a range.
887
888The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
889match the null string at the beginning and end of a word respectively.
890(Note that neither is identical to Perl's `\b' atom.)
891
892=head2 Additional Atoms
893
894Since some sed implementations provide additional regular expression
895atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
896the following backslash escapes:
897
898=over 4
899
900=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
901
902=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
903
904=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
905
906=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
907
908=item B<\y> Match the empty string at a word boundary.
909
910=item B<\B> Match the empty string between any two either word or non-word characters.
911
912=back
913
914To enable this feature, the environment variable PSEDEXTBRE must be set
915to a string containing the requested characters, e.g.:
916C<PSEDEXTBRE='E<lt>E<gt>wW'>.
917
918=cut
919
920#####
921# bre2p - convert BRE to Perl RE
922#
923sub peek(\$$){
924 my( $pref, $ic ) = @_;
925 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
926}
927
928sub bre2p($$$){
929 my( $del, $pat, $fl ) = @_;
930 my $led = $del;
931 $led =~ tr/{([</})]>/;
932 $led = '' if $led eq $del;
933
934 $pat = substr( $pat, 1, length($pat) - 2 );
935 my $res = '';
936 my $bracklev = 0;
937 my $backref = 0;
938 my $parlev = 0;
939 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
940 my $c = substr( $pat, $ic, 1 );
941 if( $c eq '\\' ){
942 ### backslash escapes
943 my $nc = peek($pat,$ic);
944 if( $nc eq '' ){
945 Warn( "`\\' cannot be last in pattern", $fl );
946 return undef();
947 }
948 $ic++;
949 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
950 $res .= "\\$del";
951
952 } elsif( $nc =~ /([[.*\\n])/ ){
953 ## check for \-escaped magics and \n:
954 ## \[ \. \* \\ \n stay as they are
955 $res .= '\\'.$nc;
956
957 } elsif( $nc eq '(' ){ ## \( => (
958 $parlev++;
959 $res .= '(';
960
961 } elsif( $nc eq ')' ){ ## \) => )
962 $parlev--;
963 $backref++;
964 if( $parlev < 0 ){
965 Warn( "unmatched `\\)'", $fl );
966 return undef();
967 }
968 $res .= ')';
969
970 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
971 my $endpos = index( $pat, '\\}', $ic );
972 if( $endpos < 0 ){
973 Warn( "unmatched `\\{'", $fl );
974 return undef();
975 }
976 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
977 $ic = $endpos + 1;
978
979 if( $res =~ /^\^?$/ ){
980 $res .= "\\{$rep\}";
981 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
982 my $min = $1;
983 my $com = $2 || '';
984 my $max = $3;
985 if( length( $max ) ){
986 if( $max < $min ){
987 Warn( "maximum less than minimum in `\\{$rep\\}'",
988 $fl );
989 return undef();
990 }
991 } else {
992 $max = '';
993 }
994 # simplify some
995 if( $min == 0 && $max eq '1' ){
996 $res .= '?';
997 } elsif( $min == 1 && "$com$max" eq ',' ){
998 $res .= '+';
999 } elsif( $min == 0 && "$com$max" eq ',' ){
1000 $res .= '*';
1001 } else {
1002 $res .= "{$min$com$max}";
1003 }
1004 } else {
1005 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1006 return undef();
1007 }
1008
1009 } elsif( $nc =~ /^[1-9]$/ ){
1010 ## \1 .. \9 => \1 .. \9, but check for a following digit
1011 if( $nc > $backref ){
1012 Warn( "invalid backreference ($nc)", $fl );
1013 return undef();
1014 }
1015 $res .= "\\$nc";
1016 if( peek($pat,$ic) =~ /[0-9]/ ){
1017 $res .= '(?:)';
1018 }
1019
1020 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1021 ## extensions - at most <>wWyB - not in POSIX
1022 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1023 $res .= '\\b(?<=\\W)';
1024 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1025 $res .= '\\b(?=\\W)';
1026 } elsif( $nc eq 'y' ){ ## \y => \b
1027 $res .= '\\b';
1028 } else { ## \B, \w, \W remain the same
1029 $res .= "\\$nc";
1030 }
1031 } elsif( $nc eq $led ){
1032 ## \<closing bracketing-delimiter> - keep '\'
1033 $res .= "\\$nc";
1034
1035 } else { ## \ <char> => <char> ("as if `\' were not present")
1036 $res .= $nc;
1037 }
1038
1039 } elsif( $c eq '.' ){ ## . => .
1040 $res .= $c;
1041
1042 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1043 if( $res =~ /^\^?$/ ){
1044 $res .= '\\*';
1045 } elsif( substr( $res, -1, 1 ) ne '*' ){
1046 $res .= $c;
1047 }
1048
1049 } elsif( $c eq '[' ){
1050 ## parse []: [^...] [^]...] [-...]
1051 my $add = '[';
1052 if( peek($pat,$ic) eq '^' ){
1053 $ic++;
1054 $add .= '^';
1055 }
1056 my $nc = peek($pat,$ic);
1057 if( $nc eq ']' || $nc eq '-' ){
1058 $add .= $nc;
1059 $ic++;
1060 }
1061 # check that [ is not trailing
1062 if( $ic >= length( $pat ) - 1 ){
1063 Warn( "unmatched `['", $fl );
1064 return undef();
1065 }
1066 # look for [:...:] and x-y
1067 my $rstr = substr( $pat, $ic+1 );
1068 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1069 my $cnt = $1;
1070 $ic += length( $cnt );
1071 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1072 # try some simplifications
1073 my $red = $cnt;
1074 if( $red =~ s/0-9// ){
1075 $cnt = $red.'\d';
1076 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1077 $cnt = $red.'\w';
1078 }
1079 }
1080 $add .= $cnt;
1081
1082 # POSIX 1003.2 has this (optional) for begin/end word
1083 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1084 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1085
1086 }
1087
1088 ## may have a trailing `-' before `]'
1089 if( $ic < length($pat) - 1 &&
1090 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1091 $ic += length( $1 );
1092 $add .= $1;
1093 # another simplification
1094 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1095 $res .= $add;
1096 } else {
1097 Warn( "unmatched `['", $fl );
1098 return undef();
1099 }
3cb6de81 1100
86a59229 1101 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1102 $res .= "\\$c";
d83e3bda 1103
86a59229 1104 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1105 $res .= ']';
d83e3bda 1106
86a59229 1107 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1108 $res .= "\\$c";
d83e3bda 1109
86a59229 1110 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1111 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1112
86a59229 1113 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1114 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1115
86a59229 1116 } else {
1117 $res .= $c;
1118 }
8d063cd8 1119 }
0a12ae7d 1120
86a59229 1121 if( $parlev ){
1122 Warn( "unmatched `\\('", $fl );
1123 return undef();
8d063cd8 1124 }
0a12ae7d 1125
86a59229 1126 # final cleanup: eliminate raw HTs
1127 $res =~ s/\t/\\t/g;
1128 return $del . $res . ( $led ? $led : $del );
1129}
0a12ae7d 1130
86a59229 1131
1132#####
1133# sub2p - convert sed substitution to Perl substitution
1134#
1135sub sub2p($$$){
1136 my( $del, $subst, $fl ) = @_;
1137 my $led = $del;
1138 $led =~ tr/{([</})]>/;
1139 $led = '' if $led eq $del;
1140
1141 $subst = substr( $subst, 1, length($subst) - 2 );
1142 my $res = '';
1143
1144 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1145 my $c = substr( $subst, $ic, 1 );
1146 if( $c eq '\\' ){
1147 ### backslash escapes
1148 my $nc = peek($subst,$ic);
1149 if( $nc eq '' ){
1150 Warn( "`\\' cannot be last in substitution", $fl );
1151 return undef();
1152 }
1153 $ic++;
1154 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1155 $res .= '\\' . $nc;
1156 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1157 $res .= '${' . $nc . '}';
1158 } else { ## everything else (includes &): omit \
1159 $res .= $nc;
1160 }
1161 } elsif( $c eq '&' ){ ## & => $&
1162 $res .= '$&';
1163 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1164 $res .= '\\' . $c;
1165 } else {
1166 $res .= $c;
8d063cd8 1167 }
8d063cd8 1168 }
1169
86a59229 1170 # final cleanup: eliminate raw HTs
1171 $res =~ s/\t/\\t/g;
1172 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1173}
0a12ae7d 1174
86a59229 1175
1176sub Parse(){
1177 my $error = 0;
1178 my( $pdef, $pfil, $plin );
1179 for( my $icom = 0; $icom < @Commands; $icom++ ){
1180 my $cmd = $Commands[$icom];
1181 print "Parse:$cmd:\n" if $useDEBUG;
1182 $cmd =~ s/^\s+//;
1183 next unless length( $cmd );
1184 my $scom = $icom;
1185 if( exists( $Defined{$icom} ) ){
1186 $pdef = $Defined{$icom};
1187 if( $pdef =~ /^ #(\d+)/ ){
1188 $pfil = 'expression #';
1189 $plin = $1;
1190 } else {
1191 $pfil = "$pdef l.";
1192 $plin = 1;
1193 }
1194 } else {
1195 $plin++;
1196 }
1197 my $fl = "$pfil$plin";
1198
1199 # insert command as comment in gnerated code
1200 #
1201 $Code .= "# $cmd\n" if $doGenerate;
1202
1203 # The Address(es)
1204 #
1205 my( $negated, $naddr, $addr1, $addr2 );
1206 $naddr = 0;
1207 if( $cmd =~ s/^(\d+)\s*// ){
1208 $addr1 = "$1"; $naddr++;
1209 } elsif( $cmd =~ s/^\$\s*// ){
1210 $addr1 = 'eofARGV()'; $naddr++;
1211 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1212 my $del = $1;
1213 my $regex = stripRegex( $del, \$cmd );
1214 if( defined( $regex ) ){
1215 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1216 $naddr++;
1217 } else {
1218 Warn( "malformed regex, 1st address", $fl );
1219 $error++;
1220 next;
1221 }
1222 }
1223 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1224 if( $cmd =~ s/^(\d+)\s*// ){
1225 $addr2 = "$1"; $naddr++;
1226 } elsif( $cmd =~ s/^\$\s*// ){
1227 $addr2 = 'eofARGV()'; $naddr++;
1228 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1229 my $del = $1;
1230 my $regex = stripRegex( $del, \$cmd );
1231 if( defined( $regex ) ){
1232 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1233 $naddr++;
1234 } else {
1235 Warn( "malformed regex, 2nd address", $fl );
1236 $error++;
1237 next;
1238 }
1239 } else {
1240 Warn( "invalid address after `,'", $fl );
1241 $error++;
1242 next;
1243 }
1244 }
1245
1246 # address modifier `!'
1247 #
1248 $negated = $cmd =~ s/^!\s*//;
1249 if( defined( $addr1 ) ){
1250 print "Parse: addr1=$addr1" if $useDEBUG;
1251 if( defined( $addr2 ) ){
1252 print ", addr2=$addr2 " if $useDEBUG;
1253 # both numeric and addr1 > addr2 => eliminate addr2
1254 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1255 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1256 }
9ef589d8 1257 }
86a59229 1258 print 'negated' if $useDEBUG && $negated;
1259 print " command:$cmd\n" if $useDEBUG;
1260
1261 # The Command
1262 #
1263 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1264 my $h = substr( $cmd, 0, 1 );
1265 Warn( "unknown command `$h'", $fl );
1266 $error++;
8d063cd8 1267 next;
1268 }
86a59229 1269 my $key = $1;
8d063cd8 1270
86a59229 1271 my $tabref = $ComTab{$key};
1272 if( $naddr > $tabref->[0] ){
1273 Warn( "excess address(es)", $fl );
1274 $error++;
8d063cd8 1275 next;
1276 }
1277
86a59229 1278 my $arg = '';
1279 if( $tabref->[1] eq 'str' ){
1280 # take remainder - don't care if it is empty
1281 $arg = $cmd;
1282 $cmd = '';
1283
1284 } elsif( $tabref->[1] eq 'txt' ){
1285 # multi-line text
1286 my $goon = $cmd =~ /(.*)\\$/;
1287 if( length( $1 ) ){
1288 Warn( "extra characters after command ($cmd)", $fl );
1289 $error++;
8d063cd8 1290 }
86a59229 1291 while( $goon ){
1292 $icom++;
1293 if( $icom > $#Commands ){
1294 Warn( "unexpected end of script", $fl );
1295 $error++;
1296 last;
1297 }
1298 $cmd = $Commands[$icom];
1299 $Code .= "# $cmd\n" if $doGenerate;
1300 $goon = $cmd =~ s/\\$//;
1301 $cmd =~ s/\\(.)/$1/g;
1302 $arg .= "\n" if length( $arg );
1303 $arg .= $cmd;
8d063cd8 1304 }
86a59229 1305 $arg .= "\n" if length( $arg );
1306 $cmd = '';
8d063cd8 1307
86a59229 1308 } elsif( $tabref->[1] eq 'sub' ){
1309 # s///
1310 if( ! length( $cmd ) ){
1311 Warn( "`s' command requires argument", $fl );
1312 $error++;
1313 next;
1314 }
1315 if( $cmd =~ s{^([^\\\n])}{} ){
1316 my $del = $1;
1317 my $regex = stripRegex( $del, \$cmd );
1318 if( ! defined( $regex ) ){
1319 Warn( "malformed regular expression", $fl );
1320 $error++;
1321 next;
a687059c 1322 }
86a59229 1323 $regex = bre2p( $del, $regex, $fl );
1324
1325 # a trailing \ indicates embedded NL (in replacement string)
1326 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1327 $icom++;
1328 if( $icom > $#Commands ){
1329 Warn( "unexpected end of script", $fl );
1330 $error++;
1331 last;
8d063cd8 1332 }
86a59229 1333 $cmd .= $Commands[$icom];
1334 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1335 }
86a59229 1336
1337 my $subst = stripRegex( $del, \$cmd );
1338 if( ! defined( $regex ) ){
1339 Warn( "malformed substitution expression", $fl );
1340 $error++;
0a12ae7d 1341 next;
1342 }
86a59229 1343 $subst = sub2p( $del, $subst, $fl );
1344
1345 # parse s/// modifier: g|p|0-9|w <file>
1346 my( $global, $nmatch, $print, $write ) =
1347 ( '', '', 0, undef );
1348 while( $cmd =~ s/^([gp0-9])// ){
1349 $1 eq 'g' ? ( $global = 'g' ) :
1350 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1351 }
1352 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1353 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1354 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1355 Warn( "conflicting flags `$global$nmatch'", $fl );
1356 $error++;
0a12ae7d 1357 next;
1358 }
86a59229 1359
1360 $arg = makes( $regex, $subst,
1361 $write, $global, $print, $nmatch, $fl );
1362 if( ! defined( $arg ) ){
1363 $error++;
8d063cd8 1364 next;
1365 }
86a59229 1366
1367 } else {
1368 Warn( "improper delimiter in s command", $fl );
1369 $error++;
1370 next;
1371 }
1372
1373 } elsif( $tabref->[1] eq 'tra' ){
1374 # y///
1375 # a trailing \ indicates embedded newline
1376 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1377 $icom++;
1378 if( $icom > $#Commands ){
1379 Warn( "unexpected end of script", $fl );
1380 $error++;
1381 last;
1382 }
1383 $cmd .= $Commands[$icom];
1384 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1385 }
1386 if( ! length( $cmd ) ){
1387 Warn( "`y' command requires argument", $fl );
1388 $error++;
1389 next;
1390 }
1391 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1392 if( $d eq '\\' ){
1393 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1394 $error++;
1395 next;
1396 }
1397 my $fr = stripTrans( $d, \$cmd );
1398 if( ! defined( $fr ) || ! length( $cmd ) ){
1399 Warn( "malformed `y' command argument", $fl );
1400 $error++;
1401 next;
1402 }
1403 my $to = stripTrans( $d, \$cmd );
1404 if( ! defined( $to ) ){
1405 Warn( "malformed `y' command argument", $fl );
1406 $error++;
1407 next;
1408 }
1409 if( length($fr) != length($to) ){
1410 Warn( "string lengths in `y' command differ", $fl );
1411 $error++;
1412 next;
1413 }
1414 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1415 $error++;
1416 next;
8d063cd8 1417 }
8d063cd8 1418
8d063cd8 1419 }
1420
86a59229 1421 # $cmd must be now empty - exception is {
1422 if( $cmd !~ /^\s*$/ ){
1423 if( $key eq '{' ){
1424 # dirty hack to process command on '{' line
1425 $Commands[$icom--] = $cmd;
1426 } else {
1427 Warn( "extra characters after command ($cmd)", $fl );
1428 $error++;
1429 next;
1430 }
8d063cd8 1431 }
1432
86a59229 1433 # Make Code
1434 #
1435 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1436 $tabref->[3], $arg, $fl ) ){
1437 $error++;
8d063cd8 1438 }
86a59229 1439 }
8d063cd8 1440
86a59229 1441 while( @BlockStack ){
1442 my $bl = pop( @BlockStack );
1443 Warn( "start of unterminated `{'", $bl );
1444 $error++;
1445 }
8d063cd8 1446
86a59229 1447 for my $lab ( keys( %Label ) ){
1448 if( ! exists( $Label{$lab}{defined} ) ){
1449 for my $used ( @{$Label{$lab}{used}} ){
1450 Warn( "undefined label `$lab'", $used );
1451 $error++;
1452 }
8d063cd8 1453 }
86a59229 1454 }
8d063cd8 1455
86a59229 1456 exit( 1 ) if $error;
1457}
8d063cd8 1458
8d063cd8 1459
86a59229 1460##############
1461#### MAIN ####
1462##############
8d063cd8 1463
86a59229 1464sub usage(){
1465 print STDERR "Usage: sed [-an] command [file...]\n";
1466 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1467}
8d063cd8 1468
86a59229 1469###################
1470# Here we go again...
1471#
1472my $expr = 0;
1473while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1474 my $opt = $1;
1475 my $arg = $2;
1476 shift( @ARGV );
1477 if( $opt eq 'e' ){
1478 if( length( $arg ) ){
1479 push( @Commands, split( "\n", $arg ) );
1480 } elsif( @ARGV ){
1481 push( @Commands, shift( @ARGV ) );
1482 } else {
1483 Warn( "option -e requires an argument" );
1484 usage();
1485 exit( 1 );
1486 }
1487 $expr++;
1488 $Defined{$#Commands} = " #$expr";
1489 next;
1490 }
1491 if( $opt eq 'f' ){
1492 my $path;
1493 if( length( $arg ) ){
1494 $path = $arg;
1495 } elsif( @ARGV ){
1496 $path = shift( @ARGV );
1497 } else {
1498 Warn( "option -f requires an argument" );
1499 usage();
1500 exit( 1 );
1501 }
1502 my $fst = $#Commands + 1;
1503 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1504 my $cmd;
1505 while( defined( $cmd = <SCRIPT> ) ){
1506 chomp( $cmd );
1507 push( @Commands, $cmd );
1508 }
1509 close( SCRIPT );
1510 if( $#Commands >= $fst ){
1511 $Defined{$fst} = "$path";
8d063cd8 1512 }
86a59229 1513 next;
1514 }
1515 if( $opt eq '-' && $arg eq '' ){
1516 last;
1517 }
1518 if( $opt eq 'h' || $opt eq '?' ){
1519 usage();
1520 exit( 0 );
1521 }
1522 if( $opt eq 'n' ){
1523 $doAutoPrint = 0;
1524 } elsif( $opt eq 'a' ){
1525 $doOpenWrite = 0;
1526 } else {
1527 Warn( "illegal option `$opt'" );
1528 usage();
1529 exit( 1 );
1530 }
1531 if( length( $arg ) ){
1532 unshift( @ARGV, "-$arg" );
1533 }
1534}
8d063cd8 1535
86a59229 1536# A singleton command may be the 1st argument when there are no options.
1537#
1538if( @Commands == 0 ){
1539 if( @ARGV == 0 ){
1540 Warn( "no script command given" );
1541 usage();
1542 exit( 1 );
1543 }
1544 push( @Commands, split( "\n", shift( @ARGV ) ) );
1545 $Defined{0} = ' #1';
1546}
8d063cd8 1547
86a59229 1548print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1549
86a59229 1550# generate leading code
1551#
1552 $Code = <<'[TheEnd]';
1553
1554sub openARGV(){
1555 unshift( @ARGV, '-' ) unless @ARGV;
1556 my $file = shift( @ARGV );
1557 open( ARG, "<$file" )
1558 || die( "$0: can't open $file for reading ($!)\n" );
1559 $isEOF = 0;
1560}
8d063cd8 1561
86a59229 1562sub getsARGV(;\$){
1563 my $argref = @_ ? shift() : \$_;
1564 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1565 close( ARG );
1566 return 0 unless @ARGV;
1567 my $file = shift( @ARGV );
1568 open( ARG, "<$file" )
1569 || die( "$0: can't open $file for reading ($!)\n" );
1570 $isEOF = 0;
1571 }
1572 1;
1573}
8d063cd8 1574
86a59229 1575sub eofARGV(){
1576 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1577}
8d063cd8 1578
86a59229 1579sub makeHandle($){
1580 my( $path ) = @_;
1581 my $handle;
1582 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1583 $handle = $wFiles{$path} = gensym();
1584 if( $doOpenWrite ){
1585 if( ! open( $handle, ">$path" ) ){
1586 die( "$0: can't open $path for writing: ($!)\n" );
1587 }
9ef589d8 1588 }
86a59229 1589 } else {
1590 $handle = $wFiles{$path};
1591 }
1592 return $handle;
1593}
9ef589d8 1594
86a59229 1595sub _r($){
1596 my $path = shift();
1597 push( @Q, \$path );
1598}
8d063cd8 1599
86a59229 1600sub _l(){
1601 my $h = $_;
1602 my $mcpl = 70;
1603 $h =~ s/\\/\\\\/g;
1604 if( $h =~ /[^[:print:]]/ ){
1605 $h =~ s/\a/\\a/g;
1606 $h =~ s/\f/\\f/g;
1607 $h =~ s/\n/\\n/g;
1608 $h =~ s/\t/\\t/g;
1609 $h =~ s/\r/\\r/g;
1610 $h =~ s/\e/\\e/g;
1611 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1612 }
1613 while( length( $h ) > $mcpl ){
1614 my $l = substr( $h, 0, $mcpl-1 );
1615 $h = substr( $h, $mcpl );
1616 # remove incomplete \-escape from end of line
1617 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1618 $h = $1 . $h;
8d063cd8 1619 }
86a59229 1620 print $l, "\\\n";
8d063cd8 1621 }
86a59229 1622 print "$h\$\n";
8d063cd8 1623}
1624
86a59229 1625sub _w($){
1626 my $path = shift();
1627 my $handle = $wFiles{$path};
1628 if( ! $doOpenWrite &&
1629 ! defined( fileno( $handle ) ) ){
1630 open( $handle, ">$path" )
1631 || die( "$0: $path: cannot open ($!)\n" );
1632 }
1633 print $handle $_, "\n";
1634}
a687059c 1635
86a59229 1636# condition register test/reset
1637#
1638sub _t(){
1639 my $res = $CondReg;
1640 $CondReg = 0;
1641 $res;
1642}
0a12ae7d 1643
86a59229 1644# printQ
1645#
1646sub printQ(){
1647 for my $q ( @Q ){
1648 if( ref( $q ) ){
1649 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1650 open( $wFiles{$$q}, ">>$$q" );
1651 }
1652 if( open( RF, "<$$q" ) ){
1653 my $line;
1654 while( defined( $line = <RF> ) ){
1655 print $line;
1656 }
1657 close( RF );
1658 }
1659 } else {
1660 print $q;
a687059c 1661 }
86a59229 1662 }
1663 undef( @Q );
1664}
1665
1666sub Run(){
1667 my( $h, $icnt, $s, $n );
1668 # hack (not unbreakable :-/) to avoid // matching an empty string
1669 my $z = "\000"; $z =~ /$z/;
1670 # Initialize.
1671 openARGV();
1672 $Hold = '';
1673 $CondReg = 0;
1674 $doPrint = $doAutoPrint;
1675CYCLE:
1676 while( getsARGV() ){
1677 chomp();
1678 $CondReg = 0; # cleared on t
1679BOS:;
1680[TheEnd]
1681
1682 # parse - avoid opening files when doing s2p
1683 #
1684 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1685 if $doGenerate;
1686 Parse();
1687 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1688 if $doGenerate;
1689
1690 # append trailing code
1691 #
1692 $Code .= <<'[TheEnd]';
1693EOS: if( $doPrint ){
1694 print $_, "\n";
1695 } else {
1696 $doPrint = $doAutoPrint;
a687059c 1697 }
86a59229 1698 printQ() if @Q;
a687059c 1699 }
86a59229 1700
1701 exit( 0 );
a687059c 1702}
86a59229 1703[TheEnd]
1704
1705# magic "#n" - same as -n option
1706#
1707$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1708
86a59229 1709# eval code - check for errors
1710#
1711print "Code:\n$Code" if $useDEBUG;
1712eval $Code;
1713if( $@ ){
1714 print "Code:\n$Code";
1715 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8 1716}
1717
86a59229 1718if( $doGenerate ){
1719
1720 # write full Perl program
1721 #
1722
1723 # bang line, declarations
1724 print <<TheEnd;
1725#!$perlpath -w
1726eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1727 if 0;
1728\$0 =~ s/^.*?(\\w+)\$/\$1/;
1729
1730use strict;
1731use Symbol;
1732use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1733 \$doAutoPrint \$doOpenWrite \$doPrint };
1734\$doAutoPrint = $doAutoPrint;
1735\$doOpenWrite = $doOpenWrite;
1736TheEnd
1737
1738 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1739 if( $wf ne "''" ){
1740 print <<TheEnd;
1741sub makeHandle(\$);
1742for my \$p ( $wf ){
1743 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1744}
86a59229 1745TheEnd
1746 }
9ef589d8 1747
86a59229 1748 print $Code;
1749 print "&Run()\n";
1750 exit( 0 );
1aa91729 1751
86a59229 1752} else {
1753
1754 # execute: make handles (and optionally open) all w files; run!
1aa91729 1755
86a59229 1756 for my $p ( keys( %wFiles ) ){
1757 exit( 1 ) unless makeHandle( $p );
1758 }
1759 &Run();
1aa91729 1760}
86a59229 1761
1762
1763=head1 ENVIRONMENT
1764
1765The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1766See L<"Additional Atoms">.
1767
1768=head1 DIAGNOSTICS
1769
1770=over 4
1771
1772=item ambiguos translation for character `%s' in `y' command
1773
1774The indicated character appears twice, with different translations.
1775
1776=item `[' cannot be last in pattern
1777
1778A `[' in a BRE indicates the beginning of a I<bracket expression>.
1779
1780=item `\' cannot be last in pattern
1781
1782A `\' in a BRE is used to make the subsequent character literal.
1783
1784=item `\' cannot be last in substitution
1785
1786A `\' in a subsitution string is used to make the subsequent character literal.
1787
1788=item conflicting flags `%s'
1789
1790In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1791multiple n-th occurrence flags are specified. Note that only the digits
1792`1' through `9' are permitted.
1793
1794=item duplicate label %s (first defined at %s)
1795
1796=item excess address(es)
1797
1798The command has more than the permitted number of addresses.
1799
1800=item extra characters after command (%s)
1801
1802=item illegal option `%s'
1803
1804=item improper delimiter in s command
1805
1806The BRE and substitution may not be delimited with `\' or newline.
1807
1808=item invalid address after `,'
1809
1810=item invalid backreference (%s)
1811
1812The specified backreference number exceeds the number of backreferences
1813in the BRE.
1814
1815=item invalid repeat clause `\{%s\}'
1816
1817The repeat clause does not contain a valid integer value, or pair of
1818values.
1819
1820=item malformed regex, 1st address
1821
1822=item malformed regex, 2nd address
1823
1824=item malformed regular expression
1825
1826=item malformed substitution expression
1827
1828=item malformed `y' command argument
1829
1830The first or second string of a B<y> command is syntactically incorrect.
1831
1832=item maximum less than minimum in `\{%s\}'
1833
1834=item no script command given
1835
1836There must be at least one B<-e> or one B<-f> option specifying a
1837script or script file.
1838
1839=item `\' not valid as delimiter in `y' command
1840
1841=item option -e requires an argument
1842
1843=item option -f requires an argument
1844
1845=item `s' command requires argument
1846
1847=item start of unterminated `{'
1848
1849=item string lengths in `y' command differ
1850
1851The translation table strings in a B<y> commanf must have equal lengths.
1852
1853=item undefined label `%s'
1854
1855=item unexpected `}'
1856
1857A B<}> command without a preceding B<{> command was encountered.
1858
1859=item unexpected end of script
1860
1861The end of the script was reached although a text line after a
1862B<a>, B<c> or B<i> command indicated another line.
1863
1864=item unknown command `%s'
1865
1866=item unterminated `['
1867
1868A BRE contains an unterminated bracket expression.
1869
1870=item unterminated `\('
1871
1872A BRE contains an unterminated backreference.
1873
1874=item `\{' without closing `\}'
1875
1876A BRE contains an unterminated bounds specification.
1877
1878=item `\)' without preceding `\('
1879
1880=item `y' command requires argument
1881
1882=back
1883
1884=head1 EXAMPLE
1885
1886The basic material for the preceding section was generated by running
1887the sed script
1888
1889 #no autoprint
1890 s/^.*Warn( *"\([^"]*\)".*$/\1/
1891 t process
1892 b
1893 :process
1894 s/$!/%s/g
1895 s/$[_[:alnum:]]\{1,\}/%s/g
1896 s/\\\\/\\/g
1897 s/^/=item /
1898 p
1899
1900on the program's own text, and piping the output into C<sort -u>.
1901
1902
1903=head1 SED SCRIPT TRANSLATION
1904
1905If this program is invoked with the name F<s2p> it will act as a
1906sed-to-Perl translator. After option processing (all other
1907arguments are ignored), a Perl program is printed on standard
1908output, which will process the input stream (as read from all
1909arguments) in the way defined by the sed script and the option setting
1910used for the translation.
1911
1912=head1 SEE ALSO
1913
1914perl(1), re_format(7)
1915
1916=head1 BUGS
1917
1918The B<l> command will show escape characters (ESC) as `C<\e>', but
1919a vertical tab (VT) in octal.
1920
1921Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
1922
1923The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
1924is "the last pattern used, at run time". This deviates from the Perl
1925interpretation, which will re-use the "last last successfully executed
1926regular expression". Since keeping track of pattern usage would create
1927terribly cluttered code, and differences would only appear in obscure
1928context (where other B<sed> implementations appear to deviate, too),
1929the Perl semantics was adopted. Note that common usage of this feature,
1930such as in C</abc/s//xyz/>, will work as expected.
1931
1932Collating elements (of bracket expressions in BREs) are not implemented.
1933
1934=head1 STANDARDS
1935
1936This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
1937definition of B<sed>, and is compatible with the I<OpenBSD>
1938implementation, except where otherwise noted (see L<"BUGS">).
1939
1940=head1 AUTHOR
1941
1942This Perl implementation of I<sed> was written by Wolfgang Laun,
1943I<Wolfgang.Laun@alcatel.at>.
1944
1945=head1 COPYRIGHT and LICENSE
1946
1947This program is free and open software. You may use, modify,
1948distribute, and sell this program (and any modified variants) in any
1949way you wish, provided you do not restrict others from doing the same.
1950
1951=cut
1952
a687059c 1953!NO!SUBS!
4633a7c4 1954
1955close OUT or die "Can't close $file: $!";
1956chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
1957exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 1958chdir $origdir;