Commit | Line | Data |
fcb875d4 |
1 | #!/usr/local/bin/perl |
2 | # |
0ab8f81e |
3 | # $Id: ucmlint,v 0.2 2002/04/22 02:45:50 dankogai Exp $ |
fcb875d4 |
4 | # |
5 | |
6 | use strict; |
0ab8f81e |
7 | our $VERSION = do { my @r = (q$Revision: 0.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
fcb875d4 |
8 | |
9 | use Getopt::Std; |
10 | our %Opt; |
11 | getopts("Dehfv", \%Opt); |
12 | |
13 | if ($Opt{e}){ |
14 | eval{ require Encode; }; |
15 | $@ and die "can't load Encode : $@"; |
16 | } |
17 | |
18 | $Opt{h} and help(); |
19 | @ARGV or help(); |
20 | |
21 | sub help{ |
22 | print <<""; |
23 | $0 -[Dehfv] [ucm files ...] |
24 | -D debug mode on |
25 | -e test with Encode module also (requires perl 5.7.3 or higher) |
26 | -h shows this message |
27 | -f forces roundtrip check even for |[123] |
28 | -v verbose mode |
29 | |
30 | } |
31 | |
32 | $| = 1; |
33 | my (%Hdr, %U2E, %E2U); |
34 | my $in_charmap = 0; |
35 | my $nerror = 0; |
36 | my $nwarning = 0; |
37 | |
38 | sub nit($;$){ |
39 | my ($msg, $level) = @_; |
40 | my $lstr; |
41 | if ($level == 2){ |
42 | $lstr = 'notice'; |
43 | }elsif ($level == 1){ |
44 | $lstr = 'warning'; $nwarning++; |
45 | }else{ |
46 | $lstr = 'error'; $nerror++; |
47 | } |
48 | print "$ARGV:$lstr in line $.: $msg\n"; |
49 | } |
50 | |
51 | for $ARGV (@ARGV){ |
52 | open UCM, $ARGV or die "$ARGV:$!"; |
53 | %Hdr = %U2E = %E2U = (); |
54 | $in_charmap = $nerror = $nwarning = 0; |
55 | $. = 0; |
56 | while(<UCM>){ |
57 | chomp; |
58 | s/\s*#.*$//o; /^$/ and next; |
59 | if ($_ eq "CHARMAP"){ |
60 | $in_charmap = 1; |
61 | for my $must (qw/code_set_name mb_cur_min mb_cur_max/){ |
62 | exists $Hdr{$must} or nit "<$must> nonexistent"; |
63 | } |
64 | $Hdr{mb_cur_min} > $Hdr{mb_cur_max} |
65 | and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)", |
66 | $Hdr{mb_cur_min},$Hdr{mb_cur_max}); |
67 | $in_charmap = 1; |
68 | next; |
69 | } |
70 | unless ($in_charmap){ |
71 | my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next; |
72 | $Opt{D} and warn "$hkey => $hvalue"; |
73 | if ($hkey eq "code_set_name"){ # name check |
74 | exists $Hdr{code_set_name} |
75 | and nit "Duplicate <code_set_name>: $hkey"; |
76 | } |
77 | if ($hkey eq "code_set_alias"){ # alias check |
78 | $hvalue eq $Hdr{code_set_name} |
79 | and nit qq(alias "$hvalue" is already in <code_set_name>); |
80 | } |
81 | $Hdr{$hkey} = $hvalue; |
82 | }else{ |
83 | my $name = $Hdr{code_set_name}; |
84 | my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next; |
85 | $Opt{v} and nit $_, 2; |
86 | my $uni = uniparse($unistr); |
87 | my $enc = encparse($encstr); |
88 | $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb"; |
89 | $fb = $1; |
90 | $Opt{f} and $fb = 0; |
91 | unless ($fb == 1){ # check uni -> enc |
92 | if (exists $U2E{$uni}){ |
93 | nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1; |
94 | }else{ |
95 | $U2E{$uni} = $enc; |
96 | if ($Opt{e} and $fb != 3) { |
97 | my $e = hex2enc($enc); |
98 | my $u = hex2uni($uni); |
99 | my $eu = Encode::encode($name, $u); |
100 | $e eq $eu |
101 | or nit qq(encode('$name', $uni) != $enc); |
102 | } |
103 | } |
104 | } |
105 | unless ($fb == 3){ # check enc -> uni |
106 | if (exists $E2U{$enc}){ |
107 | nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1; |
108 | }else{ |
109 | $E2U{$enc} = $uni; |
110 | if ($Opt{e} and $fb != 1) { |
111 | my $e = hex2enc($enc); |
112 | my $u = hex2uni($uni); |
113 | $Opt{D} and warn "$uni, $enc"; |
114 | my $de = Encode::decode($name, $e); |
115 | $de eq $u |
116 | or nit qq(decode('$name', $enc) != $uni); |
117 | } |
118 | } |
119 | } |
120 | # warn "$uni, $enc, $fb"; |
121 | } |
122 | } |
123 | $in_charmap or nit "Where is CHARMAP?"; |
124 | checkRT(); |
125 | printf ("$ARGV: %s error%s found\n", |
126 | ($nerror == 0 ? 'no' : $nerror), |
127 | ($nerror > 1 ? 's' : '')); |
128 | } |
129 | |
130 | exit; |
131 | |
132 | sub hex2enc{ |
133 | pack("C*", map {hex($_)} split(",", shift)); |
134 | } |
135 | sub hex2uni{ |
136 | join("", map { chr(hex($_)) } split(",", shift)); |
137 | } |
138 | |
139 | sub checkRT{ |
140 | for my $uni (keys %E2U){ |
141 | my $enc = $U2E{$uni} or next; # okay |
142 | $E2U{$U2E{$uni}} eq $uni or |
143 | nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}"; |
144 | } |
145 | for my $enc (keys %E2U){ |
146 | my $uni = $E2U{$enc} or next; # okay |
147 | $U2E{$E2U{$enc}} eq $enc or |
148 | nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}"; |
149 | } |
150 | } |
151 | |
152 | |
153 | sub uniparse{ |
154 | my $str = shift; |
155 | my @u; |
156 | push @u, $1 while($str =~ /\G<U(.*?)>/ig); |
157 | for my $u (@u){ |
158 | $u =~ /^([0-9A-Za-z]+)$/o |
159 | or nit "malformed Unicode character: $u"; |
160 | } |
161 | return join(',', @u); |
162 | } |
163 | |
164 | sub encparse{ |
165 | my $str = shift; |
166 | my @e; |
167 | for my $e (split /\\x/io, $str){ |
168 | $e or next; # first \x |
169 | $e =~ /^([0-9A-Za-z]{1,2})$/io |
170 | or nit "Hex $e in $str is bogus"; |
171 | push @e, $1; |
172 | } |
173 | return join(',', @e); |
174 | } |
175 | |
176 | |
177 | |
178 | __END__ |
179 | |
0ab8f81e |
180 | A UCM file looks like this. |
fcb875d4 |
181 | |
182 | # |
183 | # Comments |
184 | # |
185 | <code_set_name> "US-ascii" # Required |
186 | <code_set_alias> "ascii" # Optional |
187 | <mb_cur_min> 1 # Required; usually 1 |
188 | <mb_cur_max> 1 # Max. # of bytes/char |
189 | <subchar> \x3F # Substitution char |
190 | # |
191 | CHARMAP |
192 | <U0000> \x00 |0 # <control> |
193 | <U0001> \x01 |0 # <control> |
194 | <U0002> \x02 |0 # <control> |
195 | .... |
196 | <U007C> \x7C |0 # VERTICAL LINE |
197 | <U007D> \x7D |0 # RIGHT CURLY BRACKET |
198 | <U007E> \x7E |0 # TILDE |
199 | <U007F> \x7F |0 # <control> |
200 | END CHARMAP |
201 | |