From: Nick Ing-Simmons Date: Tue, 5 Feb 2002 16:01:15 +0000 (+0000) Subject: More Encode tweaks: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4cfc977cb33a032e78e373bce7db50a1970926f3;p=p5sagit%2Fp5-mst-13.2.git More Encode tweaks: - make expensive and marginal substring search optional (-O) - enable -O for ASCII-oid encodings (search space is small) - add ASCII-oid jis0201 to basic Encode.so - add some other Japanese encodings to EUC_JP bundle (without -O) p4raw-id: //depot/perlio@14563 --- diff --git a/MANIFEST b/MANIFEST index b5598d6..47870bb 100644 --- a/MANIFEST +++ b/MANIFEST @@ -296,6 +296,7 @@ ext/Encode/Encode/gsm0338.enc Encode table ext/Encode/Encode/HZ.enc Encode table ext/Encode/Encode/ir-197.enc Encode table ext/Encode/Encode/jis0201.enc Encode table +ext/Encode/Encode/jis0201.ucm Encode table ext/Encode/Encode/jis0208.enc Encode table ext/Encode/Encode/jis0212.enc Encode table ext/Encode/Encode/koi8-f.enc Encode table diff --git a/ext/Encode/EUC_JP/Makefile.PL b/ext/Encode/EUC_JP/Makefile.PL index 0327741..ffa6902 100644 --- a/ext/Encode/EUC_JP/Makefile.PL +++ b/ext/Encode/EUC_JP/Makefile.PL @@ -3,11 +3,10 @@ use strict; use ExtUtils::MakeMaker; my %tables = (EUC_JP => [ - 'euc-jp.ucm', -# 'jis0201.enc', -# 'jis0212.enc', -# 'jis0208.enc', -# 'shiftjis.enc', + 'euc-jp.ucm', + 'jis0208.enc', + 'jis0212.enc', + 'shiftjis.enc', ]); diff --git a/ext/Encode/Encode/jis0201.ucm b/ext/Encode/Encode/jis0201.ucm new file mode 100644 index 0000000..a14f8ce --- /dev/null +++ b/ext/Encode/Encode/jis0201.ucm @@ -0,0 +1,231 @@ +# compile -o Encode/jis0201.ucm Encode/jis0201.enc +<code_set_name> "jis0201" +<mbcs_cur_min> 1 +<mbcs_cur_max> 1 +<subchar> \x3F +# +CHARMAP +<U0000> \x00 |0 # <control> +<U0001> \x01 |0 # <control> +<U0002> \x02 |0 # <control> +<U0003> \x03 |0 # <control> +<U0004> \x04 |0 # <control> +<U0005> \x05 |0 # <control> +<U0006> \x06 |0 # <control> +<U0007> \x07 |0 # <control> +<U0008> \x08 |0 # <control> +<U0009> \x09 |0 # <control> +<U000A> \x0A |0 # <control> +<U000B> \x0B |0 # <control> +<U000C> \x0C |0 # <control> +<U000D> \x0D |0 # <control> +<U000E> \x0E |0 # <control> +<U000F> \x0F |0 # <control> +<U0010> \x10 |0 # <control> +<U0011> \x11 |0 # <control> +<U0012> \x12 |0 # <control> +<U0013> \x13 |0 # <control> +<U0014> \x14 |0 # <control> +<U0015> \x15 |0 # <control> +<U0016> \x16 |0 # <control> +<U0017> \x17 |0 # <control> +<U0018> \x18 |0 # <control> +<U0019> \x19 |0 # <control> +<U001A> \x1A |0 # <control>
+<U001B> \x1B |0 # <control> +<U001C> \x1C |0 # <control> +<U001D> \x1D |0 # <control> +<U001E> \x1E |0 # <control> +<U001F> \x1F |0 # <control> +<U0020> \x20 |0 # SPACE +<U0021> \x21 |0 # EXCLAMATION MARK +<U0022> \x22 |0 # QUOTATION MARK +<U0023> \x23 |0 # NUMBER SIGN +<U0024> \x24 |0 # DOLLAR SIGN +<U0025> \x25 |0 # PERCENT SIGN +<U0026> \x26 |0 # AMPERSAND +<U0027> \x27 |0 # APOSTROPHE +<U0028> \x28 |0 # LEFT PARENTHESIS +<U0029> \x29 |0 # RIGHT PARENTHESIS +<U002A> \x2A |0 # ASTERISK +<U002B> \x2B |0 # PLUS SIGN +<U002C> \x2C |0 # COMMA +<U002D> \x2D |0 # HYPHEN-MINUS +<U002E> \x2E |0 # FULL STOP +<U002F> \x2F |0 # SOLIDUS +<U0030> \x30 |0 # DIGIT ZERO +<U0031> \x31 |0 # DIGIT ONE +<U0032> \x32 |0 # DIGIT TWO +<U0033> \x33 |0 # DIGIT THREE +<U0034> \x34 |0 # DIGIT FOUR +<U0035> \x35 |0 # DIGIT FIVE +<U0036> \x36 |0 # DIGIT SIX +<U0037> \x37 |0 # DIGIT SEVEN +<U0038> \x38 |0 # DIGIT EIGHT +<U0039> \x39 |0 # DIGIT NINE +<U003A> \x3A |0 # COLON +<U003B> \x3B |0 # SEMICOLON +<U003C> \x3C |0 # LESS-THAN SIGN +<U003D> \x3D |0 # EQUALS SIGN +<U003E> \x3E |0 # GREATER-THAN SIGN +<U003F> \x3F |0 # QUESTION MARK +<U0040> \x40 |0 # COMMERCIAL AT +<U0041> \x41 |0 # LATIN CAPITAL LETTER A +<U0042> \x42 |0 # LATIN CAPITAL LETTER B +<U0043> \x43 |0 # LATIN CAPITAL LETTER C +<U0044> \x44 |0 # LATIN CAPITAL LETTER D +<U0045> \x45 |0 # LATIN CAPITAL LETTER E +<U0046> \x46 |0 # LATIN CAPITAL LETTER F +<U0047> \x47 |0 # LATIN CAPITAL LETTER G +<U0048> \x48 |0 # LATIN CAPITAL LETTER H +<U0049> \x49 |0 # LATIN CAPITAL LETTER I +<U004A> \x4A |0 # LATIN CAPITAL LETTER J +<U004B> \x4B |0 # LATIN CAPITAL LETTER K +<U004C> \x4C |0 # LATIN CAPITAL LETTER L +<U004D> \x4D |0 # LATIN CAPITAL LETTER M +<U004E> \x4E |0 # LATIN CAPITAL LETTER N +<U004F> \x4F |0 # LATIN CAPITAL LETTER O +<U0050> \x50 |0 # LATIN CAPITAL LETTER P +<U0051> \x51 |0 # LATIN CAPITAL LETTER Q +<U0052> \x52 |0 # LATIN CAPITAL LETTER R +<U0053> \x53 |0 # LATIN CAPITAL LETTER S +<U0054> \x54 |0 # LATIN CAPITAL LETTER T +<U0055> \x55 |0 # LATIN CAPITAL LETTER U +<U0056> \x56 |0 # LATIN CAPITAL LETTER V +<U0057> \x57 |0 # LATIN CAPITAL LETTER W +<U0058> \x58 |0 # LATIN CAPITAL LETTER X +<U0059> \x59 |0 # LATIN CAPITAL LETTER Y +<U005A> \x5A |0 # LATIN CAPITAL LETTER Z +<U005B> \x5B |0 # LEFT SQUARE BRACKET +<U005C> \x5C |0 # REVERSE SOLIDUS +<U005D> \x5D |0 # RIGHT SQUARE BRACKET +<U005E> \x5E |0 # CIRCUMFLEX ACCENT +<U005F> \x5F |0 # LOW LINE +<U0060> \x60 |0 # GRAVE ACCENT +<U0061> \x61 |0 # LATIN SMALL LETTER A +<U0062> \x62 |0 # LATIN SMALL LETTER B +<U0063> \x63 |0 #
LATIN SMALL LETTER C +<U0064> \x64 |0 # LATIN SMALL LETTER D +<U0065> \x65 |0 # LATIN SMALL LETTER E +<U0066> \x66 |0 # LATIN SMALL LETTER F +<U0067> \x67 |0 # LATIN SMALL LETTER G +<U0068> \x68 |0 # LATIN SMALL LETTER H +<U0069> \x69 |0 # LATIN SMALL LETTER I +<U006A> \x6A |0 # LATIN SMALL LETTER J +<U006B> \x6B |0 # LATIN SMALL LETTER K +<U006C> \x6C |0 # LATIN SMALL LETTER L +<U006D> \x6D |0 # LATIN SMALL LETTER M +<U006E> \x6E |0 # LATIN SMALL LETTER N +<U006F> \x6F |0 # LATIN SMALL LETTER O +<U0070> \x70 |0 # LATIN SMALL LETTER P +<U0071> \x71 |0 # LATIN SMALL LETTER Q +<U0072> \x72 |0 # LATIN SMALL LETTER R +<U0073> \x73 |0 # LATIN SMALL LETTER S +<U0074> \x74 |0 # LATIN SMALL LETTER T +<U0075> \x75 |0 # LATIN SMALL LETTER U +<U0076> \x76 |0 # LATIN SMALL LETTER V +<U0077> \x77 |0 # LATIN SMALL LETTER W +<U0078> \x78 |0 # LATIN SMALL LETTER X +<U0079> \x79 |0 # LATIN SMALL LETTER Y +<U007A> \x7A |0 # LATIN SMALL LETTER Z +<U007B> \x7B |0 # LEFT CURLY BRACKET +<U007C> \x7C |0 # VERTICAL LINE +<U007D> \x7D |0 # RIGHT CURLY BRACKET +<U203E> \x7E |0 # OVERLINE +<U007F> \x7F |0 # <control> +<U0080> \x80 |0 # <control> +<U0081> \x81 |0 # <control> +<U0082> \x82 |0 # <control> +<U0083> \x83 |0 # <control> +<U0084> \x84 |0 # <control> +<U0085> \x85 |0 # <control> +<U0086> \x86 |0 # <control> +<U0087> \x87 |0 # <control> +<U0088> \x88 |0 # <control> +<U0089> \x89 |0 # <control> +<U008A> \x8A |0 # <control> +<U008B> \x8B |0 # <control> +<U008C> \x8C |0 # <control> +<U008D> \x8D |0 # <control> +<U008E> \x8E |0 # <control> +<U008F> \x8F |0 # <control> +<U0090> \x90 |0 # <control> +<U0091> \x91 |0 # <control> +<U0092> \x92 |0 # <control> +<U0093> \x93 |0 # <control> +<U0094> \x94 |0 # <control> +<U0095> \x95 |0 # <control> +<U0096> \x96 |0 # <control> +<U0097> \x97 |0 # <control> +<U0098> \x98 |0 # <control> +<U0099> \x99 |0 # <control> +<U009A> \x9A |0 # <control> +<U009B> \x9B |0 # <control> +<U009C> \x9C |0 # <control> +<U009D> \x9D |0 # <control> +<U009E> \x9E |0 # <control> +<U009F> \x9F |0 # <control> +<UFF61> \xA1 |0 # HALFWIDTH IDEOGRAPHIC FULL STOP +<UFF62> \xA2 |0 # HALFWIDTH LEFT CORNER BRACKET +<UFF63> \xA3 |0 # HALFWIDTH RIGHT CORNER BRACKET +<UFF64> \xA4 |0 # HALFWIDTH IDEOGRAPHIC COMMA +<UFF65> \xA5 |0 # HALFWIDTH KATAKANA MIDDLE DOT +<UFF66> \xA6 |0 # HALFWIDTH KATAKANA LETTER WO +<UFF67> \xA7 |0 # HALFWIDTH KATAKANA LETTER SMALL A +<UFF68> \xA8 |0 # HALFWIDTH KATAKANA LETTER SMALL I +<UFF69> \xA9 |0 # HALFWIDTH KATAKANA LETTER SMALL U +<UFF6A> \xAA |0 # HALFWIDTH KATAKANA LETTER SMALL E +<UFF6B> \xAB |0 # HALFWIDTH KATAKANA LETTER SMALL O +<UFF6C> \xAC |0 # HALFWIDTH KATAKANA LETTER SMALL YA +<UFF6D> \xAD |0 # HALFWIDTH KATAKANA LETTER SMALL YU +<UFF6E> \xAE |0 # HALFWIDTH KATAKANA LETTER SMALL YO +<UFF6F> \xAF |0 # HALFWIDTH KATAKANA LETTER SMALL TU +<UFF70> \xB0 |0 # HALFWIDTH KATAKANA-HIRAGANA
PROLONGED SOUND MARK +<UFF71> \xB1 |0 # HALFWIDTH KATAKANA LETTER A +<UFF72> \xB2 |0 # HALFWIDTH KATAKANA LETTER I +<UFF73> \xB3 |0 # HALFWIDTH KATAKANA LETTER U +<UFF74> \xB4 |0 # HALFWIDTH KATAKANA LETTER E +<UFF75> \xB5 |0 # HALFWIDTH KATAKANA LETTER O +<UFF76> \xB6 |0 # HALFWIDTH KATAKANA LETTER KA +<UFF77> \xB7 |0 # HALFWIDTH KATAKANA LETTER KI +<UFF78> \xB8 |0 # HALFWIDTH KATAKANA LETTER KU +<UFF79> \xB9 |0 # HALFWIDTH KATAKANA LETTER KE +<UFF7A> \xBA |0 # HALFWIDTH KATAKANA LETTER KO +<UFF7B> \xBB |0 # HALFWIDTH KATAKANA LETTER SA +<UFF7C> \xBC |0 # HALFWIDTH KATAKANA LETTER SI +<UFF7D> \xBD |0 # HALFWIDTH KATAKANA LETTER SU +<UFF7E> \xBE |0 # HALFWIDTH KATAKANA LETTER SE +<UFF7F> \xBF |0 # HALFWIDTH KATAKANA LETTER SO +<UFF80> \xC0 |0 # HALFWIDTH KATAKANA LETTER TA +<UFF81> \xC1 |0 # HALFWIDTH KATAKANA LETTER TI +<UFF82> \xC2 |0 # HALFWIDTH KATAKANA LETTER TU +<UFF83> \xC3 |0 # HALFWIDTH KATAKANA LETTER TE +<UFF84> \xC4 |0 # HALFWIDTH KATAKANA LETTER TO +<UFF85> \xC5 |0 # HALFWIDTH KATAKANA LETTER NA +<UFF86> \xC6 |0 # HALFWIDTH KATAKANA LETTER NI +<UFF87> \xC7 |0 # HALFWIDTH KATAKANA LETTER NU +<UFF88> \xC8 |0 # HALFWIDTH KATAKANA LETTER NE +<UFF89> \xC9 |0 # HALFWIDTH KATAKANA LETTER NO +<UFF8A> \xCA |0 # HALFWIDTH KATAKANA LETTER HA +<UFF8B> \xCB |0 # HALFWIDTH KATAKANA LETTER HI +<UFF8C> \xCC |0 # HALFWIDTH KATAKANA LETTER HU +<UFF8D> \xCD |0 # HALFWIDTH KATAKANA LETTER HE +<UFF8E> \xCE |0 # HALFWIDTH KATAKANA LETTER HO +<UFF8F> \xCF |0 # HALFWIDTH KATAKANA LETTER MA +<UFF90> \xD0 |0 # HALFWIDTH KATAKANA LETTER MI +<UFF91> \xD1 |0 # HALFWIDTH KATAKANA LETTER MU +<UFF92> \xD2 |0 # HALFWIDTH KATAKANA LETTER ME +<UFF93> \xD3 |0 # HALFWIDTH KATAKANA LETTER MO +<UFF94> \xD4 |0 # HALFWIDTH KATAKANA LETTER YA +<UFF95> \xD5 |0 # HALFWIDTH KATAKANA LETTER YU +<UFF96> \xD6 |0 # HALFWIDTH KATAKANA LETTER YO +<UFF97> \xD7 |0 # HALFWIDTH KATAKANA LETTER RA +<UFF98> \xD8 |0 # HALFWIDTH KATAKANA LETTER RI +<UFF99> \xD9 |0 # HALFWIDTH KATAKANA LETTER RU +<UFF9A> \xDA |0 # HALFWIDTH KATAKANA LETTER RE +<UFF9B> \xDB |0 # HALFWIDTH KATAKANA LETTER RO +<UFF9C> \xDC |0 # HALFWIDTH KATAKANA LETTER WA +<UFF9D> \xDD |0 # HALFWIDTH KATAKANA LETTER N +<UFF9E> \xDE |0 # HALFWIDTH KATAKANA VOICED SOUND MARK +<UFF9F> \xDF |0 # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +END CHARMAP diff --git
a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index 5193d0e..d6db779 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -2,7 +2,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; -my %tables = (8859 => ['ascii.ucm', 'cp1250.ucm', 'koi8-r.ucm' ], +my %tables = (8859 => ['ascii.ucm', 'cp1250.ucm', 'koi8-r.ucm', 'jis0201.ucm' ], EBCDIC => ['cp1047.ucm','cp37.ucm','posix-bc.ucm'], Symbols => ['symbol.ucm','dingbats.ucm'], ); @@ -93,7 +93,7 @@ sub postamble $continuator = ''; } } - $str .= "\n\t\$(PERL) compile -o \$\@ -f $table.fnm\n\n"; + $str .= "\n\t\$(PERL) compile -O -o \$\@ -f $table.fnm\n\n"; open (FILELIST, ">$table.fnm") || die "Could not open $table.fnm: $!"; foreach my $file (@{$tables{$table}}) diff --git a/ext/Encode/compile b/ext/Encode/compile index f52b4ed..ee6d778 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -46,7 +46,7 @@ sub encode_M eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); my %opt; -getopts('qo:f:n:',\%opt); +getopts('qOo:f:n:',\%opt); my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV); chmod(0666,$cname) if -f $cname && !-w $cname; open(C,">$cname") || die "Cannot open $cname:$!"; @@ -434,17 +434,20 @@ sub outstring } else { - foreach my $o (keys %strings) - { - my $i = index($o,$s); - if ($i >= 0) - { - $sym = $strings{$o}; - $sym .= sprintf("+0x%02x",$i) if ($i); - $subsave += length($s); - return $sym; - } - } + if ($opt{'O'}) { + foreach my $o (keys %strings) + { + my $i = index($o,$s); + if ($i >= 0) + { + $sym = $strings{$o}; + $sym .= sprintf("+0x%02x",$i) if ($i); + $subsave += length($s); + $strings{$s} = $sym; + return $sym; + } + } + } $strings{$s} = $sym = $name; $strings += length($s); printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s);