Commit | Line | Data |
3fea05b9 |
1 | package URI::_punycode; |
2 | |
3 | use strict; |
4 | our $VERSION = 0.02; |
5 | |
6 | require Exporter; |
7 | our @ISA = qw(Exporter); |
8 | our @EXPORT = qw(encode_punycode decode_punycode); |
9 | |
10 | use integer; |
11 | |
12 | our $DEBUG = 0; |
13 | |
14 | use constant BASE => 36; |
15 | use constant TMIN => 1; |
16 | use constant TMAX => 26; |
17 | use constant SKEW => 38; |
18 | use constant DAMP => 700; |
19 | use constant INITIAL_BIAS => 72; |
20 | use constant INITIAL_N => 128; |
21 | |
22 | my $Delimiter = chr 0x2D; |
23 | my $BasicRE = qr/[\x00-\x7f]/; |
24 | |
25 | sub _croak { require Carp; Carp::croak(@_); } |
26 | |
27 | sub digit_value { |
28 | my $code = shift; |
29 | return ord($code) - ord("A") if $code =~ /[A-Z]/; |
30 | return ord($code) - ord("a") if $code =~ /[a-z]/; |
31 | return ord($code) - ord("0") + 26 if $code =~ /[0-9]/; |
32 | return; |
33 | } |
34 | |
35 | sub code_point { |
36 | my $digit = shift; |
37 | return $digit + ord('a') if 0 <= $digit && $digit <= 25; |
38 | return $digit + ord('0') - 26 if 26 <= $digit && $digit <= 36; |
39 | die 'NOT COME HERE'; |
40 | } |
41 | |
42 | sub adapt { |
43 | my($delta, $numpoints, $firsttime) = @_; |
44 | $delta = $firsttime ? $delta / DAMP : $delta / 2; |
45 | $delta += $delta / $numpoints; |
46 | my $k = 0; |
47 | while ($delta > ((BASE - TMIN) * TMAX) / 2) { |
48 | $delta /= BASE - TMIN; |
49 | $k += BASE; |
50 | } |
51 | return $k + (((BASE - TMIN + 1) * $delta) / ($delta + SKEW)); |
52 | } |
53 | |
54 | sub decode_punycode { |
55 | my $code = shift; |
56 | |
57 | my $n = INITIAL_N; |
58 | my $i = 0; |
59 | my $bias = INITIAL_BIAS; |
60 | my @output; |
61 | |
62 | if ($code =~ s/(.*)$Delimiter//o) { |
63 | push @output, map ord, split //, $1; |
64 | return _croak('non-basic code point') unless $1 =~ /^$BasicRE*$/o; |
65 | } |
66 | |
67 | while ($code) { |
68 | my $oldi = $i; |
69 | my $w = 1; |
70 | LOOP: |
71 | for (my $k = BASE; 1; $k += BASE) { |
72 | my $cp = substr($code, 0, 1, ''); |
73 | my $digit = digit_value($cp); |
74 | defined $digit or return _croak("invalid punycode input"); |
75 | $i += $digit * $w; |
76 | my $t = ($k <= $bias) ? TMIN |
77 | : ($k >= $bias + TMAX) ? TMAX : $k - $bias; |
78 | last LOOP if $digit < $t; |
79 | $w *= (BASE - $t); |
80 | } |
81 | $bias = adapt($i - $oldi, @output + 1, $oldi == 0); |
82 | warn "bias becomes $bias" if $DEBUG; |
83 | $n += $i / (@output + 1); |
84 | $i = $i % (@output + 1); |
85 | splice(@output, $i, 0, $n); |
86 | warn join " ", map sprintf('%04x', $_), @output if $DEBUG; |
87 | $i++; |
88 | } |
89 | return join '', map chr, @output; |
90 | } |
91 | |
92 | sub encode_punycode { |
93 | my $input = shift; |
94 | # my @input = split //, $input; # doesn't work in 5.6.x! |
95 | my @input = map substr($input, $_, 1), 0..length($input)-1; |
96 | |
97 | my $n = INITIAL_N; |
98 | my $delta = 0; |
99 | my $bias = INITIAL_BIAS; |
100 | |
101 | my @output; |
102 | my @basic = grep /$BasicRE/, @input; |
103 | my $h = my $b = @basic; |
104 | push @output, @basic; |
105 | push @output, $Delimiter if $b && $h < @input; |
106 | warn "basic codepoints: (@output)" if $DEBUG; |
107 | |
108 | while ($h < @input) { |
109 | my $m = min(grep { $_ >= $n } map ord, @input); |
110 | warn sprintf "next code point to insert is %04x", $m if $DEBUG; |
111 | $delta += ($m - $n) * ($h + 1); |
112 | $n = $m; |
113 | for my $i (@input) { |
114 | my $c = ord($i); |
115 | $delta++ if $c < $n; |
116 | if ($c == $n) { |
117 | my $q = $delta; |
118 | LOOP: |
119 | for (my $k = BASE; 1; $k += BASE) { |
120 | my $t = ($k <= $bias) ? TMIN : |
121 | ($k >= $bias + TMAX) ? TMAX : $k - $bias; |
122 | last LOOP if $q < $t; |
123 | my $cp = code_point($t + (($q - $t) % (BASE - $t))); |
124 | push @output, chr($cp); |
125 | $q = ($q - $t) / (BASE - $t); |
126 | } |
127 | push @output, chr(code_point($q)); |
128 | $bias = adapt($delta, $h + 1, $h == $b); |
129 | warn "bias becomes $bias" if $DEBUG; |
130 | $delta = 0; |
131 | $h++; |
132 | } |
133 | } |
134 | $delta++; |
135 | $n++; |
136 | } |
137 | return join '', @output; |
138 | } |
139 | |
140 | sub min { |
141 | my $min = shift; |
142 | for (@_) { $min = $_ if $_ <= $min } |
143 | return $min; |
144 | } |
145 | |
146 | 1; |
147 | __END__ |
148 | |
149 | =head1 NAME |
150 | |
151 | URI::_punycode - encodes Unicode string in Punycode |
152 | |
153 | =head1 SYNOPSIS |
154 | |
155 | use URI::_punycode; |
156 | $punycode = encode_punycode($unicode); |
157 | $unicode = decode_punycode($punycode); |
158 | |
159 | =head1 DESCRIPTION |
160 | |
161 | URI::_punycode is a module to encode / decode Unicode strings into |
162 | Punycode, an efficient encoding of Unicode for use with IDNA. |
163 | |
164 | This module requires Perl 5.6.0 or over to handle UTF8 flagged Unicode |
165 | strings. |
166 | |
167 | =head1 FUNCTIONS |
168 | |
169 | This module exports following functions by default. |
170 | |
171 | =over 4 |
172 | |
173 | =item encode_punycode |
174 | |
175 | $punycode = encode_punycode($unicode); |
176 | |
177 | takes Unicode string (UTF8-flagged variable) and returns Punycode |
178 | encoding for it. |
179 | |
180 | =item decode_punycode |
181 | |
182 | $unicode = decode_punycode($punycode) |
183 | |
184 | takes Punycode encoding and returns original Unicode string. |
185 | |
186 | =back |
187 | |
188 | These functions throws exceptionsn on failure. You can catch 'em via |
189 | C<eval>. |
190 | |
191 | =head1 AUTHOR |
192 | |
193 | Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt> is the author of |
194 | IDNA::Punycode v0.02 which was the basis for this module. |
195 | |
196 | This library is free software; you can redistribute it and/or modify |
197 | it under the same terms as Perl itself. |
198 | |
199 | =head1 SEE ALSO |
200 | |
201 | L<IDNA::Punycode>, RFC 3492 |
202 | |
203 | =cut |