Commit | Line | Data |
a0d0e21e |
1 | package I18N::Collate; |
2 | |
f06db76b |
3 | =head1 NAME |
4 | |
69b19ea2 |
5 | I18N::Collate - compare 8-bit scalar data according to the current locale |
f06db76b |
6 | |
7 | =head1 SYNOPSIS |
8 | |
69b19ea2 |
9 | use I18N::Collate; |
f06db76b |
10 | setlocale(LC_COLLATE, 'locale-of-your-choice'); |
69b19ea2 |
11 | $s1 = new I18N::Collate "scalar_data_1"; |
12 | $s2 = new I18N::Collate "scalar_data_2"; |
f06db76b |
13 | |
14 | =head1 DESCRIPTION |
15 | |
16 | This module provides you with objects that will collate |
69b19ea2 |
17 | according to your national character set, provided that the |
18 | POSIX setlocale() function is supported on your system. |
f06db76b |
19 | |
20 | You can compare $s1 and $s2 above with |
21 | |
22 | $s1 le $s2 |
23 | |
24 | to extract the data itself, you'll need a dereference: $$s1 |
25 | |
69b19ea2 |
26 | This uses POSIX::setlocale(). The basic collation conversion is done by |
f06db76b |
27 | strxfrm() which terminates at NUL characters being a decent C routine. |
28 | collate_xfrm() handles embedded NUL characters gracefully. Due to C<cmp> |
29 | and overload magic, C<lt>, C<le>, C<eq>, C<ge>, and C<gt> work also. The |
30 | available locales depend on your operating system; try whether C<locale |
c2960299 |
31 | -a> shows them or man pages for "locale" or "nlsinfo" or |
32 | the direct approach C<ls /usr/lib/nls/loc> or C<ls |
33 | /usr/lib/nls>. Not all the locales that your vendor supports |
34 | are necessarily installed: please consult your operating system's |
69b19ea2 |
35 | documentation and possibly your local system administration. |
c2960299 |
36 | |
37 | The locale names are probably something like |
38 | C<"xx_XX.(ISO)?8859-N"> or C<"xx_XX.(ISO)?8859N">, for example |
39 | C<"fr_CH.ISO8859-1"> is the Swiss (CH) variant of French (fr), |
40 | ISO Latin (8859) 1 (-1) which is the Western European character set. |
f06db76b |
41 | |
42 | =cut |
43 | |
69b19ea2 |
44 | # I18N::Collate.pm |
a0d0e21e |
45 | # |
46 | # Author: Jarkko Hietaniemi <Jarkko.Hietaniemi@hut.fi> |
47 | # Helsinki University of Technology, Finland |
48 | # |
49 | # Acks: Guy Decoux <decoux@moulon.inra.fr> understood |
50 | # overloading magic much deeper than I and told |
51 | # how to cut the size of this code by more than half. |
52 | # (my first version did overload all of lt gt eq le ge cmp) |
53 | # |
54 | # Purpose: compare 8-bit scalar data according to the current locale |
55 | # |
56 | # Requirements: Perl5 POSIX::setlocale() and POSIX::strxfrm() |
57 | # |
58 | # Exports: setlocale 1) |
59 | # collate_xfrm 2) |
60 | # |
61 | # Overloads: cmp # 3) |
62 | # |
69b19ea2 |
63 | # Usage: use I18N::Collate; |
c2960299 |
64 | # setlocale(LC_COLLATE, 'locale-of-your-choice'); # 4) |
69b19ea2 |
65 | # $s1 = new I18N::Collate "scalar_data_1"; |
66 | # $s2 = new I18N::Collate "scalar_data_2"; |
a0d0e21e |
67 | # |
68 | # now you can compare $s1 and $s2: $s1 le $s2 |
69 | # to extract the data itself, you need to deref: $$s1 |
70 | # |
71 | # Notes: |
72 | # 1) this uses POSIX::setlocale |
73 | # 2) the basic collation conversion is done by strxfrm() which |
74 | # terminates at NUL characters being a decent C routine. |
75 | # collate_xfrm handles embedded NUL characters gracefully. |
76 | # 3) due to cmp and overload magic, lt le eq ge gt work also |
77 | # 4) the available locales depend on your operating system; |
c2960299 |
78 | # try whether "locale -a" shows them or man pages for |
79 | # "locale" or "nlsinfo" work or the more direct |
a0d0e21e |
80 | # approach "ls /usr/lib/nls/loc" or "ls /usr/lib/nls". |
c2960299 |
81 | # Not all the locales that your vendor supports |
82 | # are necessarily installed: please consult your |
83 | # operating system's documentation. |
a0d0e21e |
84 | # The locale names are probably something like |
c2960299 |
85 | # 'xx_XX.(ISO)?8859-N' or 'xx_XX.(ISO)?8859N', |
86 | # for example 'fr_CH.ISO8859-1' is the Swiss (CH) |
87 | # variant of French (fr), ISO Latin (8859) 1 (-1) |
88 | # which is the Western European character set. |
a0d0e21e |
89 | # |
69b19ea2 |
90 | # Updated: 19960104 1946 GMT |
a0d0e21e |
91 | # |
92 | # --- |
93 | |
94 | use POSIX qw(strxfrm LC_COLLATE); |
95 | |
96 | require Exporter; |
97 | |
98 | @ISA = qw(Exporter); |
99 | @EXPORT = qw(collate_xfrm setlocale LC_COLLATE); |
100 | @EXPORT_OK = qw(); |
101 | |
102 | %OVERLOAD = qw( |
103 | fallback 1 |
104 | cmp collate_cmp |
105 | ); |
106 | |
107 | sub new { my $new = $_[1]; bless \$new } |
108 | |
109 | sub setlocale { |
110 | my ($category, $locale) = @_[0,1]; |
111 | |
112 | POSIX::setlocale($category, $locale) if (defined $category); |
113 | # the current $LOCALE |
114 | $LOCALE = $locale || $ENV{'LC_COLLATE'} || $ENV{'LC_ALL'} || ''; |
115 | } |
116 | |
117 | sub C { |
118 | my $s = ${$_[0]}; |
119 | |
120 | $C->{$LOCALE}->{$s} = collate_xfrm($s) |
121 | unless (defined $C->{$LOCALE}->{$s}); # cache when met |
122 | |
123 | $C->{$LOCALE}->{$s}; |
124 | } |
125 | |
126 | sub collate_xfrm { |
127 | my $s = $_[0]; |
128 | my $x = ''; |
129 | |
130 | for (split(/(\000+)/, $s)) { |
131 | $x .= (/^\000/) ? $_ : strxfrm("$_\000"); |
132 | } |
133 | |
134 | $x; |
135 | } |
136 | |
137 | sub collate_cmp { |
138 | &C($_[0]) cmp &C($_[1]); |
139 | } |
140 | |
141 | # init $LOCALE |
142 | |
143 | &I18N::Collate::setlocale(); |
144 | |
145 | 1; # keep require happy |