2 # Copyright (c) 2010-2010 Sullivan Beck. All rights reserved.
3 # This program is free software; you can redistribute it and/or modify it
4 # under the same terms as Perl itself.
6 ###############################################################################
7 ###############################################################################
8 # This script is used to automatically generate the Locale::Codes module
9 # which contain the actual codes.
20 use vars qw($VERSION);
26 ###############################################################################
28 ###############################################################################
30 # We need to create the following variables:
32 # %Country{COUNTRY_ID} => [ COUNTRY, COUNTRY, ... ]
33 # A list of all valid country names that
34 # correspond to a given COUNTRY_ID.
35 # The names are all real (i.e. correct
36 # spelling and capitalization).
37 # %CountryAlias{ALIAS} => [ COUNTRY_ID, I ]
38 # A hash of all aliases for a country.
39 # Aliases are all lowercase.
40 # %Code2CountryID{CODESET}{CODE} => [ COUNTRY_ID, I ]
41 # In a given CODESET, CODE corresponds to
42 # the I'th entry list of countries.
43 # %CountryID2Code{CODESET}{COUNTRY_ID} => CODE
44 # In the given CODESET, the COUNTRY_ID
45 # corresponds to the given CODE.
47 use vars qw( $CountryID %Country %CountryAlias %Code2CountryID %CountryID2Code );
48 use vars qw(%country_alias);
52 use vars qw( $ModDir $CountryModule );
54 $ModDir = "lib/Locale/Codes";
55 $CountryModule = "Country";
58 # We'll first read data from the official ISO 3166.
60 # Data available consists only of the country names and 2-character
61 # codes. Country names include non-ASCII characters encoded in
62 # ISO-8859-1. Also, they're all uppercase! Every line in the file ends
63 # with one unprintable character. In other words, they're distributed
64 # in the most unfriendly fashion you could ask for! We'll store the
65 # first country for error checking.
68 use vars qw($country_iso_url $country_iso_file $country_iso_1st);
69 use vars qw(%country_iso_orig);
71 $country_iso_url = "http://www.iso.org/iso/list-en1-semic-3.txt";
72 $country_iso_1st = "AFGHANISTAN";
73 ($country_iso_file) = $country_iso_url =~ m,/([^/]*)$,;
76 # The UN Stats Division contains some (but not all) of the ISO 3166
77 # 3-character codes and 3-digit codes. Since they are the maintainers
78 # of this data, this is an official source.
81 use vars qw($country_un_url $country_un_file);
82 use vars qw(%country_un_orig);
84 $country_un_url = "http://unstats.un.org/unsd/methods/m49/m49alpha.htm";
85 ($country_un_file) = $country_un_url =~ m,/([^/]*)$,;
88 # The National Geospatial-Intelligence Agency is the official source
92 use vars qw($country_nga_url $country_nga_file);
93 use vars qw(%country_nga_orig %country_nga_ignore);
95 $country_nga_url = "http://earth-info.nga.mil/gns/html/digraphs.htm";
96 ($country_nga_file) = $country_nga_url =~ m,/([^/]*)$,;
102 use vars qw($country_iana_url $country_iana_file);
103 use vars qw(%country_iana_orig);
105 $country_iana_url = "http://www.iana.org/domains/root/db/index.html";
106 ($country_iana_file) = $country_iana_url =~ m,/([^/]*)$,;
109 # This is the CIA World Factbook, which is assumed to be a reliable
110 # source of this data. Due to the complexity of the data, we'll store
111 # the last country so we know when to stop.
113 # We have to force-override some codes.
116 use vars qw($country_cia_url $country_cia_file $country_cia_last);
117 use vars qw(%country_cia_ignore %country_cia_orig %country_cia_codes);
119 $country_cia_url = "https://www.cia.gov/library/publications/the-world-factbook/appendix/print_appendix-d.html";
120 ($country_cia_file) = $country_cia_url =~ m,/([^/]*)$,;
121 $country_cia_last = "Zimbabwe";
123 require "data.country.pl";
125 ########################################
127 # We need to create the following variables:
129 # %Language{LANGUAGE_ID} => [ LANGUAGE, LANGUAGE, ... ]
130 # A list of all valid language names that
131 # correspond to a given LANGUAGE_ID.
132 # The names are all real (i.e. correct
133 # spelling and capitalization).
134 # %LanguageAlias{ALIAS} => [ LANGUAGE_ID, I ]
135 # A hash of all aliases for a language.
136 # Aliases are all lowercase.
137 # %Code2LanguageID{CODESET}{CODE} => [ LANGUAGE_ID, I ]
138 # In a given CODESET, CODE corresponds to
139 # the I'th entry in the list of languages.
140 # %LanguageID2Code{CODESET}{LANGUAGE_ID} => CODE
141 # In the given CODESET, the LANGUAGE_ID
142 # corresponds to the given CODE.
144 use vars qw( $LanguageID %Language %LanguageAlias %Code2LanguageID %LanguageID2Code );
145 use vars qw(%language_alias);
147 $LanguageID = "0001";
149 use vars qw( $LanguageModule );
151 $LanguageModule = "Language";
154 # We'll first read data from the official ISO 639.
156 # Data available consists of the language names and 2-letter and
157 # 3-letter codes. Language names include non-ASCII characters encoded in
158 # UTF-8. And (amazingly enough) it's available in a field delimited file!!!
161 use vars qw($language_iso_url $language_iso_file);
162 use vars qw(%language_iso_orig);
164 $language_iso_url = "http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt";
165 ($language_iso_file) = $language_iso_url =~ m,/([^/]*)$,;
167 require "data.language.pl";
169 ########################################
171 # We need to create the following variables:
173 # %Currency{CURRENCY_ID} => [ CURRENCY, CURRENCY, ... ]
174 # A list of all valid currency names that
175 # correspond to a given CURRENCY_ID.
176 # The names are all real (i.e. correct
177 # spelling and capitalization).
178 # %CurrencyAlias{ALIAS} => [ CURRENCY_ID, I ]
179 # A hash of all aliases for a currency.
180 # Aliases are all lowercase.
181 # %Code2CurrencyID{CODESET}{CODE} => [ CURRENCY_ID, I ]
182 # In a given CODESET, CODE corresponds to
183 # the I'th entry in the list of currencies.
184 # %CurrencyID2Code{CODESET}{CURRENCY_ID} => CODE
185 # In the given CODESET, the CURRENCY_ID
186 # corresponds to the given CODE.
188 use vars qw( $CurrencyID %Currency %CurrencyAlias %Code2CurrencyID %CurrencyID2Code );
189 use vars qw(%currency_alias);
191 $CurrencyID = "0001";
193 use vars qw( $CurrencyModule );
195 $CurrencyModule = "Currency";
198 # We'll first read data from the official ISO 4217.
201 use vars qw($currency_iso_url $currency_iso_file $currency_iso_last);
202 use vars qw(%currency_iso_orig %currency_iso_ignore);
204 $currency_iso_url = "http://www.iso.org/iso/support/currency_codes_list-1.htm";
205 ($currency_iso_file) = $currency_iso_url =~ m,/([^/]*)$,;
206 $currency_iso_last = "XXX";
208 require "data.currency.pl";
210 ########################################
212 # We need to create the following variables:
214 # %Script{SCRIPT_ID} => [ SCRIPT, SCRIPT, ... ]
215 # A list of all valid script names that
216 # correspond to a given SCRIPT_ID.
217 # The names are all real (i.e. correct
218 # spelling and capitalization).
219 # %ScriptAlias{ALIAS} => [ SCRIPT_ID, I ]
220 # A hash of all aliases for a script.
221 # Aliases are all lowercase.
222 # %Code2ScriptID{CODESET}{CODE} => [ SCRIPT_ID, I ]
223 # In a given CODESET, CODE corresponds to
224 # the I'th entry in the list of scripts.
225 # %ScriptID2Code{CODESET}{SCRIPT_ID} => CODE
226 # In the given CODESET, the SCRIPT_ID
227 # corresponds to the given CODE.
229 use vars qw( $ScriptID %Script %ScriptAlias %Code2ScriptID %ScriptID2Code );
230 use vars qw(%script_alias);
234 use vars qw( $ScriptModule );
236 $ScriptModule = "Script";
239 # We'll first read data from the official ISO 15924.
241 # Data available consists of the script names and 2-letter and
242 # 3-letter codes. Script names include non-ASCII characters encoded in
243 # UTF-8. And (amazingly enough) it's available in a field delimited file!!!
246 use vars qw($script_iso_url $script_iso_file $script_iso_tmp);
247 use vars qw(%script_iso_orig %script_iso_ignore);
249 $script_iso_url = "http://www.unicode.org/iso15924/iso15924.txt.zip";
250 ($script_iso_file) = $script_iso_url =~ m,/([^/]*)$,;
251 $script_iso_tmp = "iso15924.txt";
253 require "data.script.pl";
255 ###############################################################################
257 ###############################################################################
265 -h/--help : Print help.
267 -a/--all : Do all steps
269 -c/--country : Get the country codes
270 -l/--language : Get the language codes
271 -r/--currency : Get the currency codes
272 -s/--script : Get the script codes
273 -C/--clean : Clean up all temporary files
276 ###############################################################################
278 ###############################################################################
289 (print $usage), exit if ($_ eq "-h" || $_ eq "--help");
291 $do_all = 1, next if ($_ eq "-a" || $_ eq "--all");
293 $do_country = 1, next if ($_ eq "-c" || $_ eq "--country");
294 $do_language = 1, next if ($_ eq "-l" || $_ eq "--language");
295 $do_currency = 1, next if ($_ eq "-r" || $_ eq "--currency");
296 $do_script = 1, next if ($_ eq "-s" || $_ eq "--script");
297 $do_clean = 1, next if ($_ eq "-C" || $_ eq "--clean");
300 ############################################################################
302 ############################################################################
304 do_country() if ($do_all || $do_country);
305 do_language() if ($do_all || $do_language);
306 do_currency() if ($do_all || $do_currency);
307 do_script() if ($do_all || $do_script);
308 do_clean() if ($do_all || $do_clean);
310 ############################################################################
312 ############################################################################
315 print "Country codes...\n";
318 print_table("country") if ($DEBUG == 2);
321 print_table("country") if ($DEBUG == 2);
324 print_table("country") if ($DEBUG == 2);
327 print_table("country") if ($DEBUG == 2);
330 print_table("country") if ($DEBUG == 2);
332 # Go through all aliases to pick up any that haven't already been
333 # added (since some aliases are for human convenience rather than
334 # dealing with variations between codesets).
335 do_aliases("country");
336 print_table("country") if ($DEBUG);
338 write_module("country");
341 ########################################
345 ### The first set we'll do is the ISO 3166-1 2-character
346 ### codes. These country names must be adjusted (since they're all
347 ### uppercase). Also, the lines all end with some strange
348 ### unprintable character.
351 my $codeset = "alpha2";
353 system("wget -N -q $country_iso_url");
354 my @in = `cat $country_iso_file`;
358 # File is a line of text followed by a blank line followed by the
359 # codes as ORIGNAME;CODE .
361 if ($in[1] || $in[2] !~ /^$country_iso_1st;/) {
362 die "ERROR [iso]: country code file format changed!\n";
368 foreach my $line (@in) {
369 if ($line !~ /^(.*);(.*)$/) {
370 die "ERROR [iso]: line invalid\n" .
373 my($country,$code) = ($1,$2);
375 if (exists $country_iso_orig{$country}) {
376 $country = $country_iso_orig{$country};
378 print "WARNING [iso]: unknown country: $country\n";
382 my $countryID = $CountryID++;
384 $Country{$countryID} = [ $country ];
385 $CountryAlias{lc($country)} = [ $countryID, 0 ];
386 $Code2CountryID{$codeset}{$code} = [ $countryID, 0 ];
387 $CountryID2Code{$codeset}{$countryID} = $code;
391 ########################################
394 print "\nINFO [un]: expect Channel Islands\n";
397 ### The UN data contains most of the alpha-3 and numeric code sets.
400 system("wget -N -q $country_un_url");
401 my @in = `cat $country_un_file`;
403 my $in = join("",@in);
405 # Clean up some things that could cause problems in parsing:
407 strip_tags(\$in,1,"br","p","strong","div");
408 $in =~ s,\ , ,g;
411 # Look for a table who's first row has the header:
414 my $found = jump_to_row(\$in,"ISO ALPHA-3 code");
416 die "ERROR [un]: country code file format changed!\n";
420 my @row = get_row("un",\$in);
423 my($num,$country,$alpha3) = @row;
424 $alpha3 = lc($alpha3);
427 if ($num !~ /^\d+$/ || length($num) > 3) {
428 print "WARNING [un]: Invalid numeric code: $country => $num\n";
433 if ($alpha3 && $alpha3 !~ /^[a-z][a-z][a-z]$/) {
434 print "WARNING [un]: Invalid alpha-3 code: $country => $alpha3\n";
438 if (exists $country_un_orig{$country}) {
439 $country = $country_un_orig{$country};
442 my($err,$countryID,$i,$t) = get_countryID("un",$country);
446 print "INFO [un]: new country: $alpha3/$num\t$country\n";
450 $num = "0$num" while (length($num) < 3);
451 $Code2CountryID{"num"}{$num} = [ $countryID, $i ];
452 $CountryID2Code{"num"}{$countryID} = $num;
456 $Code2CountryID{"alpha3"}{$alpha3} = [ $countryID, $i ];
457 $CountryID2Code{"alpha3"}{$countryID} = $alpha3;
462 ########################################
465 print "\nINFO [nga]: expect Serbia, Ashmore, Baker, Bassas, Clipperton,\n" .
466 " Coral, Europa, Gaza, Glorioso, Howland, Jan Mayen,\n" .
467 " Jarvis, Johnston, Juan, Kingman, Midway, Navassa, Palmyra,\n" .
468 " Paracel, Spratly, Svalbard, Tromelin, Wake, West\n";
471 ### The NGA data contains the FIPS 10 codes
474 system("wget -N -q $country_nga_url");
475 my @in = `cat $country_nga_file`;
477 my $in = join("",@in);
479 # Clean up some things that could cause problems in parsing:
481 strip_tags(\$in,1,"br","p","strong","div");
482 $in =~ s,\Q[United States}\E,,; # A horrible typo in the HTML
483 $in =~ s,\ , ,g;
485 $in =~ s,\(see note[^\)]*\), ,g;
486 $in =~ s,\[[^\]]*\], ,g;
489 # Look for a table who's first row has the header:
491 # and then a table with the header:
494 foreach my $table ("SHORT FORM NAME","Short Form Name") {
495 my $found = jump_to_row(\$in,$table);
497 die "ERROR [nga]: country code file format changed!\n";
501 my @row = get_row("nga",\$in);
504 my($short,$long,$code) = @row;
506 next if ($code eq "N/A");
507 next if (exists $country_nga_ignore{$short});
509 if ($code !~ /^[A-Z][A-Z]$/) {
510 print "WARNING [nga]: Invalid code: $short => $code\n";
513 if ($short eq "None") {
514 print "ERROR [nga]: no short definition: $code $long\n";
517 $long = "" if ($long eq "None");
519 if (exists $country_nga_orig{$short}) {
520 $short = $country_nga_orig{$short};
522 if (exists $country_nga_orig{$long}) {
523 $long = $country_nga_orig{$long};
527 my($err,$c,$ii,$t) = get_countryID("nga",$short,1);
531 ($countryID,$i) = ($c,$ii);
533 ($err,$c,$ii,$t) = get_countryID("nga",$long,1);
534 ($countryID,$i) = ($c,-1);
536 print "INFO [nga]: new country: $code\t$short\n";
540 if ($countryID == -1) {
542 $countryID = $CountryID++;
544 $Country{$countryID} = [ $short ];
545 $CountryAlias{lc($short)} = [ $countryID, $i ];
548 # $short is a new alias
549 push @{ $Country{$countryID} },$short;
550 $i = $#{ $Country{$countryID} };
551 $CountryAlias{lc($short)} = [ $countryID, $i ];
555 ! exists $CountryAlias{lc($long)}) {
556 # $long is a new alias
557 push @{ $Country{$countryID} },$long;
558 $i = $#{ $Country{$countryID} };
559 $CountryAlias{lc($long)} = [ $countryID, $i ];
562 $Code2CountryID{"fips"}{$code} = [ $countryID, $i ];
563 $CountryID2Code{"fips"}{$countryID} = $code;
568 ########################################
569 sub do_country_iana {
571 print "\nINFO [iana]: expect Acension, European, Soviet, Portuguese\n";
574 ### The IANA data contains the domain names
577 system("wget -N -q $country_iana_url");
578 my @in = `cat $country_iana_file`;
580 my $in = join("",@in);
582 # Clean up some things that could cause problems in parsing:
584 strip_tags(\$in,1,"a");
585 $in =~ s,\ , ,g;
586 $in =~ s,\(being phased out\), ,g;
589 # Look for a table who's first row has the header:
590 # Sponsoring Organisation
592 my $found = jump_to_row(\$in,"Sponsoring Organisation");
594 die "ERROR [iana]: country code file format changed!\n";
598 my @row = get_row("iana",\$in);
601 my($dom,$type,$country) = @row;
602 next unless ($type eq "country-code");
604 $country =~ s,<br.*,,;
606 if (exists $country_un_orig{$country}) {
607 $country = $country_un_orig{$country};
610 my($err,$countryID,$i,$t) = get_countryID("iana",$country);
614 print "INFO [iana]: new country: $dom\t$country\n";
617 $Code2CountryID{"dom"}{$dom} = [ $countryID, $i ];
618 $CountryID2Code{"dom"}{$countryID} = $dom;
622 ########################################
625 print "\nINFO [cia]: expect Antarctica, Bouvet, British, Christmas, Cocos,\n" .
626 " France (Metro), French, Heard, Kosovo, Montenegro,\n" .
627 " Saint Barth, Saint Martin, Serbia, Taiwan, US\n";
630 ### The CIA data is used to check:
631 ### alpha-2, alpha-3, numeric, fips 10
634 system("wget -N -q $country_cia_url");
635 my @in = `cat $country_cia_file`;
637 my $in = join("",@in);
640 # Clean up some things that could cause problems in parsing:
642 strip_tags(\$in,1,"br","p","strong","div","a","b","img");
643 $in =~ s,\ , ,g;
646 # Move to a table with "Entity" as one of the entries. This
647 # table is followed by the entires.
649 my $found = jump_to_entry(\$in,"Entity");
651 die "ERROR [cia]: country code file format changed!\n";
654 # Each entry is quite complicated. Each is a single table (with a
655 # table nested in it) of the form:
674 # After all of the "A" entries, a special table containing the headers
680 # Read the next entry
683 my($country,$fips,$alpha2,$alpha3,$num,$stanag,$dom);
685 my $found = jump_to_table(\$in);
687 print "ERROR [cia]: malformed file\n";
690 $country = get_entry(\$in);
692 # The first entry is empty if it's at the end of the entries
693 # for a given letter.
694 $found = jump_to_entry(\$in,"Entity");
696 die "ERROR [cia]: country code file format changed!\n";
701 if (exists $country_cia_orig{$country}) {
702 $country = $country_cia_orig{$country};
705 $fips = uc(get_entry(\$in));
706 $found = jump_to_table(\$in);
708 print "ERROR [cia]: malformed file\n";
711 $alpha2 = lc(get_entry(\$in));
712 $alpha3 = lc(get_entry(\$in));
713 $num = get_entry(\$in);
714 $stanag = get_entry(\$in);
715 $dom = uc(get_entry(\$in));
718 next if (exists $country_cia_ignore{$country});
720 $alpha2 = $country_cia_codes{"alpha2"}{$country}
721 if (exists $country_cia_codes{"alpha2"}{$country});
722 $alpha3 = $country_cia_codes{"alpha3"}{$country}
723 if (exists $country_cia_codes{"alpha3"}{$country});
724 $num = $country_cia_codes{"num"}{$country}
725 if (exists $country_cia_codes{"num"}{$country});
726 $fips = $country_cia_codes{"fips"}{$country}
727 if (exists $country_cia_codes{"fips"}{$country});
728 $dom = $country_cia_codes{"dom"}{$country}
729 if (exists $country_cia_codes{"dom"}{$country});
732 # Get the countryID if the country or ANY of the codes match.
735 my($err,$countryID,$i,$type) = get_countryID("cia",$country,1);
738 if ($countryID == -1 && $alpha2 ne "-") {
739 if (exists $Code2CountryID{"alpha2"}{$alpha2}) {
740 ($countryID,$i) = @{ $Code2CountryID{"alpha2"}{$alpha2} };
744 if ($countryID == -1 && $alpha3 ne "-") {
745 if (exists $Code2CountryID{"alpha3"}{$alpha3}) {
746 ($countryID,$i) = @{ $Code2CountryID{"alpha3"}{$alpha3} };
750 if ($countryID == -1 && $num ne "-") {
751 if (exists $Code2CountryID{"num"}{$num}) {
752 ($countryID,$i) = @{ $Code2CountryID{"num"}{$num} };
756 if ($countryID == -1 && $fips ne "-") {
757 if (exists $Code2CountryID{"fips"}{$fips}) {
758 ($countryID,$i) = @{ $Code2CountryID{"fips"}{$fips} };
762 if ($countryID == -1 && $dom ne "-") {
763 if (exists $Code2CountryID{"dom"}{$dom}) {
764 ($countryID,$i) = @{ $Code2CountryID{"dom"}{$dom} };
768 if ($countryID == -1) {
769 $countryID = $CountryID++;
771 $Country{$countryID} = [ $country ];
772 $CountryAlias{lc($country)} = [ $countryID, $i ];
776 # Now check that any previously defined values match the
780 if ($alpha2 ne "-") {
781 my $err = check_code("cia","alpha2",$alpha2,$country,$countryID);
785 if ($alpha3 ne "-") {
786 my $err = check_code("cia","alpha3",$alpha3,$country,$countryID);
791 my $err = check_code("cia","num",$num,$country,$countryID);
796 my $err = check_code("cia","fips",$fips,$country,$countryID);
801 my $err = check_code("cia","dom",$dom,$country,$countryID);
805 last if ($country eq $country_cia_last);
809 ########################################
811 my($type,$codeset,$code,$country,$countryID) = @_;
813 # Check to make sure that the code is defined.
815 if (exists $Code2CountryID{$codeset}{$code}) {
816 return _check_code_exists($type,$codeset,$code,$country,$countryID);
818 return _check_code_new($type,$codeset,$code,$country,$countryID);
822 sub _check_code_exists {
823 my($type,$codeset,$code,$country,$countryID) = @_;
825 # Check the countryID for the code. It must be the same as the one
828 my $old_countryID = $Code2CountryID{$codeset}{$code}[0];
829 if ($countryID != $old_countryID) {
830 print "ERROR [$type]: countryID mismatch in code: [$codeset, $country, $code, $countryID != $old_countryID ]\n";
834 # If the country is defined, it must be the same CountryID. If it
835 # is not, create a new alias.
837 if (exists $CountryAlias{lc($country)}) {
839 my $alt_countryID = $CountryAlias{lc($country)}[0];
841 if ($countryID != $alt_countryID) {
842 print "ERROR [$type]: countryID mismatch in country: [$codeset, $country, $code, $countryID != $alt_countryID ]\n";
847 push @{ $Country{$countryID} },$country;
848 my $i = $#{ $Country{$countryID} };
849 $CountryAlias{lc($country)} = [ $countryID, $i ];
853 # This is a new code.
854 sub _check_code_new {
855 my($type,$codeset,$code,$country,$countryID) = @_;
857 print "INFO [$type]: New code: $codeset [$code] => $country\n";
859 # If this country name isn't defined, create it.
862 if (exists $CountryAlias{lc($country)}) {
863 $i = $CountryAlias{lc($country)}[1];
865 push @{ $Country{$countryID} },$country;
866 $i = $#{ $Country{$countryID} };
867 $CountryAlias{lc($country)} = [ $countryID, $i ];
870 # This country name is the canonical name for the code.
872 $CountryID2Code{$codeset}{$countryID} = $code;
873 $Code2CountryID{$codeset}{$code} = [ $countryID, $i ];
878 ########################################
880 my($type,$country,$no_create) = @_;
882 my($countryID,$i,$t);
883 if (exists $CountryAlias{lc($country)}) {
884 # The country is the same name as one previously defined
885 ($countryID,$i) = @{ $CountryAlias{lc($country)} };
888 } elsif (exists $country_alias{$country}) {
889 # It's a new alias for an existing country
890 my $c = $country_alias{$country};
891 if (! exists $CountryAlias{lc($c)}) {
892 print "WARNING [$type]: alias referenced before it is defined: $country => $c\n";
895 $countryID = $CountryAlias{lc($c)}[0];
896 push @{ $Country{$countryID} },$country;
897 $i = $#{ $Country{$countryID} };
898 $CountryAlias{lc($country)} = [ $countryID, $i ];
902 # It's a new country.
904 return(0,-1,-1,"new");
906 $countryID = $CountryID++;
908 $Country{$countryID} = [ $country ];
909 $CountryAlias{lc($country)} = [ $countryID, $i ];
913 return(0,$countryID,$i,$t);
916 ############################################################################
918 ############################################################################
921 print "Language codes...\n";
924 print_table("language") if ($DEBUG == 2);
926 # Go through all aliases to pick up any that haven't already been
927 # added (since some aliases are for human convenience rather than
928 # dealing with variations between codesets).
929 do_aliases("language");
930 print_table("language") if ($DEBUG);
932 write_module("language");
935 ########################################
936 sub do_language_iso {
938 ### The first set we'll do is the ISO codes.
941 system("wget -N -q $language_iso_url");
942 open(my $in,'<:encoding(utf8)',$language_iso_file);
947 # File is a set of lines of fields delimited by "|". Fields are:
952 # English names (semicolon separated list)
955 foreach my $line (@in) {
956 my($alpha3,$term,$alpha2,$english,$french) = split(/\|/,$line);
957 # The first line has some binary characters at the start.
958 if (length($alpha3)>3) {
959 $alpha3 = substr($alpha3,length($alpha3)-3);
962 if (exists $language_iso_orig{$english}) {
963 $english = $language_iso_orig{$english};
965 my $languageID = $LanguageID++;
966 my @language = split(/\s*;\s*/,$english);
968 $Language{$languageID} = [ @language ];
969 for (my $i=0; $i<=$#language; $i++) {
970 my $language = $language[$i];
971 $LanguageAlias{lc($language)} = [ $languageID, $i ];
975 $Code2LanguageID{"alpha3"}{$alpha3} = [ $languageID, 0 ];
976 $LanguageID2Code{"alpha3"}{$languageID} = $alpha3;
980 $Code2LanguageID{"term"}{$term} = [ $languageID, 0 ];
981 $LanguageID2Code{"term"}{$languageID} = $term;
985 $Code2LanguageID{"alpha2"}{$alpha2} = [ $languageID, 0 ];
986 $LanguageID2Code{"alpha2"}{$languageID} = $alpha2;
991 ############################################################################
993 ############################################################################
996 print "Currency codes...\n";
999 print_table("currency") if ($DEBUG == 2);
1001 # Go through all aliases to pick up any that haven't already been
1002 # added (since some aliases are for human convenience rather than
1003 # dealing with variations between codesets).
1004 do_aliases("currency");
1005 print_table("currency") if ($DEBUG);
1007 write_module("currency");
1010 ########################################
1011 sub do_currency_iso {
1013 ### The first set we'll do is the ISO 4217 codes.
1016 system("wget -N -q $currency_iso_url");
1017 my @in = `cat $currency_iso_file`;
1019 my $in = join("",@in);
1021 # Clean up some things that could cause problems in parsing:
1023 strip_tags(\$in,1,"p","a","strong","div");
1024 $in =~ s,\ , ,g;
1029 # Look for a table who's first row has the header:
1032 my $found = jump_to_row(\$in,"Entity");
1034 die "ERROR [iso]: currency code file format changed!\n";
1038 my @row = get_row("iso",\$in);
1041 my($ent,$currencies,$alphas,$nums) = @row;
1042 $nums = "" if ($nums eq "Nil");
1043 next if (! $alphas && ! $nums);
1045 my(@currency,@num,@alpha,$done);
1046 @currency = split(/<br\s*\/><br\s*\/>/,$currencies);
1047 @num = split(/<br\s*\/><br\s*\/>/,$nums);
1048 @alpha = split(/<br\s*\/><br\s*\/>/,$alphas);
1050 if ( ($nums && $#num != $#currency) ||
1051 ($alphas && $#alpha != $#currency) ) {
1052 print "WARNING [iso]: Invalid line: $currencies => $alphas, $nums\n";
1056 for (my $i=0; $i<=$#currency; $i++) {
1057 my $currency = $currency[$i];
1058 my $num = (@num ? $num[$i] : "");
1059 my $alpha = (@alpha ? $alpha[$i] : "");
1060 $done = 1 if ($alpha eq $currency_iso_last);
1062 if (exists $currency_iso_orig{$currency}) {
1063 $currency = $currency_iso_orig{$currency};
1067 if ($num !~ /^\d+$/ || length($num) > 3) {
1068 print "WARNING [iso]: Invalid numeric code: $currency => $num\n";
1073 $alpha = uc($alpha);
1074 if ($alpha && $alpha !~ /^[A-Z][A-Z][A-Z]$/) {
1075 print "WARNING [iso]: Invalid alpha code: $currency => $alpha\n";
1079 next if (exists $currency_iso_ignore{$alpha});
1081 # There's a lot of duplication, so make sure that if this code
1082 # exists, it is consistant.
1086 if (exists $Code2CurrencyID{"num"}{$num}) {
1088 my($currencyID,$i) = @{ $Code2CurrencyID{"num"}{$num} };
1089 push(@currencyID,$currencyID);
1091 if (exists $Code2CurrencyID{"alpha"}{$alpha}) {
1093 my($currencyID,$i) = @{ $Code2CurrencyID{"alpha"}{$alpha} };
1094 push(@currencyID,$currencyID);
1097 if ($#currencyID == 1) {
1098 if ($currencyID[0] != $currencyID[1]) {
1099 print "WARNING [iso]: CurrencyID mismatch: $currency => $alpha,$num\n";
1106 $currencyID = $CurrencyID++;
1107 $Currency{$currencyID} = [ $currency ];
1108 $CurrencyAlias{lc($currency)} = [ $currencyID, 0 ]
1110 $currencyID = $currencyID[0];
1114 $num = "0$num" while (length($num) < 3);
1115 $Code2CurrencyID{"num"}{$num} = [ $currencyID, 0 ];
1116 $CurrencyID2Code{"num"}{$currencyID} = $num;
1120 $Code2CurrencyID{"alpha"}{$alpha} = [ $currencyID, 0 ];
1121 $CurrencyID2Code{"alpha"}{$currencyID} = $alpha;
1130 ############################################################################
1132 ############################################################################
1135 print "Script codes...\n";
1138 print_table("script") if ($DEBUG == 2);
1140 # Go through all aliases to pick up any that haven't already been
1141 # added (since some aliases are for human convenience rather than
1142 # dealing with variations between codesets).
1143 do_aliases("script");
1144 print_table("script") if ($DEBUG);
1146 write_module("script");
1149 ########################################
1152 ### The first set we'll do is the ISO 15924 codes. We
1153 ### can get a zip file which contains the data.
1156 system("wget -N -q $script_iso_url");
1157 my $zip = Archive::Zip->new($script_iso_file);
1159 my @members = grep(/^iso15924.*\.txt/,$zip->memberNames());
1160 if (@members != 1) {
1161 die "ERROR [iso]: zip file changed format\n";
1163 my($file) = @members;
1164 $zip->extractMember($file,$script_iso_tmp);
1167 # The zip file contains a series of lines in the form:
1168 # alpha;numeric;english;...
1169 # The data is in UTF-8.
1171 # Every line has an unprintable character at the end.
1174 open(my $in,'<:encoding(utf8)',$script_iso_tmp);
1180 foreach my $line (@in) {
1181 next if (! $line || $line =~ /^\043/);
1182 my($alpha,$num,$script) = split(/;/,$line);
1183 $alpha = ucfirst(lc($alpha));
1184 next if (exists $script_iso_ignore{$alpha});
1186 if (exists $script_iso_orig{$script}) {
1187 $script = $script_iso_orig{$script};
1189 my $scriptID = $ScriptID++;
1191 $Script{$scriptID} = [ $script ];
1192 $ScriptAlias{lc($script)} = [ $scriptID, 0 ];
1194 $Code2ScriptID{"alpha"}{$alpha} = [ $scriptID, 0 ];
1195 $ScriptID2Code{"alpha"}{$scriptID} = $alpha;
1197 $Code2ScriptID{"num"}{$num} = [ $scriptID, 0 ];
1198 $ScriptID2Code{"num"}{$scriptID} = $num;
1202 ############################################################################
1204 ############################################################################
1209 my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID);
1211 if ($caller eq "country") {
1212 $type_alias = \%country_alias;
1213 $TypeAlias = \%CountryAlias;
1215 $TypeID2Code = \%CountryID2Code;
1216 $Code2TypeID = \%Code2CountryID;
1218 } elsif ($caller eq "language") {
1219 $type_alias = \%language_alias;
1220 $TypeAlias = \%LanguageAlias;
1222 $TypeID2Code = \%LanguageID2Code;
1223 $Code2TypeID = \%Code2LanguageID;
1225 } elsif ($caller eq "currency") {
1226 $type_alias = \%currency_alias;
1227 $TypeAlias = \%CurrencyAlias;
1229 $TypeID2Code = \%CurrencyID2Code;
1230 $Code2TypeID = \%Code2CurrencyID;
1232 } elsif ($caller eq "script") {
1233 $type_alias = \%script_alias;
1234 $TypeAlias = \%ScriptAlias;
1236 $TypeID2Code = \%ScriptID2Code;
1237 $Code2TypeID = \%Code2ScriptID;
1240 return($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID);
1246 my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID)
1247 = _type_hashes($caller);
1249 foreach my $typeID (sort keys %$Type) {
1250 my @type = @{ $$Type{$typeID} };
1253 my $type = shift(@type);
1254 if (length($type) < 40) {
1255 $type .= " "x(40-length($type));
1257 $type = substr($type,0,40);
1260 print "${typeID}[$i] = $type ";
1261 foreach my $codeset (keys %$Code2TypeID) {
1263 if (exists $$TypeID2Code{$codeset}{$typeID}) {
1264 my $code = $$TypeID2Code{$codeset}{$typeID};
1265 my($code_id,$code_i) = @{ $$Code2TypeID{$codeset}{$code} };
1266 $field = "$code [$code_id,$code_i]";
1267 $field .= " ERR" if ($code_id != $typeID);
1269 $field = $field . " "x(18-length($field));
1273 foreach $type (@type) {
1275 if (length($type) > 40) {
1276 $type = substr($type,0,40);
1278 print " [$i] = $type\n";
1283 ############################################################################
1285 ############################################################################
1290 my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID)
1291 = _type_hashes($caller);
1293 # Add remaining aliases.
1295 foreach my $alias (keys %$type_alias) {
1296 my $type = $$type_alias{$alias};
1298 next if (exists $$TypeAlias{lc($type)} &&
1299 exists $$TypeAlias{lc($alias)});
1301 if (! exists $$TypeAlias{lc($type)} &&
1302 ! exists $$TypeAlias{lc($alias)}) {
1303 print "WARNING: unused type in alias list: $type\n";
1304 print "WARNING: unused type in alias list: $alias\n";
1309 if (exists $$TypeAlias{lc($type)}) {
1310 $typeID = $$TypeAlias{lc($type)}[0];
1313 $typeID = $$TypeAlias{lc($alias)}[0];
1316 push @{ $$Type{$typeID} },$type;
1317 my $i = $#{ $$Type{$typeID} };
1318 $$TypeAlias{lc($type)} = [ $typeID, $i ];
1322 ############################################################################
1324 ############################################################################
1329 my($module,%hashes,$id);
1331 if ($type eq "country") {
1332 $module = $CountryModule;
1333 %hashes = ("id2names" => "Country",
1334 "alias2id" => "CountryAlias",
1335 "code2id" => "Code2CountryID",
1336 "id2code" => "CountryID2Code");
1338 } elsif ($type eq "language") {
1339 $module = $LanguageModule;
1340 %hashes = ("id2names" => "Language",
1341 "alias2id" => "LanguageAlias",
1342 "code2id" => "Code2LanguageID",
1343 "id2code" => "LanguageID2Code");
1345 } elsif ($type eq "currency") {
1346 $module = $CurrencyModule;
1347 %hashes = ("id2names" => "Currency",
1348 "alias2id" => "CurrencyAlias",
1349 "code2id" => "Code2CurrencyID",
1350 "id2code" => "CurrencyID2Code");
1352 } elsif ($type eq "script") {
1353 $module = $ScriptModule;
1354 %hashes = ("id2names" => "Script",
1355 "alias2id" => "ScriptAlias",
1356 "code2id" => "Code2ScriptID",
1357 "id2code" => "ScriptID2Code");
1361 my $file = "$ModDir/$module.pm";
1363 system("mv $file $file.bak");
1366 my $out = new IO::File;
1367 $out->open(">$file");
1368 my $timestamp = `date`;
1371 my $podstr = '=pod'; # so the CPAN indexer won't treat this as a POD file
1373 print $out "package Locale::Codes::$module;
1375 # This file was automatically generated. Any changes to this file will
1376 # be lost the next time 'get_codes' is run.
1377 # Generated on: $timestamp
1383 Locale::Codes::$module - $type codes for the Locale::$module module
1387 This module contains data used by the Locale::$module module. It is
1388 not intended to be used directly, and contains no calleable routines.
1392 See Locale::Codes for full author history.
1394 Currently maintained by Sullivan Beck (sbeck\@cpan.org).
1398 Copyright (c) 1997-2001 Canon Research Centre Europe (CRE).
1399 Copyright (c) 2001-2010 Neil Bowers
1400 Copyright (c) 2010-2010 Sullivan Beck
1402 This module is free software; you can redistribute it and/or
1403 modify it under the same terms as Perl itself.
1411 use vars qw(\$VERSION);
1414 \$Locale::Codes::Data{'$type'}{'id'} = '$id';
1418 foreach my $h qw(id2names alias2id code2id id2code) {
1419 my $hash = $hashes{$h};
1420 print $out "\$Locale::Codes::Data{'$type'}{'$h'} = {\n";
1421 _write_hash($out,$hash);
1423 print $out "};\n\n";
1432 my($out,$hashname) = @_;
1435 my %hash = %$hashname;
1437 _write_subhash($out,3,\%hash);
1440 sub _write_subhash {
1441 my($out,$indent,$hashref) = @_;
1443 my %hash = %$hashref;
1444 my $ind = " "x$indent;
1446 foreach my $key (sort keys %hash) {
1447 my $val = $hash{$key};
1448 if (ref($val) eq "HASH") {
1449 print $out "${ind}q($key) => {\n";
1450 _write_subhash($out,$indent+3,$val);
1451 print $out "${ind} },\n";
1452 } elsif (ref($val) eq "ARRAY") {
1453 print $out "${ind}q($key) => [\n";
1454 _write_sublist($out,$indent+3,$val);
1455 print $out "${ind} ],\n";
1457 print $out "${ind}q($key) => q($val),\n";
1462 sub _write_sublist {
1463 my($out,$indent,$listref) = @_;
1465 my @list = @$listref;
1466 my $ind = " "x$indent;
1468 foreach my $val (@list) {
1469 if (ref($val) eq "HASH") {
1470 print $out "${ind}{\n";
1471 _write_subhash($out,$indent+3,$val);
1472 print $out "${ind}},\n";
1473 } elsif (ref($val) eq "ARRAY") {
1474 print $out "${ind}[\n";
1475 _write_sublist($out,$indent+3,$val);
1476 print $out "${ind}],\n";
1478 print $out "${ind}q($val),\n";
1483 ############################################################################
1485 ############################################################################
1488 print "Cleaning...\n";
1490 system("rm -f $country_iso_file");
1491 system("rm -f $country_un_file");
1492 system("rm -f $country_nga_file");
1493 system("rm -f $country_cia_file");
1494 system("rm -f $country_iana_file");
1495 system("rm -f $language_iso_file");
1496 system("rm -f $currency_iso_file");
1497 system("rm -f $script_iso_file");
1498 system("rm -f $script_iso_tmp");
1499 system("rm -rf __MACOSX");
1502 ############################################################################
1504 ############################################################################
1507 my($type,$inref) = @_;
1509 return () if ($$inref !~ m,^\s*<tr,);
1511 if ($$inref !~ s,^(.*?)</tr[^>]*>,,) {
1512 die "ERROR [$type]: malformed HTML\n";
1516 if ($row =~ m,<table,) {
1517 die "ERROR [$type]: embedded table\n";
1521 while ($row =~ s,(?:.*?)<(td|th)[^>]*>\s*(.*?)\s*</\1[^>]*>,,) {
1530 my($inref,$header) = @_;
1532 if ($$inref =~ s,(.*?)\Q$header\E(.*?)</tr[^>]*>(.*?)(?=<tr),,) {
1540 my($inref,$value) = @_;
1542 if ($$inref =~ s,(.*?)(?=<(?:td|th)[^>]*>\s*\Q$value\E\s*),,) {
1552 if ($$inref =~ s,(.*?)(?=<table),,) {
1562 if ($$inref =~ s,.*?<td[^>]*>\s*(.*?)\s*</td[^>]*>,,) {
1569 my($inref,$close,@tags) = @_;
1571 foreach my $tag (@tags) {
1573 $$inref =~ s,</?$tag[^>]*>, ,g;
1575 $$inref =~ s,<$tag[^>]*>, ,g;
1582 # indent-tabs-mode: nil
1583 # cperl-indent-level: 3
1584 # cperl-continued-statement-offset: 2
1585 # cperl-continued-brace-offset: 0
1586 # cperl-brace-offset: 0
1587 # cperl-brace-imaginary-offset: 0
1588 # cperl-label-offset: -2