cpan/Locale-Codes/internal/get_codes

   1 #!/usr/bin/perl -w
   2 # Copyright (c) 2010-2010 Sullivan Beck.  All rights reserved.
   3 # This program is free software; you can redistribute it and/or modify it
   4 # under the same terms as Perl itself.
   5
   6 ###############################################################################
   7 ###############################################################################
   8 # This script is used to automatically generate the Locale::Codes module
   9 # which contain the actual codes.
  10
  11 require 5.000000;
  12 use YAML;
  13 use IO::File;
  14 use strict;
  15 use warnings;
  16 use Archive::Zip;
  17
  18 use lib "./internal";
  19
  20 use vars qw($VERSION);
  21 $VERSION='3.12';
  22
  23 use vars qw($DEBUG);
  24 $DEBUG = 0;
  25
  26 ###############################################################################
  27 # GLOBAL VARIABLES
  28 ###############################################################################
  29
  30 # We need to create the following variables:
  31 #
  32 #  %Country{COUNTRY_ID}           => [ COUNTRY, COUNTRY, ... ]
  33 #                                    A list of all valid country names that
  34 #                                    correspond to a given COUNTRY_ID.
  35 #                                    The names are all real (i.e. correct
  36 #                                    spelling and capitalization).
  37 #  %CountryAlias{ALIAS}           => [ COUNTRY_ID, I ]
  38 #                                    A hash of all aliases for a country.
  39 #                                    Aliases are all lowercase.
  40 #  %Code2CountryID{CODESET}{CODE} => [ COUNTRY_ID, I ]
  41 #                                    In a given CODESET, CODE corresponds to
  42 #                                    the I'th entry list of countries.
  43 #  %CountryID2Code{CODESET}{COUNTRY_ID} => CODE
  44 #                                    In the given CODESET, the COUNTRY_ID
  45 #                                    corresponds to the given CODE.
  46
  47 use vars qw( $CountryID %Country %CountryAlias %Code2CountryID %CountryID2Code );
  48 use vars qw(%country_alias);
  49
  50 $CountryID = "0001";
  51
  52 use vars qw( $ModDir $CountryModule );
  53
  54 $ModDir        = "lib/Locale/Codes";
  55 $CountryModule = "Country";
  56
  57 #
  58 # We'll first read data from the official ISO 3166.
  59 #
  60 # Data available consists only of the country names and 2-character
  61 # codes. Country names include non-ASCII characters encoded in
  62 # ISO-8859-1. Also, they're all uppercase! Every line in the file ends
  63 # with one unprintable character. In other words, they're distributed
  64 # in the most unfriendly fashion you could ask for! We'll store the
  65 # first country for error checking.
  66 #
  67
  68 use vars qw($country_iso_url $country_iso_file $country_iso_1st);
  69 use vars qw(%country_iso_orig);
  70
  71 $country_iso_url    = "http://www.iso.org/iso/list-en1-semic-3.txt";
  72 $country_iso_1st    = "AFGHANISTAN";
  73 ($country_iso_file) = $country_iso_url =~ m,/([^/]*)$,;
  74
  75 #
  76 # The UN Stats Division contains some (but not all) of the ISO 3166
  77 # 3-character codes and 3-digit codes. Since they are the maintainers
  78 # of this data, this is an official source.
  79 #
  80
  81 use vars qw($country_un_url $country_un_file);
  82 use vars qw(%country_un_orig);
  83
  84 $country_un_url    = "http://unstats.un.org/unsd/methods/m49/m49alpha.htm";
  85 ($country_un_file) = $country_un_url =~ m,/([^/]*)$,;
  86
  87 #
  88 # The National Geospatial-Intelligence Agency is the official source
  89 # for FIPS 10 codes.
  90 #
  91
  92 use vars qw($country_nga_url $country_nga_file);
  93 use vars qw(%country_nga_orig %country_nga_ignore);
  94
  95 $country_nga_url    = "http://earth-info.nga.mil/gns/html/digraphs.htm";
  96 ($country_nga_file) = $country_nga_url =~ m,/([^/]*)$,;
  97
  98 #
  99 # IANA domains
 100 #
 101
 102 use vars qw($country_iana_url $country_iana_file);
 103 use vars qw(%country_iana_orig);
 104
 105 $country_iana_url    = "http://www.iana.org/domains/root/db/index.html";
 106 ($country_iana_file) = $country_iana_url =~ m,/([^/]*)$,;
 107
 108 #
 109 # This is the CIA World Factbook, which is assumed to be a reliable
 110 # source of this data. Due to the complexity of the data, we'll store
 111 # the last country so we know when to stop.
 112 #
 113 # We have to force-override some codes.
 114 #
 115
 116 use vars qw($country_cia_url $country_cia_file $country_cia_last);
 117 use vars qw(%country_cia_ignore %country_cia_orig %country_cia_codes);
 118
 119 $country_cia_url = "https://www.cia.gov/library/publications/the-world-factbook/appendix/print_appendix-d.html";
 120 ($country_cia_file) = $country_cia_url =~ m,/([^/]*)$,;
 121 $country_cia_last = "Zimbabwe";
 122
 123 require "data.country.pl";
 124
 125 ########################################
 126
 127 # We need to create the following variables:
 128 #
 129 #  %Language{LANGUAGE_ID}           => [ LANGUAGE, LANGUAGE, ... ]
 130 #                                     A list of all valid language names that
 131 #                                     correspond to a given LANGUAGE_ID.
 132 #                                     The names are all real (i.e. correct
 133 #                                     spelling and capitalization).
 134 #  %LanguageAlias{ALIAS}           => [ LANGUAGE_ID, I ]
 135 #                                     A hash of all aliases for a language.
 136 #                                     Aliases are all lowercase.
 137 #  %Code2LanguageID{CODESET}{CODE} => [ LANGUAGE_ID, I ]
 138 #                                     In a given CODESET, CODE corresponds to
 139 #                                     the I'th entry in the list of languages.
 140 #  %LanguageID2Code{CODESET}{LANGUAGE_ID} => CODE
 141 #                                     In the given CODESET, the LANGUAGE_ID
 142 #                                     corresponds to the given CODE.
 143
 144 use vars qw( $LanguageID %Language %LanguageAlias %Code2LanguageID %LanguageID2Code );
 145 use vars qw(%language_alias);
 146
 147 $LanguageID = "0001";
 148
 149 use vars qw( $LanguageModule );
 150
 151 $LanguageModule = "Language";
 152
 153 #
 154 # We'll first read data from the official ISO 639.
 155 #
 156 # Data available consists of the language names and 2-letter and
 157 # 3-letter codes. Language names include non-ASCII characters encoded in
 158 # UTF-8. And (amazingly enough) it's available in a field delimited file!!!
 159 #
 160
 161 use vars qw($language_iso_url $language_iso_file);
 162 use vars qw(%language_iso_orig);
 163
 164 $language_iso_url    = "http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt";
 165 ($language_iso_file) = $language_iso_url =~ m,/([^/]*)$,;
 166
 167 require "data.language.pl";
 168
 169 ########################################
 170
 171 # We need to create the following variables:
 172 #
 173 #  %Currency{CURRENCY_ID}           => [ CURRENCY, CURRENCY, ... ]
 174 #                                      A list of all valid currency names that
 175 #                                      correspond to a given CURRENCY_ID.
 176 #                                      The names are all real (i.e. correct
 177 #                                      spelling and capitalization).
 178 #  %CurrencyAlias{ALIAS}            => [ CURRENCY_ID, I ]
 179 #                                      A hash of all aliases for a currency.
 180 #                                      Aliases are all lowercase.
 181 #  %Code2CurrencyID{CODESET}{CODE}  => [ CURRENCY_ID, I ]
 182 #                                      In a given CODESET, CODE corresponds to
 183 #                                      the I'th entry in the list of currencies.
 184 #  %CurrencyID2Code{CODESET}{CURRENCY_ID} => CODE
 185 #                                      In the given CODESET, the CURRENCY_ID
 186 #                                      corresponds to the given CODE.
 187
 188 use vars qw( $CurrencyID %Currency %CurrencyAlias %Code2CurrencyID %CurrencyID2Code );
 189 use vars qw(%currency_alias);
 190
 191 $CurrencyID = "0001";
 192
 193 use vars qw( $CurrencyModule );
 194
 195 $CurrencyModule = "Currency";
 196
 197 #
 198 # We'll first read data from the official ISO 4217.
 199 #
 200
 201 use vars qw($currency_iso_url $currency_iso_file $currency_iso_last);
 202 use vars qw(%currency_iso_orig %currency_iso_ignore);
 203
 204 $currency_iso_url    = "http://www.iso.org/iso/support/currency_codes_list-1.htm";
 205 ($currency_iso_file) = $currency_iso_url =~ m,/([^/]*)$,;
 206 $currency_iso_last   = "XXX";
 207
 208 require "data.currency.pl";
 209
 210 ########################################
 211
 212 # We need to create the following variables:
 213 #
 214 #  %Script{SCRIPT_ID}              => [ SCRIPT, SCRIPT, ... ]
 215 #                                     A list of all valid script names that
 216 #                                     correspond to a given SCRIPT_ID.
 217 #                                     The names are all real (i.e. correct
 218 #                                     spelling and capitalization).
 219 #  %ScriptAlias{ALIAS}             => [ SCRIPT_ID, I ]
 220 #                                     A hash of all aliases for a script.
 221 #                                     Aliases are all lowercase.
 222 #  %Code2ScriptID{CODESET}{CODE}   => [ SCRIPT_ID, I ]
 223 #                                     In a given CODESET, CODE corresponds to
 224 #                                     the I'th entry in the list of scripts.
 225 #  %ScriptID2Code{CODESET}{SCRIPT_ID} => CODE
 226 #                                     In the given CODESET, the SCRIPT_ID
 227 #                                     corresponds to the given CODE.
 228
 229 use vars qw( $ScriptID %Script %ScriptAlias %Code2ScriptID %ScriptID2Code );
 230 use vars qw(%script_alias);
 231
 232 $ScriptID = "0001";
 233
 234 use vars qw( $ScriptModule );
 235
 236 $ScriptModule = "Script";
 237
 238 #
 239 # We'll first read data from the official ISO 15924.
 240 #
 241 # Data available consists of the script names and 2-letter and
 242 # 3-letter codes. Script names include non-ASCII characters encoded in
 243 # UTF-8. And (amazingly enough) it's available in a field delimited file!!!
 244 #
 245
 246 use vars qw($script_iso_url $script_iso_file $script_iso_tmp);
 247 use vars qw(%script_iso_orig %script_iso_ignore);
 248
 249 $script_iso_url    = "http://www.unicode.org/iso15924/iso15924.txt.zip";
 250 ($script_iso_file) = $script_iso_url =~ m,/([^/]*)$,;
 251 $script_iso_tmp    = "iso15924.txt";
 252
 253 require "data.script.pl";
 254
 255 ###############################################################################
 256 # HELP
 257 ###############################################################################
 258
 259 use vars qw($usage);
 260 my $COM = $0;
 261 $COM =~ s/^.*\///;
 262
 263 $usage=
 264   "usage: $COM OPTIONS
 265       -h/--help       : Print help.
 266
 267       -a/--all        : Do all steps
 268
 269       -c/--country    : Get the country codes
 270       -l/--language   : Get the language codes
 271       -r/--currency   : Get the currency codes
 272       -s/--script     : Get the script codes
 273       -C/--clean      : Clean up all temporary files
 274 ";
 275
 276 ###############################################################################
 277 # PARSE ARGUMENTS
 278 ###############################################################################
 279
 280 my $do_all      = 0;
 281 my $do_country  = 0;
 282 my $do_language = 0;
 283 my $do_currency = 0;
 284 my $do_script   = 0;
 285 my $do_clean    = 0;
 286
 287 while ($_ = shift) {
 288
 289    (print $usage),   exit  if ($_ eq "-h"   ||  $_ eq "--help");
 290
 291    $do_all = 1,      next  if ($_ eq "-a"   ||  $_ eq "--all");
 292
 293    $do_country = 1,  next  if ($_ eq "-c"   ||  $_ eq "--country");
 294    $do_language = 1, next  if ($_ eq "-l"   ||  $_ eq "--language");
 295    $do_currency = 1, next  if ($_ eq "-r"   ||  $_ eq "--currency");
 296    $do_script = 1,   next  if ($_ eq "-s"   ||  $_ eq "--script");
 297    $do_clean = 1,    next  if ($_ eq "-C"   ||  $_ eq "--clean");
 298 }
 299
 300 ############################################################################
 301 # MAIN PROGRAM
 302 ############################################################################
 303
 304 do_country()    if ($do_all  ||  $do_country);
 305 do_language()   if ($do_all  ||  $do_language);
 306 do_currency()   if ($do_all  ||  $do_currency);
 307 do_script()     if ($do_all  ||  $do_script);
 308 do_clean()      if ($do_all  ||  $do_clean);
 309
 310 ############################################################################
 311 # DO_COUNTRY
 312 ############################################################################
 313
 314 sub do_country {
 315    print "Country codes...\n";
 316
 317    do_country_iso();
 318    print_table("country")  if ($DEBUG == 2);
 319
 320    do_country_un();
 321    print_table("country")  if ($DEBUG == 2);
 322
 323    do_country_nga();
 324    print_table("country")  if ($DEBUG == 2);
 325
 326    do_country_iana();
 327    print_table("country")  if ($DEBUG == 2);
 328
 329    do_country_cia();
 330    print_table("country")  if ($DEBUG == 2);
 331
 332    # Go through all aliases to pick up any that haven't already been
 333    # added (since some aliases are for human convenience rather than
 334    # dealing with variations between codesets).
 335    do_aliases("country");
 336    print_table("country")  if ($DEBUG);
 337
 338    write_module("country");
 339 }
 340
 341 ########################################
 342 sub do_country_iso {
 343
 344    ###
 345    ### The first set we'll do is the ISO 3166-1 2-character
 346    ### codes. These country names must be adjusted (since they're all
 347    ### uppercase). Also, the lines all end with some strange
 348    ### unprintable character.
 349    ###
 350
 351    my $codeset = "alpha2";
 352
 353    system("wget -N -q $country_iso_url");
 354    my @in = `cat $country_iso_file`;
 355    chomp(@in);
 356    chop(@in);
 357
 358    # File is a line of text followed by a blank line followed by the
 359    # codes as ORIGNAME;CODE .
 360
 361    if ($in[1]  ||  $in[2] !~ /^$country_iso_1st;/) {
 362       die "ERROR [iso]: country code file format changed!\n";
 363    }
 364
 365    shift(@in);
 366    shift(@in);
 367
 368    foreach my $line (@in) {
 369       if ($line !~ /^(.*);(.*)$/) {
 370          die "ERROR [iso]: line invalid\n" .
 371              "             $line\n";
 372       }
 373       my($country,$code) = ($1,$2);
 374       $code = lc($code);
 375       if (exists $country_iso_orig{$country}) {
 376          $country = $country_iso_orig{$country};
 377       } else {
 378          print "WARNING [iso]: unknown country: $country\n";
 379          next;
 380       }
 381
 382       my $countryID = $CountryID++;
 383
 384       $Country{$countryID}                  = [ $country ];
 385       $CountryAlias{lc($country)}           = [ $countryID, 0 ];
 386       $Code2CountryID{$codeset}{$code}      = [ $countryID, 0 ];
 387       $CountryID2Code{$codeset}{$countryID} = $code;
 388    }
 389 }
 390
 391 ########################################
 392 sub do_country_un {
 393
 394    print "\nINFO [un]: expect Channel Islands\n";
 395
 396    ###
 397    ### The UN data contains most of the alpha-3 and numeric code sets.
 398    ###
 399
 400    system("wget -N -q $country_un_url");
 401    my @in = `cat $country_un_file`;
 402    chomp(@in);
 403    my $in = join("",@in);
 404
 405    # Clean up some things that could cause problems in parsing:
 406
 407    strip_tags(\$in,1,"br","p","strong","div");
 408    $in =~ s,\&nbsp;, ,g;
 409    $in =~ s,\s+, ,g;
 410
 411    # Look for a table who's first row has the header:
 412    #    ISO ALPHA-3 code
 413
 414    my $found = jump_to_row(\$in,"ISO ALPHA-3 code");
 415    if (! $found) {
 416       die "ERROR [un]: country code file format changed!\n";
 417    }
 418
 419    while (1) {
 420       my @row = get_row("un",\$in);
 421       last  if (! @row);
 422
 423       my($num,$country,$alpha3) = @row;
 424       $alpha3 = lc($alpha3);
 425
 426       if ($num) {
 427          if ($num !~ /^\d+$/  ||  length($num) > 3) {
 428            print "WARNING [un]: Invalid numeric code: $country => $num\n";
 429            next;
 430         }
 431       }
 432
 433       if ($alpha3  &&  $alpha3 !~ /^[a-z][a-z][a-z]$/) {
 434          print "WARNING [un]: Invalid alpha-3 code: $country => $alpha3\n";
 435          next;
 436       }
 437
 438       if (exists $country_un_orig{$country}) {
 439          $country = $country_un_orig{$country};
 440       }
 441
 442       my($err,$countryID,$i,$t) = get_countryID("un",$country);
 443       next  if ($err);
 444
 445       if ($t eq "new") {
 446          print "INFO [un]: new country: $alpha3/$num\t$country\n";
 447       }
 448
 449       if ($num) {
 450          $num = "0$num"  while (length($num) < 3);
 451          $Code2CountryID{"num"}{$num} = [ $countryID, $i ];
 452          $CountryID2Code{"num"}{$countryID} = $num;
 453       }
 454
 455       if ($alpha3) {
 456          $Code2CountryID{"alpha3"}{$alpha3} = [ $countryID, $i ];
 457          $CountryID2Code{"alpha3"}{$countryID} = $alpha3;
 458       }
 459    }
 460 }
 461
 462 ########################################
 463 sub do_country_nga {
 464
 465    print "\nINFO [nga]: expect Serbia, Ashmore, Baker, Bassas, Clipperton,\n" .
 466            "            Coral, Europa, Gaza, Glorioso, Howland, Jan Mayen,\n" .
 467            "            Jarvis, Johnston, Juan, Kingman, Midway, Navassa, Palmyra,\n" .
 468            "            Paracel, Spratly, Svalbard, Tromelin, Wake, West\n";
 469
 470    ###
 471    ### The NGA data contains the FIPS 10 codes
 472    ###
 473
 474    system("wget -N -q $country_nga_url");
 475    my @in = `cat $country_nga_file`;
 476    chomp(@in);
 477    my $in = join("",@in);
 478
 479    # Clean up some things that could cause problems in parsing:
 480
 481    strip_tags(\$in,1,"br","p","strong","div");
 482    $in =~ s,\Q[United States}\E,,;       # A horrible typo in the HTML
 483    $in =~ s,\&nbsp;, ,g;
 484    $in =~ s,Other:, ,g;
 485    $in =~ s,\(see note[^\)]*\), ,g;
 486    $in =~ s,\[[^\]]*\], ,g;
 487    $in =~ s,\s+, ,g;
 488
 489    # Look for a table who's first row has the header:
 490    #    SHORT FORM NAME
 491    # and then a table with the header:
 492    #    Short Form Name
 493
 494    foreach my $table ("SHORT FORM NAME","Short Form Name") {
 495       my $found = jump_to_row(\$in,$table);
 496       if (! $found) {
 497          die "ERROR [nga]: country code file format changed!\n";
 498       }
 499
 500       while (1) {
 501          my @row = get_row("nga",\$in);
 502          last  if (! @row);
 503
 504          my($short,$long,$code) = @row;
 505          $code  = uc($code);
 506          next  if ($code eq "N/A");
 507          next  if (exists $country_nga_ignore{$short});
 508
 509          if ($code !~ /^[A-Z][A-Z]$/) {
 510             print "WARNING [nga]: Invalid code: $short => $code\n";
 511          }
 512
 513          if ($short eq "None") {
 514             print "ERROR [nga]: no short definition: $code  $long\n";
 515             next;
 516          }
 517          $long  = ""  if ($long  eq "None");
 518
 519          if (exists $country_nga_orig{$short}) {
 520             $short = $country_nga_orig{$short};
 521          }
 522          if (exists $country_nga_orig{$long}) {
 523             $long = $country_nga_orig{$long};
 524          }
 525
 526          my($countryID,$i);
 527          my($err,$c,$ii,$t) = get_countryID("nga",$short,1);
 528          next  if ($err);
 529
 530          if ($t ne "new") {
 531             ($countryID,$i) = ($c,$ii);
 532          } else {
 533             ($err,$c,$ii,$t) = get_countryID("nga",$long,1);
 534             ($countryID,$i) = ($c,-1);
 535             if ($t eq "new") {
 536                print "INFO [nga]: new country: $code\t$short\n";
 537             }
 538          }
 539
 540          if ($countryID == -1) {
 541             # New country
 542             $countryID = $CountryID++;
 543             $i = 0;
 544             $Country{$countryID} = [ $short ];
 545             $CountryAlias{lc($short)} = [ $countryID, $i ];
 546
 547          } elsif ($i == -1) {
 548             # $short is a new alias
 549             push @{ $Country{$countryID} },$short;
 550             $i = $#{ $Country{$countryID} };
 551             $CountryAlias{lc($short)} = [ $countryID, $i ];
 552          }
 553
 554          if ($long  &&
 555              ! exists $CountryAlias{lc($long)}) {
 556             # $long is a new alias
 557             push @{ $Country{$countryID} },$long;
 558             $i = $#{ $Country{$countryID} };
 559             $CountryAlias{lc($long)} = [ $countryID, $i ];
 560          }
 561
 562          $Code2CountryID{"fips"}{$code} = [ $countryID, $i ];
 563          $CountryID2Code{"fips"}{$countryID} = $code;
 564       }
 565    }
 566 }
 567
 568 ########################################
 569 sub do_country_iana {
 570
 571    print "\nINFO [iana]: expect Acension, European, Soviet, Portuguese\n";
 572
 573    ###
 574    ### The IANA data contains the domain names
 575    ###
 576
 577    system("wget -N -q $country_iana_url");
 578    my @in = `cat $country_iana_file`;
 579    chomp(@in);
 580    my $in = join("",@in);
 581
 582    # Clean up some things that could cause problems in parsing:
 583
 584    strip_tags(\$in,1,"a");
 585    $in =~ s,\&nbsp;, ,g;
 586    $in =~ s,\(being phased out\), ,g;
 587    $in =~ s,\s+, ,g;
 588
 589    # Look for a table who's first row has the header:
 590    #    Sponsoring Organisation
 591
 592    my $found = jump_to_row(\$in,"Sponsoring Organisation");
 593    if (! $found) {
 594       die "ERROR [iana]: country code file format changed!\n";
 595    }
 596
 597    while (1) {
 598       my @row = get_row("iana",\$in);
 599       last  if (! @row);
 600
 601       my($dom,$type,$country) = @row;
 602       next  unless ($type eq "country-code");
 603       $dom     =~ s/^\.//;
 604       $country =~ s,<br.*,,;
 605
 606       if (exists $country_un_orig{$country}) {
 607          $country = $country_un_orig{$country};
 608       }
 609
 610       my($err,$countryID,$i,$t) = get_countryID("iana",$country);
 611       next  if ($err);
 612
 613       if ($t eq "new") {
 614          print "INFO [iana]: new country: $dom\t$country\n";
 615       }
 616
 617       $Code2CountryID{"dom"}{$dom} = [ $countryID, $i ];
 618       $CountryID2Code{"dom"}{$countryID} = $dom;
 619    }
 620 }
 621
 622 ########################################
 623 sub do_country_cia {
 624
 625    print "\nINFO [cia]: expect Antarctica, Bouvet, British, Christmas, Cocos,\n" .
 626          "            France (Metro), French, Heard, Kosovo, Montenegro,\n" .
 627          "            Saint Barth, Saint Martin, Serbia, Taiwan, US\n";
 628
 629    ###
 630    ### The CIA data is used to check:
 631    ###    alpha-2, alpha-3, numeric, fips 10
 632    ###
 633
 634    system("wget -N -q $country_cia_url");
 635    my @in = `cat $country_cia_file`;
 636    chomp(@in);
 637    my $in = join("",@in);
 638    $in =~ s/\015/ /sg;
 639
 640    # Clean up some things that could cause problems in parsing:
 641
 642    strip_tags(\$in,1,"br","p","strong","div","a","b","img");
 643    $in =~ s,\&nbsp;, ,g;
 644    $in =~ s,\s+, ,g;
 645
 646    # Move to a table with "Entity" as one of the entries. This
 647    # table is followed by the entires.
 648
 649    my $found = jump_to_entry(\$in,"Entity");
 650    if (! $found) {
 651       die "ERROR [cia]: country code file format changed!\n";
 652    }
 653
 654    # Each entry is quite complicated. Each is a single table (with a
 655    # table nested in it) of the form:
 656    #
 657    #   <table>
 658    #      <tr>
 659    #         <td>COUNTRY</td>
 660    #         <td>FIPS</td>
 661    #         <td>
 662    #            <table>
 663    #               <tr>
 664    #                  <td>ALPHA-2</td>
 665    #                  <td>ALPHA-3</td>
 666    #                  <td>NUMERIC</td>
 667    #            </table>
 668    #         </td>
 669    #         ...
 670    #     </tr>
 671    #     ...
 672    #  </table>
 673    #
 674    # After all of the "A" entries, a special table containing the headers
 675    # is given again.
 676
 677    while (1) {
 678
 679       #
 680       # Read the next entry
 681       #
 682
 683       my($country,$fips,$alpha2,$alpha3,$num,$stanag,$dom);
 684
 685       my $found   = jump_to_table(\$in);
 686       if (! $found) {
 687          print "ERROR [cia]: malformed file\n";
 688          last;
 689       }
 690       $country = get_entry(\$in);
 691       if (! $country) {
 692          # The first entry is empty if it's at the end of the entries
 693          # for a given letter.
 694          $found = jump_to_entry(\$in,"Entity");
 695          if (! $found) {
 696             die "ERROR [cia]: country code file format changed!\n";
 697          }
 698          next;
 699       }
 700
 701       if (exists $country_cia_orig{$country}) {
 702          $country = $country_cia_orig{$country};
 703       }
 704
 705       $fips    = uc(get_entry(\$in));
 706       $found   = jump_to_table(\$in);
 707       if (! $found) {
 708          print "ERROR [cia]: malformed file\n";
 709          last;
 710       }
 711       $alpha2  = lc(get_entry(\$in));
 712       $alpha3  = lc(get_entry(\$in));
 713       $num     = get_entry(\$in);
 714       $stanag  = get_entry(\$in);
 715       $dom     = uc(get_entry(\$in));
 716       $dom     =~ s/^\.//;
 717
 718       next  if (exists $country_cia_ignore{$country});
 719
 720       $alpha2 = $country_cia_codes{"alpha2"}{$country}
 721         if (exists $country_cia_codes{"alpha2"}{$country});
 722       $alpha3 = $country_cia_codes{"alpha3"}{$country}
 723         if (exists $country_cia_codes{"alpha3"}{$country});
 724       $num = $country_cia_codes{"num"}{$country}
 725         if (exists $country_cia_codes{"num"}{$country});
 726       $fips = $country_cia_codes{"fips"}{$country}
 727         if (exists $country_cia_codes{"fips"}{$country});
 728       $dom  = $country_cia_codes{"dom"}{$country}
 729         if (exists $country_cia_codes{"dom"}{$country});
 730
 731       #
 732       # Get the countryID if the country or ANY of the codes match.
 733       #
 734
 735       my($err,$countryID,$i,$type) = get_countryID("cia",$country,1);
 736       next  if ($err);
 737
 738       if ($countryID == -1  &&  $alpha2 ne "-") {
 739          if (exists $Code2CountryID{"alpha2"}{$alpha2}) {
 740             ($countryID,$i) = @{ $Code2CountryID{"alpha2"}{$alpha2} };
 741          }
 742       }
 743
 744       if ($countryID == -1  &&  $alpha3 ne "-") {
 745          if (exists $Code2CountryID{"alpha3"}{$alpha3}) {
 746             ($countryID,$i) = @{ $Code2CountryID{"alpha3"}{$alpha3} };
 747          }
 748       }
 749
 750       if ($countryID == -1  &&  $num ne "-") {
 751          if (exists $Code2CountryID{"num"}{$num}) {
 752             ($countryID,$i) = @{ $Code2CountryID{"num"}{$num} };
 753          }
 754       }
 755
 756       if ($countryID == -1  &&  $fips ne "-") {
 757          if (exists $Code2CountryID{"fips"}{$fips}) {
 758             ($countryID,$i) = @{ $Code2CountryID{"fips"}{$fips} };
 759          }
 760       }
 761
 762       if ($countryID == -1  &&  $dom ne "-") {
 763          if (exists $Code2CountryID{"dom"}{$dom}) {
 764             ($countryID,$i) = @{ $Code2CountryID{"dom"}{$dom} };
 765          }
 766       }
 767
 768       if ($countryID == -1) {
 769          $countryID = $CountryID++;
 770          $i = 0;
 771          $Country{$countryID} = [ $country ];
 772          $CountryAlias{lc($country)} = [ $countryID, $i ];
 773       }
 774
 775       #
 776       # Now check that any previously defined values match the
 777       # CIA data.
 778       #
 779
 780       if ($alpha2 ne "-") {
 781          my $err = check_code("cia","alpha2",$alpha2,$country,$countryID);
 782          next  if ($err);
 783       }
 784
 785       if ($alpha3 ne "-") {
 786          my $err = check_code("cia","alpha3",$alpha3,$country,$countryID);
 787          next  if ($err);
 788       }
 789
 790       if ($num ne "-") {
 791          my $err = check_code("cia","num",$num,$country,$countryID);
 792          next  if ($err);
 793       }
 794
 795       if ($fips ne "-") {
 796          my $err = check_code("cia","fips",$fips,$country,$countryID);
 797          next  if ($err);
 798       }
 799
 800       if ($dom ne "-") {
 801          my $err = check_code("cia","dom",$dom,$country,$countryID);
 802          next  if ($err);
 803       }
 804
 805       last  if ($country eq $country_cia_last);
 806    }
 807 }
 808
 809 ########################################
 810 sub check_code {
 811    my($type,$codeset,$code,$country,$countryID) = @_;
 812
 813    # Check to make sure that the code is defined.
 814
 815    if (exists $Code2CountryID{$codeset}{$code}) {
 816       return _check_code_exists($type,$codeset,$code,$country,$countryID);
 817    } else {
 818       return _check_code_new($type,$codeset,$code,$country,$countryID);
 819    }
 820 }
 821
 822 sub _check_code_exists {
 823    my($type,$codeset,$code,$country,$countryID) = @_;
 824
 825    # Check the countryID for the code. It must be the same as the one
 826    # passed in.
 827
 828    my $old_countryID = $Code2CountryID{$codeset}{$code}[0];
 829    if ($countryID != $old_countryID) {
 830       print "ERROR [$type]: countryID mismatch in code: [$codeset, $country, $code, $countryID != $old_countryID ]\n";
 831       return 1;
 832    }
 833
 834    # If the country is defined, it must be the same CountryID. If it
 835    # is not, create a new alias.
 836
 837    if (exists $CountryAlias{lc($country)}) {
 838
 839       my $alt_countryID = $CountryAlias{lc($country)}[0];
 840
 841       if ($countryID != $alt_countryID) {
 842          print "ERROR [$type]: countryID mismatch in country: [$codeset, $country, $code, $countryID != $alt_countryID ]\n";
 843          return 1;
 844       }
 845
 846    } else {
 847       push @{ $Country{$countryID} },$country;
 848       my $i = $#{ $Country{$countryID} };
 849       $CountryAlias{lc($country)} = [ $countryID, $i ];
 850    }
 851 }
 852
 853 # This is a new code.
 854 sub _check_code_new {
 855    my($type,$codeset,$code,$country,$countryID) = @_;
 856
 857    print "INFO [$type]: New code: $codeset [$code] => $country\n";
 858
 859    # If this country name isn't defined, create it.
 860
 861    my $i;
 862    if (exists $CountryAlias{lc($country)}) {
 863       $i = $CountryAlias{lc($country)}[1];
 864    } else {
 865       push @{ $Country{$countryID} },$country;
 866       $i = $#{ $Country{$countryID} };
 867       $CountryAlias{lc($country)} = [ $countryID, $i ];
 868    }
 869
 870    # This country name is the canonical name for the code.
 871
 872    $CountryID2Code{$codeset}{$countryID} = $code;
 873    $Code2CountryID{$codeset}{$code} = [ $countryID, $i ];
 874
 875    return 0;
 876 }
 877
 878 ########################################
 879 sub get_countryID {
 880    my($type,$country,$no_create) = @_;
 881
 882    my($countryID,$i,$t);
 883    if (exists $CountryAlias{lc($country)}) {
 884       # The country is the same name as one previously defined
 885       ($countryID,$i) = @{ $CountryAlias{lc($country)} };
 886       $t = "same";
 887
 888    } elsif (exists $country_alias{$country}) {
 889       # It's a new alias for an existing country
 890       my $c = $country_alias{$country};
 891       if (! exists $CountryAlias{lc($c)}) {
 892          print "WARNING [$type]: alias referenced before it is defined: $country => $c\n";
 893          return (1);
 894       }
 895       $countryID = $CountryAlias{lc($c)}[0];
 896       push @{ $Country{$countryID} },$country;
 897       $i = $#{ $Country{$countryID} };
 898       $CountryAlias{lc($country)} = [ $countryID, $i ];
 899       $t = "alias";
 900
 901    } else {
 902       # It's a new country.
 903       if ($no_create) {
 904          return(0,-1,-1,"new");
 905       }
 906       $countryID = $CountryID++;
 907       $i         = 0;
 908       $Country{$countryID} = [ $country ];
 909       $CountryAlias{lc($country)} = [ $countryID, $i ];
 910       $t = "new";
 911    }
 912
 913    return(0,$countryID,$i,$t);
 914 }
 915
 916 ############################################################################
 917 # DO_LANGUAGE
 918 ############################################################################
 919
 920 sub do_language {
 921    print "Language codes...\n";
 922
 923    do_language_iso();
 924    print_table("language")  if ($DEBUG == 2);
 925
 926    # Go through all aliases to pick up any that haven't already been
 927    # added (since some aliases are for human convenience rather than
 928    # dealing with variations between codesets).
 929    do_aliases("language");
 930    print_table("language")  if ($DEBUG);
 931
 932    write_module("language");
 933 }
 934
 935 ########################################
 936 sub do_language_iso {
 937    ###
 938    ### The first set we'll do is the ISO codes.
 939    ###
 940
 941    system("wget -N -q $language_iso_url");
 942    open(my $in,'<:encoding(utf8)',$language_iso_file);
 943    my @in = <$in>;
 944    close($in);
 945    chomp(@in);
 946
 947    # File is a set of lines of fields delimited by "|". Fields are:
 948    #
 949    #    alpha3
 950    #    term
 951    #    alpha2
 952    #    English names (semicolon separated list)
 953    #    French name
 954
 955    foreach my $line (@in) {
 956       my($alpha3,$term,$alpha2,$english,$french) = split(/\|/,$line);
 957       # The first line has some binary characters at the start.
 958       if (length($alpha3)>3) {
 959          $alpha3 = substr($alpha3,length($alpha3)-3);
 960       }
 961
 962       if (exists $language_iso_orig{$english}) {
 963          $english = $language_iso_orig{$english};
 964       }
 965       my $languageID = $LanguageID++;
 966       my @language = split(/\s*;\s*/,$english);
 967
 968       $Language{$languageID}                  = [ @language ];
 969       for (my $i=0; $i<=$#language; $i++) {
 970          my $language = $language[$i];
 971          $LanguageAlias{lc($language)}         = [ $languageID, $i ];
 972       }
 973
 974       if ($alpha3) {
 975          $Code2LanguageID{"alpha3"}{$alpha3}     = [ $languageID, 0 ];
 976          $LanguageID2Code{"alpha3"}{$languageID} = $alpha3;
 977       }
 978
 979       if ($term) {
 980          $Code2LanguageID{"term"}{$term}         = [ $languageID, 0 ];
 981          $LanguageID2Code{"term"}{$languageID}   = $term;
 982       }
 983
 984       if ($alpha2) {
 985          $Code2LanguageID{"alpha2"}{$alpha2}     = [ $languageID, 0 ];
 986          $LanguageID2Code{"alpha2"}{$languageID} = $alpha2;
 987       }
 988    }
 989 }
 990
 991 ############################################################################
 992 # DO_CURRENCY
 993 ############################################################################
 994
 995 sub do_currency {
 996    print "Currency codes...\n";
 997
 998    do_currency_iso();
 999    print_table("currency")  if ($DEBUG == 2);
1000
1001    # Go through all aliases to pick up any that haven't already been
1002    # added (since some aliases are for human convenience rather than
1003    # dealing with variations between codesets).
1004    do_aliases("currency");
1005    print_table("currency")  if ($DEBUG);
1006
1007    write_module("currency");
1008 }
1009
1010 ########################################
1011 sub do_currency_iso {
1012    ###
1013    ### The first set we'll do is the ISO 4217 codes.
1014    ###
1015
1016    system("wget -N -q $currency_iso_url");
1017    my @in = `cat $currency_iso_file`;
1018    chomp(@in);
1019    my $in = join("",@in);
1020
1021    # Clean up some things that could cause problems in parsing:
1022
1023    strip_tags(\$in,1,"p","a","strong","div");
1024    $in =~ s,\&nbsp;, ,g;
1025    $in =~ s,†,,g;
1026    $in =~ s,‡,,g;
1027    $in =~ s,\s+, ,g;
1028
1029    # Look for a table who's first row has the header:
1030    #    Entity
1031
1032    my $found = jump_to_row(\$in,"Entity");
1033    if (! $found) {
1034       die "ERROR [iso]: currency code file format changed!\n";
1035    }
1036
1037    LINE: while (1) {
1038       my @row = get_row("iso",\$in);
1039       last  if (! @row);
1040
1041       my($ent,$currencies,$alphas,$nums) = @row;
1042       $nums     = ""  if ($nums eq "Nil");
1043       next  if (! $alphas  &&  ! $nums);
1044
1045       my(@currency,@num,@alpha,$done);
1046       @currency = split(/<br\s*\/><br\s*\/>/,$currencies);
1047       @num      = split(/<br\s*\/><br\s*\/>/,$nums);
1048       @alpha    = split(/<br\s*\/><br\s*\/>/,$alphas);
1049
1050       if ( ($nums    &&  $#num   != $#currency)  ||
1051            ($alphas  &&  $#alpha != $#currency) ) {
1052          print "WARNING [iso]: Invalid line: $currencies => $alphas, $nums\n";
1053          next;
1054       }
1055
1056       for (my $i=0; $i<=$#currency; $i++) {
1057          my $currency = $currency[$i];
1058          my $num      = (@num   ? $num[$i]   : "");
1059          my $alpha    = (@alpha ? $alpha[$i] : "");
1060          $done        = 1  if ($alpha eq $currency_iso_last);
1061
1062          if (exists $currency_iso_orig{$currency}) {
1063             $currency = $currency_iso_orig{$currency};
1064          }
1065
1066          if ($num) {
1067             if ($num !~ /^\d+$/  ||  length($num) > 3) {
1068                print "WARNING [iso]: Invalid numeric code: $currency => $num\n";
1069                next LINE;
1070             }
1071          }
1072
1073          $alpha = uc($alpha);
1074          if ($alpha  &&  $alpha !~ /^[A-Z][A-Z][A-Z]$/) {
1075             print "WARNING [iso]: Invalid alpha code: $currency => $alpha\n";
1076             next LINE;
1077          }
1078
1079          next  if (exists $currency_iso_ignore{$alpha});
1080
1081          # There's a lot of duplication, so make sure that if this code
1082          # exists, it is consistant.
1083
1084          my $new = 1;
1085          my @currencyID;
1086          if (exists $Code2CurrencyID{"num"}{$num}) {
1087             $new = 0;
1088             my($currencyID,$i) = @{ $Code2CurrencyID{"num"}{$num} };
1089             push(@currencyID,$currencyID);
1090          }
1091          if (exists $Code2CurrencyID{"alpha"}{$alpha}) {
1092             $new = 0;
1093             my($currencyID,$i) = @{ $Code2CurrencyID{"alpha"}{$alpha} };
1094             push(@currencyID,$currencyID);
1095          }
1096
1097          if ($#currencyID == 1) {
1098             if ($currencyID[0] != $currencyID[1]) {
1099                print "WARNING [iso]: CurrencyID mismatch: $currency => $alpha,$num\n";
1100                next LINE;
1101             }
1102          }
1103
1104          my $currencyID;
1105          if ($new) {
1106             $currencyID = $CurrencyID++;
1107             $Currency{$currencyID} = [ $currency ];
1108             $CurrencyAlias{lc($currency)} = [ $currencyID, 0 ]
1109          } else {
1110             $currencyID = $currencyID[0];
1111          }
1112
1113          if ($num) {
1114             $num = "0$num"  while (length($num) < 3);
1115             $Code2CurrencyID{"num"}{$num} = [ $currencyID, 0 ];
1116             $CurrencyID2Code{"num"}{$currencyID} = $num;
1117          }
1118
1119          if ($alpha) {
1120             $Code2CurrencyID{"alpha"}{$alpha} = [ $currencyID, 0 ];
1121             $CurrencyID2Code{"alpha"}{$currencyID} = $alpha;
1122          }
1123
1124       }
1125
1126       last  if ($done);
1127    }
1128 }
1129
1130 ############################################################################
1131 # DO_SCRIPT
1132 ############################################################################
1133
1134 sub do_script {
1135    print "Script codes...\n";
1136
1137    do_script_iso();
1138    print_table("script")  if ($DEBUG == 2);
1139
1140    # Go through all aliases to pick up any that haven't already been
1141    # added (since some aliases are for human convenience rather than
1142    # dealing with variations between codesets).
1143    do_aliases("script");
1144    print_table("script")  if ($DEBUG);
1145
1146    write_module("script");
1147 }
1148
1149 ########################################
1150 sub do_script_iso {
1151    ###
1152    ### The first set we'll do is the ISO 15924 codes. We
1153    ### can get a zip file which contains the data.
1154    ###
1155
1156    system("wget -N -q $script_iso_url");
1157    my $zip = Archive::Zip->new($script_iso_file);
1158
1159    my @members = grep(/^iso15924.*\.txt/,$zip->memberNames());
1160    if (@members != 1) {
1161       die "ERROR [iso]: zip file changed format\n";
1162    }
1163    my($file) = @members;
1164    $zip->extractMember($file,$script_iso_tmp);
1165
1166    #
1167    # The zip file contains a series of lines in the form:
1168    #   alpha;numeric;english;...
1169    # The data is in UTF-8.
1170    #
1171    # Every line has an unprintable character at the end.
1172    #
1173
1174    open(my $in,'<:encoding(utf8)',$script_iso_tmp);
1175    my @in = <$in>;
1176    close($in);
1177    chomp(@in);
1178    chop(@in);
1179
1180    foreach my $line (@in) {
1181       next  if (! $line  ||  $line =~ /^\043/);
1182       my($alpha,$num,$script) = split(/;/,$line);
1183       $alpha = ucfirst(lc($alpha));
1184       next  if (exists $script_iso_ignore{$alpha});
1185
1186       if (exists $script_iso_orig{$script}) {
1187          $script = $script_iso_orig{$script};
1188       }
1189       my $scriptID = $ScriptID++;
1190
1191       $Script{$scriptID}                 = [ $script ];
1192       $ScriptAlias{lc($script)}          = [ $scriptID, 0 ];
1193
1194       $Code2ScriptID{"alpha"}{$alpha}    = [ $scriptID, 0 ];
1195       $ScriptID2Code{"alpha"}{$scriptID} = $alpha;
1196
1197       $Code2ScriptID{"num"}{$num}        = [ $scriptID, 0 ];
1198       $ScriptID2Code{"num"}{$scriptID}   = $num;
1199    }
1200 }
1201
1202 ############################################################################
1203 # PRINT_TABLE
1204 ############################################################################
1205
1206 sub _type_hashes {
1207    my($caller) = @_;
1208
1209    my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID);
1210
1211    if      ($caller eq "country") {
1212       $type_alias   = \%country_alias;
1213       $TypeAlias    = \%CountryAlias;
1214       $Type         = \%Country;
1215       $TypeID2Code  = \%CountryID2Code;
1216       $Code2TypeID  = \%Code2CountryID;
1217
1218    } elsif ($caller eq "language") {
1219       $type_alias   = \%language_alias;
1220       $TypeAlias    = \%LanguageAlias;
1221       $Type         = \%Language;
1222       $TypeID2Code  = \%LanguageID2Code;
1223       $Code2TypeID  = \%Code2LanguageID;
1224
1225    } elsif ($caller eq "currency") {
1226       $type_alias   = \%currency_alias;
1227       $TypeAlias    = \%CurrencyAlias;
1228       $Type         = \%Currency;
1229       $TypeID2Code  = \%CurrencyID2Code;
1230       $Code2TypeID  = \%Code2CurrencyID;
1231
1232    } elsif ($caller eq "script") {
1233       $type_alias   = \%script_alias;
1234       $TypeAlias    = \%ScriptAlias;
1235       $Type         = \%Script;
1236       $TypeID2Code  = \%ScriptID2Code;
1237       $Code2TypeID  = \%Code2ScriptID;
1238    }
1239
1240    return($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID);
1241 }
1242
1243 sub print_table {
1244    my($caller) = @_;
1245
1246    my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID)
1247      = _type_hashes($caller);
1248
1249    foreach my $typeID (sort keys %$Type) {
1250       my @type = @{ $$Type{$typeID} };
1251       my $i = 0;
1252
1253       my $type = shift(@type);
1254       if (length($type) < 40) {
1255          $type .= " "x(40-length($type));
1256       } else {
1257          $type = substr($type,0,40);
1258       }
1259
1260       print "${typeID}[$i] = $type  ";
1261       foreach my $codeset (keys %$Code2TypeID) {
1262          my $field = "";
1263          if (exists $$TypeID2Code{$codeset}{$typeID}) {
1264             my $code = $$TypeID2Code{$codeset}{$typeID};
1265             my($code_id,$code_i) = @{ $$Code2TypeID{$codeset}{$code} };
1266             $field = "$code [$code_id,$code_i]";
1267             $field .= " ERR"  if ($code_id != $typeID);
1268          }
1269          $field = $field . " "x(18-length($field));
1270          print $field;
1271       }
1272       print "\n";
1273       foreach $type (@type) {
1274          $i++;
1275          if (length($type) > 40) {
1276             $type = substr($type,0,40);
1277          }
1278          print "    [$i] = $type\n";
1279       }
1280    }
1281 }
1282
1283 ############################################################################
1284 # DO_ALIASES
1285 ############################################################################
1286
1287 sub do_aliases {
1288    my($caller) = @_;
1289
1290    my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID)
1291      = _type_hashes($caller);
1292
1293    # Add remaining aliases.
1294
1295    foreach my $alias (keys %$type_alias) {
1296       my $type = $$type_alias{$alias};
1297
1298       next  if (exists $$TypeAlias{lc($type)}  &&
1299                 exists $$TypeAlias{lc($alias)});
1300
1301       if (! exists $$TypeAlias{lc($type)}  &&
1302           ! exists $$TypeAlias{lc($alias)}) {
1303          print "WARNING: unused type in alias list: $type\n";
1304          print "WARNING: unused type in alias list: $alias\n";
1305          next;
1306       }
1307
1308       my ($typeID);
1309       if (exists $$TypeAlias{lc($type)}) {
1310          $typeID = $$TypeAlias{lc($type)}[0];
1311          $type   = $alias;
1312       } else {
1313          $typeID = $$TypeAlias{lc($alias)}[0];
1314       }
1315
1316       push @{ $$Type{$typeID} },$type;
1317       my $i = $#{ $$Type{$typeID} };
1318       $$TypeAlias{lc($type)} = [ $typeID, $i ];
1319    }
1320 }
1321
1322 ############################################################################
1323 # WRITE_MODULE
1324 ############################################################################
1325
1326 sub write_module {
1327    my($type) = @_;
1328
1329    my($module,%hashes,$id);
1330
1331    if ($type eq "country") {
1332       $module = $CountryModule;
1333       %hashes = ("id2names"  => "Country",
1334                  "alias2id"  => "CountryAlias",
1335                  "code2id"   => "Code2CountryID",
1336                  "id2code"   => "CountryID2Code");
1337       $id     = $CountryID;
1338    } elsif ($type eq "language") {
1339       $module = $LanguageModule;
1340       %hashes = ("id2names"  => "Language",
1341                  "alias2id"  => "LanguageAlias",
1342                  "code2id"   => "Code2LanguageID",
1343                  "id2code"   => "LanguageID2Code");
1344       $id     = $LanguageID;
1345    } elsif ($type eq "currency") {
1346       $module = $CurrencyModule;
1347       %hashes = ("id2names"  => "Currency",
1348                  "alias2id"  => "CurrencyAlias",
1349                  "code2id"   => "Code2CurrencyID",
1350                  "id2code"   => "CurrencyID2Code");
1351       $id     = $CurrencyID;
1352    } elsif ($type eq "script") {
1353       $module = $ScriptModule;
1354       %hashes = ("id2names"  => "Script",
1355                  "alias2id"  => "ScriptAlias",
1356                  "code2id"   => "Code2ScriptID",
1357                  "id2code"   => "ScriptID2Code");
1358       $id     = $ScriptID;
1359    }
1360
1361    my $file = "$ModDir/$module.pm";
1362    if (-f $file) {
1363       system("mv $file $file.bak");
1364    }
1365
1366    my $out = new IO::File;
1367    $out->open(">$file");
1368    my $timestamp   = `date`;
1369    chomp($timestamp);
1370
1371    my $podstr = '=pod';    # so the CPAN indexer won't treat this as a POD file
1372
1373    print $out "package Locale::Codes::$module;
1374
1375 # This file was automatically generated.  Any changes to this file will
1376 # be lost the next time 'get_codes' is run.
1377 #    Generated on: $timestamp
1378
1379 $podstr
1380
1381 =head1 NAME
1382
1383 Locale::Codes::$module - $type codes for the Locale::$module module
1384
1385 =head1 SYNOPSIS
1386
1387 This module contains data used by the Locale::$module module. It is
1388 not intended to be used directly, and contains no calleable routines.
1389
1390 =head1 AUTHOR
1391
1392 See Locale::Codes for full author history.
1393
1394 Currently maintained by Sullivan Beck (sbeck\@cpan.org).
1395
1396 =head1 COPYRIGHT
1397
1398    Copyright (c) 1997-2001 Canon Research Centre Europe (CRE).
1399    Copyright (c) 2001-2010 Neil Bowers
1400    Copyright (c) 2010-2010 Sullivan Beck
1401
1402 This module is free software; you can redistribute it and/or
1403 modify it under the same terms as Perl itself.
1404
1405 =cut
1406
1407 use strict;
1408 use warnings;
1409 require 5.002;
1410
1411 use vars qw(\$VERSION);
1412 \$VERSION='3.12';
1413
1414 \$Locale::Codes::Data{'$type'}{'id'} = '$id';
1415
1416 ";
1417
1418    foreach my $h qw(id2names alias2id code2id id2code) {
1419       my $hash = $hashes{$h};
1420       print $out "\$Locale::Codes::Data{'$type'}{'$h'} = {\n";
1421       _write_hash($out,$hash);
1422
1423       print $out "};\n\n";
1424    }
1425
1426    print $out "1;\n";
1427
1428    $out->close();
1429 }
1430
1431 sub _write_hash {
1432    my($out,$hashname) = @_;
1433
1434    no strict 'refs';
1435    my %hash = %$hashname;
1436    use strict 'refs';
1437    _write_subhash($out,3,\%hash);
1438 }
1439
1440 sub _write_subhash {
1441    my($out,$indent,$hashref) = @_;
1442
1443    my %hash = %$hashref;
1444    my $ind  = " "x$indent;
1445
1446    foreach my $key (sort keys %hash) {
1447       my $val = $hash{$key};
1448       if (ref($val) eq "HASH") {
1449          print $out "${ind}q($key) => {\n";
1450          _write_subhash($out,$indent+3,$val);
1451          print $out "${ind}   },\n";
1452       } elsif (ref($val) eq "ARRAY") {
1453          print $out "${ind}q($key) => [\n";
1454          _write_sublist($out,$indent+3,$val);
1455          print $out "${ind}   ],\n";
1456       } else {
1457          print $out "${ind}q($key) => q($val),\n";
1458       }
1459    }
1460 }
1461
1462 sub _write_sublist {
1463    my($out,$indent,$listref) = @_;
1464
1465    my @list = @$listref;
1466    my $ind  = " "x$indent;
1467
1468    foreach my $val (@list) {
1469       if (ref($val) eq "HASH") {
1470          print $out "${ind}{\n";
1471          _write_subhash($out,$indent+3,$val);
1472          print $out "${ind}},\n";
1473       } elsif (ref($val) eq "ARRAY") {
1474          print $out "${ind}[\n";
1475          _write_sublist($out,$indent+3,$val);
1476          print $out "${ind}],\n";
1477       } else {
1478          print $out "${ind}q($val),\n";
1479       }
1480    }
1481 }
1482
1483 ############################################################################
1484 # DO_CLEAN
1485 ############################################################################
1486
1487 sub do_clean {
1488    print "Cleaning...\n";
1489
1490    system("rm -f $country_iso_file");
1491    system("rm -f $country_un_file");
1492    system("rm -f $country_nga_file");
1493    system("rm -f $country_cia_file");
1494    system("rm -f $country_iana_file");
1495    system("rm -f $language_iso_file");
1496    system("rm -f $currency_iso_file");
1497    system("rm -f $script_iso_file");
1498    system("rm -f $script_iso_tmp");
1499    system("rm -rf __MACOSX");
1500 }
1501
1502 ############################################################################
1503 # HTML SCRAPING
1504 ############################################################################
1505
1506 sub get_row {
1507    my($type,$inref) = @_;
1508
1509    return ()  if ($$inref !~ m,^\s*<tr,);
1510
1511    if ($$inref !~ s,^(.*?)</tr[^>]*>,,) {
1512       die "ERROR [$type]: malformed HTML\n";
1513    }
1514    my $row = $1;
1515
1516    if ($row =~ m,<table,) {
1517       die "ERROR [$type]: embedded table\n";
1518    }
1519
1520    my @row;
1521    while ($row =~ s,(?:.*?)<(td|th)[^>]*>\s*(.*?)\s*</\1[^>]*>,,) {
1522       my $val = $2;
1523       push(@row,$val);
1524    }
1525
1526    return @row;
1527 }
1528
1529 sub jump_to_row {
1530    my($inref,$header) = @_;
1531
1532    if ($$inref =~ s,(.*?)\Q$header\E(.*?)</tr[^>]*>(.*?)(?=<tr),,) {
1533       return 1;
1534    } else {
1535       return 0;
1536    }
1537 }
1538
1539 sub jump_to_entry {
1540    my($inref,$value) = @_;
1541
1542    if ($$inref =~ s,(.*?)(?=<(?:td|th)[^>]*>\s*\Q$value\E\s*),,) {
1543       return 1;
1544    } else {
1545       return 0;
1546    }
1547 }
1548
1549 sub jump_to_table {
1550    my($inref) = @_;
1551
1552    if ($$inref =~ s,(.*?)(?=<table),,) {
1553       return 1;
1554    } else {
1555       return 0;
1556    }
1557 }
1558
1559 sub get_entry {
1560    my($inref) = @_;
1561
1562    if ($$inref =~ s,.*?<td[^>]*>\s*(.*?)\s*</td[^>]*>,,) {
1563       return $1;
1564    }
1565    return "";
1566 }
1567
1568 sub strip_tags {
1569    my($inref,$close,@tags) = @_;
1570
1571    foreach my $tag (@tags) {
1572       if ($close) {
1573          $$inref =~ s,</?$tag[^>]*>, ,g;
1574       } else {
1575          $$inref =~ s,<$tag[^>]*>, ,g;
1576       }
1577    }
1578 }
1579
1580 # Local Variables:
1581 # mode: cperl
1582 # indent-tabs-mode: nil
1583 # cperl-indent-level: 3
1584 # cperl-continued-statement-offset: 2
1585 # cperl-continued-brace-offset: 0
1586 # cperl-brace-offset: 0
1587 # cperl-brace-imaginary-offset: 0
1588 # cperl-label-offset: -2
1589 # End: