lib/Pod/Escapes.pm

   1
   2 require 5;
   3 #                        The documentation is at the end.
   4 # Time-stamp: "2004-05-07 15:31:25 ADT"
   5 package Pod::Escapes;
   6 require Exporter;
   7 @ISA = ('Exporter');
   8 $VERSION = '1.04';
   9 @EXPORT_OK = qw(
  10   %Code2USASCII
  11   %Name2character
  12   %Name2character_number
  13   %Latin1Code_to_fallback
  14   %Latin1Char_to_fallback
  15   e2char
  16   e2charnum
  17 );
  18 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
  19
  20 #==========================================================================
  21
  22 use strict;
  23 use vars qw(
  24   %Code2USASCII
  25   %Name2character
  26   %Name2character_number
  27   %Latin1Code_to_fallback
  28   %Latin1Char_to_fallback
  29   $FAR_CHAR
  30   $FAR_CHAR_NUMBER
  31   $NOT_ASCII
  32 );
  33
  34 $FAR_CHAR = "?" unless defined $FAR_CHAR;
  35 $FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER;
  36
  37 $NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII;
  38
  39 #--------------------------------------------------------------------------
  40 sub e2char {
  41   my $in = $_[0];
  42   return undef unless defined $in and length $in;
  43
  44   # Convert to decimal:
  45   if($in =~ m/^(0[0-7]*)$/s ) {
  46     $in = oct $in;
  47   } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
  48     $in = hex $1;
  49   } # else it's decimal, or named
  50
  51   if($NOT_ASCII) {
  52     # We're in bizarro world of not-ASCII!
  53     # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR.
  54     unless($in =~ m/^\d+$/s) {
  55       # It's a named character reference.  Get its numeric Unicode value.
  56       $in = $Name2character{$in};
  57       return undef unless defined $in;  # (if there's no such name)
  58       $in = ord $in; # (All ents must be one character long.)
  59         # ...So $in holds the char's US-ASCII numeric value, which we'll
  60         #  now go get the local equivalent for.
  61     }
  62
  63     # It's numeric, whether by origin or by mutation from a known name
  64     return $Code2USASCII{$in} # so "65" => "A" everywhere
  65         || $Latin1Code_to_fallback{$in} # Fallback.
  66         || $FAR_CHAR; # Fall further back
  67   }
  68
  69   # Normal handling:
  70   if($in =~ m/^\d+$/s) {
  71     if($] < 5.007  and  $in > 255) { # can't be trusted with Unicode
  72       return $FAR_CHAR;
  73     } else {
  74       return chr($in);
  75     }
  76   } else {
  77     return $Name2character{$in}; # returns undef if unknown
  78   }
  79 }
  80
  81 #--------------------------------------------------------------------------
  82 sub e2charnum {
  83   my $in = $_[0];
  84   return undef unless defined $in and length $in;
  85
  86   # Convert to decimal:
  87   if($in =~ m/^(0[0-7]*)$/s ) {
  88     $in = oct $in;
  89   } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
  90     $in = hex $1;
  91   } # else it's decimal, or named
  92
  93   if($in =~ m/^\d+$/s) {
  94     return 0 + $in;
  95   } else {
  96     return $Name2character_number{$in}; # returns undef if unknown
  97   }
  98 }
  99
 100 #--------------------------------------------------------------------------
 101
 102 %Name2character_number = (
 103  # General XML/XHTML:
 104  'lt'   => 60,
 105  'gt'   => 62,
 106  'quot' => 34,
 107  'amp'  => 38,
 108  'apos' => 39,
 109
 110  # POD-specific:
 111  'sol'    => 47,
 112  'verbar' => 124,
 113
 114  'lchevron' => 171, # legacy for laquo
 115  'rchevron' => 187, # legacy for raquo
 116
 117  # Remember, grave looks like \ (as in virtu\)
 118  #           acute looks like / (as in re/sume/)
 119  #           circumflex looks like ^ (as in papier ma^che/)
 120  #           umlaut/dieresis looks like " (as in nai"ve, Chloe")
 121
 122  # From the XHTML 1 .ent files:
 123  'nbsp'     , 160,
 124  'iexcl'    , 161,
 125  'cent'     , 162,
 126  'pound'    , 163,
 127  'curren'   , 164,
 128  'yen'      , 165,
 129  'brvbar'   , 166,
 130  'sect'     , 167,
 131  'uml'      , 168,
 132  'copy'     , 169,
 133  'ordf'     , 170,
 134  'laquo'    , 171,
 135  'not'      , 172,
 136  'shy'      , 173,
 137  'reg'      , 174,
 138  'macr'     , 175,
 139  'deg'      , 176,
 140  'plusmn'   , 177,
 141  'sup2'     , 178,
 142  'sup3'     , 179,
 143  'acute'    , 180,
 144  'micro'    , 181,
 145  'para'     , 182,
 146  'middot'   , 183,
 147  'cedil'    , 184,
 148  'sup1'     , 185,
 149  'ordm'     , 186,
 150  'raquo'    , 187,
 151  'frac14'   , 188,
 152  'frac12'   , 189,
 153  'frac34'   , 190,
 154  'iquest'   , 191,
 155  'Agrave'   , 192,
 156  'Aacute'   , 193,
 157  'Acirc'    , 194,
 158  'Atilde'   , 195,
 159  'Auml'     , 196,
 160  'Aring'    , 197,
 161  'AElig'    , 198,
 162  'Ccedil'   , 199,
 163  'Egrave'   , 200,
 164  'Eacute'   , 201,
 165  'Ecirc'    , 202,
 166  'Euml'     , 203,
 167  'Igrave'   , 204,
 168  'Iacute'   , 205,
 169  'Icirc'    , 206,
 170  'Iuml'     , 207,
 171  'ETH'      , 208,
 172  'Ntilde'   , 209,
 173  'Ograve'   , 210,
 174  'Oacute'   , 211,
 175  'Ocirc'    , 212,
 176  'Otilde'   , 213,
 177  'Ouml'     , 214,
 178  'times'    , 215,
 179  'Oslash'   , 216,
 180  'Ugrave'   , 217,
 181  'Uacute'   , 218,
 182  'Ucirc'    , 219,
 183  'Uuml'     , 220,
 184  'Yacute'   , 221,
 185  'THORN'    , 222,
 186  'szlig'    , 223,
 187  'agrave'   , 224,
 188  'aacute'   , 225,
 189  'acirc'    , 226,
 190  'atilde'   , 227,
 191  'auml'     , 228,
 192  'aring'    , 229,
 193  'aelig'    , 230,
 194  'ccedil'   , 231,
 195  'egrave'   , 232,
 196  'eacute'   , 233,
 197  'ecirc'    , 234,
 198  'euml'     , 235,
 199  'igrave'   , 236,
 200  'iacute'   , 237,
 201  'icirc'    , 238,
 202  'iuml'     , 239,
 203  'eth'      , 240,
 204  'ntilde'   , 241,
 205  'ograve'   , 242,
 206  'oacute'   , 243,
 207  'ocirc'    , 244,
 208  'otilde'   , 245,
 209  'ouml'     , 246,
 210  'divide'   , 247,
 211  'oslash'   , 248,
 212  'ugrave'   , 249,
 213  'uacute'   , 250,
 214  'ucirc'    , 251,
 215  'uuml'     , 252,
 216  'yacute'   , 253,
 217  'thorn'    , 254,
 218  'yuml'     , 255,
 219
 220  'fnof'     , 402,
 221  'Alpha'    , 913,
 222  'Beta'     , 914,
 223  'Gamma'    , 915,
 224  'Delta'    , 916,
 225  'Epsilon'  , 917,
 226  'Zeta'     , 918,
 227  'Eta'      , 919,
 228  'Theta'    , 920,
 229  'Iota'     , 921,
 230  'Kappa'    , 922,
 231  'Lambda'   , 923,
 232  'Mu'       , 924,
 233  'Nu'       , 925,
 234  'Xi'       , 926,
 235  'Omicron'  , 927,
 236  'Pi'       , 928,
 237  'Rho'      , 929,
 238  'Sigma'    , 931,
 239  'Tau'      , 932,
 240  'Upsilon'  , 933,
 241  'Phi'      , 934,
 242  'Chi'      , 935,
 243  'Psi'      , 936,
 244  'Omega'    , 937,
 245  'alpha'    , 945,
 246  'beta'     , 946,
 247  'gamma'    , 947,
 248  'delta'    , 948,
 249  'epsilon'  , 949,
 250  'zeta'     , 950,
 251  'eta'      , 951,
 252  'theta'    , 952,
 253  'iota'     , 953,
 254  'kappa'    , 954,
 255  'lambda'   , 955,
 256  'mu'       , 956,
 257  'nu'       , 957,
 258  'xi'       , 958,
 259  'omicron'  , 959,
 260  'pi'       , 960,
 261  'rho'      , 961,
 262  'sigmaf'   , 962,
 263  'sigma'    , 963,
 264  'tau'      , 964,
 265  'upsilon'  , 965,
 266  'phi'      , 966,
 267  'chi'      , 967,
 268  'psi'      , 968,
 269  'omega'    , 969,
 270  'thetasym' , 977,
 271  'upsih'    , 978,
 272  'piv'      , 982,
 273  'bull'     , 8226,
 274  'hellip'   , 8230,
 275  'prime'    , 8242,
 276  'Prime'    , 8243,
 277  'oline'    , 8254,
 278  'frasl'    , 8260,
 279  'weierp'   , 8472,
 280  'image'    , 8465,
 281  'real'     , 8476,
 282  'trade'    , 8482,
 283  'alefsym'  , 8501,
 284  'larr'     , 8592,
 285  'uarr'     , 8593,
 286  'rarr'     , 8594,
 287  'darr'     , 8595,
 288  'harr'     , 8596,
 289  'crarr'    , 8629,
 290  'lArr'     , 8656,
 291  'uArr'     , 8657,
 292  'rArr'     , 8658,
 293  'dArr'     , 8659,
 294  'hArr'     , 8660,
 295  'forall'   , 8704,
 296  'part'     , 8706,
 297  'exist'    , 8707,
 298  'empty'    , 8709,
 299  'nabla'    , 8711,
 300  'isin'     , 8712,
 301  'notin'    , 8713,
 302  'ni'       , 8715,
 303  'prod'     , 8719,
 304  'sum'      , 8721,
 305  'minus'    , 8722,
 306  'lowast'   , 8727,
 307  'radic'    , 8730,
 308  'prop'     , 8733,
 309  'infin'    , 8734,
 310  'ang'      , 8736,
 311  'and'      , 8743,
 312  'or'       , 8744,
 313  'cap'      , 8745,
 314  'cup'      , 8746,
 315  'int'      , 8747,
 316  'there4'   , 8756,
 317  'sim'      , 8764,
 318  'cong'     , 8773,
 319  'asymp'    , 8776,
 320  'ne'       , 8800,
 321  'equiv'    , 8801,
 322  'le'       , 8804,
 323  'ge'       , 8805,
 324  'sub'      , 8834,
 325  'sup'      , 8835,
 326  'nsub'     , 8836,
 327  'sube'     , 8838,
 328  'supe'     , 8839,
 329  'oplus'    , 8853,
 330  'otimes'   , 8855,
 331  'perp'     , 8869,
 332  'sdot'     , 8901,
 333  'lceil'    , 8968,
 334  'rceil'    , 8969,
 335  'lfloor'   , 8970,
 336  'rfloor'   , 8971,
 337  'lang'     , 9001,
 338  'rang'     , 9002,
 339  'loz'      , 9674,
 340  'spades'   , 9824,
 341  'clubs'    , 9827,
 342  'hearts'   , 9829,
 343  'diams'    , 9830,
 344  'OElig'    , 338,
 345  'oelig'    , 339,
 346  'Scaron'   , 352,
 347  'scaron'   , 353,
 348  'Yuml'     , 376,
 349  'circ'     , 710,
 350  'tilde'    , 732,
 351  'ensp'     , 8194,
 352  'emsp'     , 8195,
 353  'thinsp'   , 8201,
 354  'zwnj'     , 8204,
 355  'zwj'      , 8205,
 356  'lrm'      , 8206,
 357  'rlm'      , 8207,
 358  'ndash'    , 8211,
 359  'mdash'    , 8212,
 360  'lsquo'    , 8216,
 361  'rsquo'    , 8217,
 362  'sbquo'    , 8218,
 363  'ldquo'    , 8220,
 364  'rdquo'    , 8221,
 365  'bdquo'    , 8222,
 366  'dagger'   , 8224,
 367  'Dagger'   , 8225,
 368  'permil'   , 8240,
 369  'lsaquo'   , 8249,
 370  'rsaquo'   , 8250,
 371  'euro'     , 8364,
 372 );
 373
 374
 375 # Fill out %Name2character...
 376 {
 377   %Name2character = ();
 378   my($name, $number);
 379   while( ($name, $number) = each %Name2character_number) {
 380     if($] < 5.007  and  $number > 255) {
 381       $Name2character{$name} = $FAR_CHAR;
 382       # substitute for Unicode characters, for perls
 383       #  that can't reliable handle them
 384     } else {
 385       $Name2character{$name} = chr $number;
 386       # normal case
 387     }
 388   }
 389   # So they resolve 'right' even in EBCDIC-land
 390   $Name2character{'lt'  }   = '<';
 391   $Name2character{'gt'  }   = '>';
 392   $Name2character{'quot'}   = '"';
 393   $Name2character{'amp' }   = '&';
 394   $Name2character{'apos'}   = "'";
 395   $Name2character{'sol' }   = '/';
 396   $Name2character{'verbar'} = '|';
 397 }
 398
 399 #--------------------------------------------------------------------------
 400
 401 %Code2USASCII = (
 402 # mostly generated by
 403 #  perl -e "printf qq{  \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)"
 404    32, ' ',
 405    33, '!',
 406    34, '"',
 407    35, '#',
 408    36, '$',
 409    37, '%',
 410    38, '&',
 411    39, "'", #!
 412    40, '(',
 413    41, ')',
 414    42, '*',
 415    43, '+',
 416    44, ',',
 417    45, '-',
 418    46, '.',
 419    47, '/',
 420    48, '0',
 421    49, '1',
 422    50, '2',
 423    51, '3',
 424    52, '4',
 425    53, '5',
 426    54, '6',
 427    55, '7',
 428    56, '8',
 429    57, '9',
 430    58, ':',
 431    59, ';',
 432    60, '<',
 433    61, '=',
 434    62, '>',
 435    63, '?',
 436    64, '@',
 437    65, 'A',
 438    66, 'B',
 439    67, 'C',
 440    68, 'D',
 441    69, 'E',
 442    70, 'F',
 443    71, 'G',
 444    72, 'H',
 445    73, 'I',
 446    74, 'J',
 447    75, 'K',
 448    76, 'L',
 449    77, 'M',
 450    78, 'N',
 451    79, 'O',
 452    80, 'P',
 453    81, 'Q',
 454    82, 'R',
 455    83, 'S',
 456    84, 'T',
 457    85, 'U',
 458    86, 'V',
 459    87, 'W',
 460    88, 'X',
 461    89, 'Y',
 462    90, 'Z',
 463    91, '[',
 464    92, "\\", #!
 465    93, ']',
 466    94, '^',
 467    95, '_',
 468    96, '`',
 469    97, 'a',
 470    98, 'b',
 471    99, 'c',
 472   100, 'd',
 473   101, 'e',
 474   102, 'f',
 475   103, 'g',
 476   104, 'h',
 477   105, 'i',
 478   106, 'j',
 479   107, 'k',
 480   108, 'l',
 481   109, 'm',
 482   110, 'n',
 483   111, 'o',
 484   112, 'p',
 485   113, 'q',
 486   114, 'r',
 487   115, 's',
 488   116, 't',
 489   117, 'u',
 490   118, 'v',
 491   119, 'w',
 492   120, 'x',
 493   121, 'y',
 494   122, 'z',
 495   123, '{',
 496   124, '|',
 497   125, '}',
 498   126, '~',
 499 );
 500
 501 #--------------------------------------------------------------------------
 502
 503 %Latin1Code_to_fallback = ();
 504 @Latin1Code_to_fallback{0xA0 .. 0xFF} = (
 505 # Copied from Text/Unidecode/x00.pm:
 506
 507 ' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-},
 508 'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?},
 509 'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
 510 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss',
 511 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
 512 'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',
 513
 514 );
 515
 516 {
 517   # Now stuff %Latin1Char_to_fallback:
 518   %Latin1Char_to_fallback = ();
 519   my($k,$v);
 520   while( ($k,$v) = each %Latin1Code_to_fallback) {
 521     $Latin1Char_to_fallback{chr $k} = $v;
 522     #print chr($k), ' => ', $v, "\n";
 523   }
 524 }
 525
 526 #--------------------------------------------------------------------------
 527 1;
 528 __END__
 529
 530 =head1 NAME
 531
 532 Pod::Escapes -- for resolving Pod EE<lt>...E<gt> sequences
 533
 534 =head1 SYNOPSIS
 535
 536   use Pod::Escapes qw(e2char);
 537   ...la la la, parsing POD, la la la...
 538   $text = e2char($e_node->label);
 539   unless(defined $text) {
 540     print "Unknown E sequence \"", $e_node->label, "\"!";
 541   }
 542   ...else print/interpolate $text...
 543
 544 =head1 DESCRIPTION
 545
 546 This module provides things that are useful in decoding
 547 Pod EE<lt>...E<gt> sequences.  Presumably, it should be used
 548 only by Pod parsers and/or formatters.
 549
 550 By default, Pod::Escapes exports none of its symbols.  But
 551 you can request any of them to be exported.
 552 Either request them individually, as with
 553 C<use Pod::Escapes qw(symbolname symbolname2...);>,
 554 or you can do C<use Pod::Escapes qw(:ALL);> to get all
 555 exportable symbols.
 556
 557 =head1 GOODIES
 558
 559 =over
 560
 561 =item e2char($e_content)
 562
 563 Given a name or number that could appear in a
 564 C<EE<lt>name_or_numE<gt>> sequence, this returns the string that
 565 it stands for.  For example, C<e2char('sol')>, C<e2char('47')>,
 566 C<e2char('0x2F')>, and C<e2char('057')> all return "/",
 567 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
 568 and C<EE<lt>057E<gt>>, all mean "/".  If
 569 the name has no known value (as with a name of "qacute") or is
 570 syntactally invalid (as with a name of "1/4"), this returns undef.
 571
 572 =item e2charnum($e_content)
 573
 574 Given a name or number that could appear in a
 575 C<EE<lt>name_or_numE<gt>> sequence, this returns the number of
 576 the Unicode character that this stands for.  For example,
 577 C<e2char('sol')>, C<e2char('47')>,
 578 C<e2char('0x2F')>, and C<e2char('057')> all return 47,
 579 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
 580 and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47.  If
 581 the name has no known value (as with a name of "qacute") or is
 582 syntactally invalid (as with a name of "1/4"), this returns undef.
 583
 584 =item $Name2character{I<name>}
 585
 586 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
 587 to the string that each stands for.  Note that this does not
 588 include numerics (like "64" or "x981c").  Under old Perl versions
 589 (before 5.7) you get a "?" in place of characters whose Unicode
 590 value is over 255.
 591
 592 =item $Name2character_number{I<name>}
 593
 594 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
 595 to the Unicode value that each stands for.  For example,
 596 C<$Name2character_number{'eacute'}> is 201, and
 597 C<$Name2character_number{'eacute'}> is 8364.  You get the correct
 598 Unicode value, regardless of the version of Perl you're using --
 599 which differs from C<%Name2character>'s behavior under pre-5.7 Perls.
 600
 601 Note that this hash does not
 602 include numerics (like "64" or "x981c").
 603
 604 =item $Latin1Code_to_fallback{I<integer>}
 605
 606 For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps
 607 from the character code for a Latin-1 character (like 233 for
 608 lowercase e-acute) to the US-ASCII character that best aproximates
 609 it (like "e").  You may find this useful if you are rendering
 610 POD in a format that you think deals well only with US-ASCII
 611 characters.
 612
 613 =item $Latin1Char_to_fallback{I<character>}
 614
 615 Just as above, but maps from characters (like "\xE9",
 616 lowercase e-acute) to characters (like "e").
 617
 618 =item $Code2USASCII{I<integer>}
 619
 620 This maps from US-ASCII codes (like 32) to the corresponding
 621 character (like space, for 32).  Only characters 32 to 126 are
 622 defined.  This is meant for use by C<e2char($x)> when it senses
 623 that it's running on a non-ASCII platform (where chr(32) doesn't
 624 get you a space -- but $Code2USASCII{32} will).  It's
 625 documented here just in case you might find it useful.
 626
 627 =back
 628
 629 =head1 CAVEATS
 630
 631 On Perl versions before 5.7, Unicode characters with a value
 632 over 255 (like lambda or emdash) can't be conveyed.  This
 633 module does work under such early Perl versions, but in the
 634 place of each such character, you get a "?".  Latin-1
 635 characters (characters 160-255) are unaffected.
 636
 637 Under EBCDIC platforms, C<e2char($n)> may not always be the
 638 same as C<chr(e2charnum($n))>, and ditto for
 639 C<$Name2character{$name}> and
 640 C<chr($Name2character_number{$name})>.
 641
 642 =head1 SEE ALSO
 643
 644 L<perlpod|perlpod>
 645
 646 L<perlpodspec|perlpodspec>
 647
 648 L<Text::Unidecode|Text::Unidecode>
 649
 650 =head1 COPYRIGHT AND DISCLAIMERS
 651
 652 Copyright (c) 2001-2004 Sean M. Burke.  All rights reserved.
 653
 654 This library is free software; you can redistribute it and/or modify
 655 it under the same terms as Perl itself.
 656
 657 This program is distributed in the hope that it will be useful, but
 658 without any warranty; without even the implied warranty of
 659 merchantability or fitness for a particular purpose.
 660
 661 Portions of the data tables in this module are derived from the
 662 entity declarations in the W3C XHTML specification.
 663
 664 Currently (October 2001), that's these three:
 665
 666  http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
 667  http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
 668  http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
 669
 670 =head1 AUTHOR
 671
 672 Sean M. Burke C<sburke@cpan.org>
 673
 674 =cut
 675
 676 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 677 # What I used for reading the XHTML .ent files:
 678
 679 use strict;
 680 my(@norms, @good, @bad);
 681 my $dir = 'c:/sgml/docbook/';
 682 my %escapes;
 683 foreach my $file (qw(
 684   xhtml-symbol.ent
 685   xhtml-lat1.ent
 686   xhtml-special.ent
 687 )) {
 688   open(IN, "<$dir$file") or die "can't read-open $dir$file: $!";
 689   print "Reading $file...\n";
 690   while(<IN>) {
 691     if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) {
 692       my($name, $value) = ($1,$2);
 693       next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt';
 694
 695       $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s;
 696       print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s;
 697       if($value > 255) {
 698         push @good , sprintf "   %-10s , chr(%s),\n", "'$name'", $value;
 699         push @bad  , sprintf "   %-10s , \$bad,\n", "'$name'", $value;
 700       } else {
 701         push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value;
 702       }
 703     } elsif(m/<!ENT/) {
 704       print "# Skipping $_";
 705     }
 706
 707   }
 708   close(IN);
 709 }
 710
 711 print @norms;
 712 print "\n ( \$] .= 5.006001 ? (\n";
 713 print @good;
 714 print " ) : (\n";
 715 print @bad;
 716 print " )\n);\n";
 717
 718 __END__
 719 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 720
 721