3 # The documentation is at the end.
4 # Time-stamp: "2004-05-07 15:31:25 ADT"
12 %Name2character_number
13 %Latin1Code_to_fallback
14 %Latin1Char_to_fallback
18 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
20 #==========================================================================
26 %Name2character_number
27 %Latin1Code_to_fallback
28 %Latin1Char_to_fallback
34 $FAR_CHAR = "?" unless defined $FAR_CHAR;
35 $FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER;
37 $NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII;
39 #--------------------------------------------------------------------------
42 return undef unless defined $in and length $in;
45 if($in =~ m/^(0[0-7]*)$/s ) {
47 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
49 } # else it's decimal, or named
52 # We're in bizarro world of not-ASCII!
53 # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR.
54 unless($in =~ m/^\d+$/s) {
55 # It's a named character reference. Get its numeric Unicode value.
56 $in = $Name2character{$in};
57 return undef unless defined $in; # (if there's no such name)
58 $in = ord $in; # (All ents must be one character long.)
59 # ...So $in holds the char's US-ASCII numeric value, which we'll
60 # now go get the local equivalent for.
63 # It's numeric, whether by origin or by mutation from a known name
64 return $Code2USASCII{$in} # so "65" => "A" everywhere
65 || $Latin1Code_to_fallback{$in} # Fallback.
66 || $FAR_CHAR; # Fall further back
70 if($in =~ m/^\d+$/s) {
71 if($] < 5.007 and $in > 255) { # can't be trusted with Unicode
77 return $Name2character{$in}; # returns undef if unknown
81 #--------------------------------------------------------------------------
84 return undef unless defined $in and length $in;
87 if($in =~ m/^(0[0-7]*)$/s ) {
89 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
91 } # else it's decimal, or named
93 if($in =~ m/^\d+$/s) {
96 return $Name2character_number{$in}; # returns undef if unknown
100 #--------------------------------------------------------------------------
102 %Name2character_number = (
114 'lchevron' => 171, # legacy for laquo
115 'rchevron' => 187, # legacy for raquo
117 # Remember, grave looks like \ (as in virtu\)
118 # acute looks like / (as in re/sume/)
119 # circumflex looks like ^ (as in papier ma^che/)
120 # umlaut/dieresis looks like " (as in nai"ve, Chloe")
122 # From the XHTML 1 .ent files:
375 # Fill out %Name2character...
377 %Name2character = ();
379 while( ($name, $number) = each %Name2character_number) {
380 if($] < 5.007 and $number > 255) {
381 $Name2character{$name} = $FAR_CHAR;
382 # substitute for Unicode characters, for perls
383 # that can't reliable handle them
385 $Name2character{$name} = chr $number;
389 # So they resolve 'right' even in EBCDIC-land
390 $Name2character{'lt' } = '<';
391 $Name2character{'gt' } = '>';
392 $Name2character{'quot'} = '"';
393 $Name2character{'amp' } = '&';
394 $Name2character{'apos'} = "'";
395 $Name2character{'sol' } = '/';
396 $Name2character{'verbar'} = '|';
399 #--------------------------------------------------------------------------
402 # mostly generated by
403 # perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)"
501 #--------------------------------------------------------------------------
503 %Latin1Code_to_fallback = ();
504 @Latin1Code_to_fallback{0xA0 .. 0xFF} = (
505 # Copied from Text/Unidecode/x00.pm:
507 ' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-},
508 'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?},
509 'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
510 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss',
511 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
512 'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',
517 # Now stuff %Latin1Char_to_fallback:
518 %Latin1Char_to_fallback = ();
520 while( ($k,$v) = each %Latin1Code_to_fallback) {
521 $Latin1Char_to_fallback{chr $k} = $v;
522 #print chr($k), ' => ', $v, "\n";
526 #--------------------------------------------------------------------------
532 Pod::Escapes -- for resolving Pod EE<lt>...E<gt> sequences
536 use Pod::Escapes qw(e2char);
537 ...la la la, parsing POD, la la la...
538 $text = e2char($e_node->label);
539 unless(defined $text) {
540 print "Unknown E sequence \"", $e_node->label, "\"!";
542 ...else print/interpolate $text...
546 This module provides things that are useful in decoding
547 Pod EE<lt>...E<gt> sequences. Presumably, it should be used
548 only by Pod parsers and/or formatters.
550 By default, Pod::Escapes exports none of its symbols. But
551 you can request any of them to be exported.
552 Either request them individually, as with
553 C<use Pod::Escapes qw(symbolname symbolname2...);>,
554 or you can do C<use Pod::Escapes qw(:ALL);> to get all
561 =item e2char($e_content)
563 Given a name or number that could appear in a
564 C<EE<lt>name_or_numE<gt>> sequence, this returns the string that
565 it stands for. For example, C<e2char('sol')>, C<e2char('47')>,
566 C<e2char('0x2F')>, and C<e2char('057')> all return "/",
567 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
568 and C<EE<lt>057E<gt>>, all mean "/". If
569 the name has no known value (as with a name of "qacute") or is
570 syntactally invalid (as with a name of "1/4"), this returns undef.
572 =item e2charnum($e_content)
574 Given a name or number that could appear in a
575 C<EE<lt>name_or_numE<gt>> sequence, this returns the number of
576 the Unicode character that this stands for. For example,
577 C<e2char('sol')>, C<e2char('47')>,
578 C<e2char('0x2F')>, and C<e2char('057')> all return 47,
579 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
580 and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47. If
581 the name has no known value (as with a name of "qacute") or is
582 syntactally invalid (as with a name of "1/4"), this returns undef.
584 =item $Name2character{I<name>}
586 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
587 to the string that each stands for. Note that this does not
588 include numerics (like "64" or "x981c"). Under old Perl versions
589 (before 5.7) you get a "?" in place of characters whose Unicode
592 =item $Name2character_number{I<name>}
594 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
595 to the Unicode value that each stands for. For example,
596 C<$Name2character_number{'eacute'}> is 201, and
597 C<$Name2character_number{'eacute'}> is 8364. You get the correct
598 Unicode value, regardless of the version of Perl you're using --
599 which differs from C<%Name2character>'s behavior under pre-5.7 Perls.
601 Note that this hash does not
602 include numerics (like "64" or "x981c").
604 =item $Latin1Code_to_fallback{I<integer>}
606 For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps
607 from the character code for a Latin-1 character (like 233 for
608 lowercase e-acute) to the US-ASCII character that best aproximates
609 it (like "e"). You may find this useful if you are rendering
610 POD in a format that you think deals well only with US-ASCII
613 =item $Latin1Char_to_fallback{I<character>}
615 Just as above, but maps from characters (like "\xE9",
616 lowercase e-acute) to characters (like "e").
618 =item $Code2USASCII{I<integer>}
620 This maps from US-ASCII codes (like 32) to the corresponding
621 character (like space, for 32). Only characters 32 to 126 are
622 defined. This is meant for use by C<e2char($x)> when it senses
623 that it's running on a non-ASCII platform (where chr(32) doesn't
624 get you a space -- but $Code2USASCII{32} will). It's
625 documented here just in case you might find it useful.
631 On Perl versions before 5.7, Unicode characters with a value
632 over 255 (like lambda or emdash) can't be conveyed. This
633 module does work under such early Perl versions, but in the
634 place of each such character, you get a "?". Latin-1
635 characters (characters 160-255) are unaffected.
637 Under EBCDIC platforms, C<e2char($n)> may not always be the
638 same as C<chr(e2charnum($n))>, and ditto for
639 C<$Name2character{$name}> and
640 C<chr($Name2character_number{$name})>.
646 L<perlpodspec|perlpodspec>
648 L<Text::Unidecode|Text::Unidecode>
650 =head1 COPYRIGHT AND DISCLAIMERS
652 Copyright (c) 2001-2004 Sean M. Burke. All rights reserved.
654 This library is free software; you can redistribute it and/or modify
655 it under the same terms as Perl itself.
657 This program is distributed in the hope that it will be useful, but
658 without any warranty; without even the implied warranty of
659 merchantability or fitness for a particular purpose.
661 Portions of the data tables in this module are derived from the
662 entity declarations in the W3C XHTML specification.
664 Currently (October 2001), that's these three:
666 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
667 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
668 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
672 Sean M. Burke C<sburke@cpan.org>
676 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
677 # What I used for reading the XHTML .ent files:
680 my(@norms, @good, @bad);
681 my $dir = 'c:/sgml/docbook/';
683 foreach my $file (qw(
688 open(IN, "<$dir$file") or die "can't read-open $dir$file: $!";
689 print "Reading $file...\n";
691 if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) {
692 my($name, $value) = ($1,$2);
693 next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt';
695 $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s;
696 print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s;
698 push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value;
699 push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value;
701 push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value;
704 print "# Skipping $_";
712 print "\n ( \$] .= 5.006001 ? (\n";
719 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~