local-lib5/lib/perl5/i486-linux-gnu-thread-multi/HTML/Entities.pm

   1 package HTML::Entities;
   2
   3 =head1 NAME
   4
   5 HTML::Entities - Encode or decode strings with HTML entities
   6
   7 =head1 SYNOPSIS
   8
   9  use HTML::Entities;
  10
  11  $a = "V&aring;re norske tegn b&oslash;r &#230res";
  12  decode_entities($a);
  13  encode_entities($a, "\200-\377");
  14
  15 For example, this:
  16
  17  $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
  18  print encode_entities($input), "\n"
  19
  20 Prints this out:
  21
  22  vis-&agrave;-vis Beyonc&eacute;'s na&iuml;ve
  23  papier-m&acirc;ch&eacute; r&eacute;sum&eacute;
  24
  25 =head1 DESCRIPTION
  26
  27 This module deals with encoding and decoding of strings with HTML
  28 character entities.  The module provides the following functions:
  29
  30 =over 4
  31
  32 =item decode_entities( $string, ... )
  33
  34 This routine replaces HTML entities found in the $string with the
  35 corresponding Unicode character.  Under perl 5.6 and earlier only
  36 characters in the Latin-1 range are replaced. Unrecognized
  37 entities are left alone.
  38
  39 If multiple strings are provided as argument they are each decoded
  40 separately and the same number of strings are returned.
  41
  42 If called in void context the arguments are decoded in-place.
  43
  44 This routine is exported by default.
  45
  46 =item _decode_entities( $string, \%entity2char )
  47
  48 =item _decode_entities( $string, \%entity2char, $expand_prefix )
  49
  50 This will in-place replace HTML entities in $string.  The %entity2char
  51 hash must be provided.  Named entities not found in the %entity2char
  52 hash are left alone.  Numeric entities are expanded unless their value
  53 overflow.
  54
  55 The keys in %entity2char are the entity names to be expanded and their
  56 values are what they should expand into.  The values do not have to be
  57 single character strings.  If a key has ";" as suffix,
  58 then occurrences in $string are only expanded if properly terminated
  59 with ";".  Entities without ";" will be expanded regardless of how
  60 they are terminated for compatibility with how common browsers treat
  61 entities in the Latin-1 range.
  62
  63 If $expand_prefix is TRUE then entities without trailing ";" in
  64 %entity2char will even be expanded as a prefix of a longer
  65 unrecognized name.  The longest matching name in %entity2char will be
  66 used. This is mainly present for compatibility with an MSIE
  67 misfeature.
  68
  69    $string = "foo&nbspbar";
  70    _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
  71    print $string;  # will print "foo bar"
  72
  73 This routine is exported by default.
  74
  75 =item encode_entities( $string )
  76
  77 =item encode_entities( $string, $unsafe_chars )
  78
  79 This routine replaces unsafe characters in $string with their entity
  80 representation. A second argument can be given to specify which characters to
  81 consider unsafe.  The unsafe characters is specified using the regular
  82 expression character class syntax (what you find within brackets in regular
  83 expressions).
  84
  85 The default set of characters to encode are control chars, high-bit chars, and
  86 the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters.  But this,
  87 for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
  88 >> characters:
  89
  90   $encoded = encode_entities($input, '<>&"');
  91
  92 and this would only encode non-plain ascii:
  93
  94   $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
  95
  96 This routine is exported by default.
  97
  98 =item encode_entities_numeric( $string )
  99
 100 =item encode_entities_numeric( $string, $unsafe_chars )
 101
 102 This routine works just like encode_entities, except that the replacement
 103 entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>.  For
 104 example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
 105 C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
 106
 107 This routine is I<not> exported by default.  But you can always
 108 export it with C<use HTML::Entities qw(encode_entities_numeric);>
 109 or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
 110
 111 =back
 112
 113 All these routines modify the string passed as the first argument, if
 114 called in a void context.  In scalar and array contexts, the encoded or
 115 decoded string is returned (without changing the input string).
 116
 117 If you prefer not to import these routines into your namespace, you can
 118 call them as:
 119
 120   use HTML::Entities ();
 121   $decoded = HTML::Entities::decode($a);
 122   $encoded = HTML::Entities::encode($a);
 123   $encoded = HTML::Entities::encode_numeric($a);
 124
 125 The module can also export the %char2entity and the %entity2char
 126 hashes, which contain the mapping from all characters to the
 127 corresponding entities (and vice versa, respectively).
 128
 129 =head1 COPYRIGHT
 130
 131 Copyright 1995-2006 Gisle Aas. All rights reserved.
 132
 133 This library is free software; you can redistribute it and/or
 134 modify it under the same terms as Perl itself.
 135
 136 =cut
 137
 138 use strict;
 139 use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
 140 use vars qw(%entity2char %char2entity);
 141
 142 require 5.004;
 143 require Exporter;
 144 @ISA = qw(Exporter);
 145
 146 @EXPORT = qw(encode_entities decode_entities _decode_entities);
 147 @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
 148
 149 $VERSION = "3.64";
 150 sub Version { $VERSION; }
 151
 152 require HTML::Parser;  # for fast XS implemented decode_entities
 153
 154
 155 %entity2char = (
 156  # Some normal chars that have special meaning in SGML context
 157  amp    => '&',  # ampersand
 158 'gt'    => '>',  # greater than
 159 'lt'    => '<',  # less than
 160  quot   => '"',  # double quote
 161  apos   => "'",  # single quote
 162
 163  # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
 164  AElig  => chr(198),  # capital AE diphthong (ligature)
 165  Aacute => chr(193),  # capital A, acute accent
 166  Acirc  => chr(194),  # capital A, circumflex accent
 167  Agrave => chr(192),  # capital A, grave accent
 168  Aring  => chr(197),  # capital A, ring
 169  Atilde => chr(195),  # capital A, tilde
 170  Auml   => chr(196),  # capital A, dieresis or umlaut mark
 171  Ccedil => chr(199),  # capital C, cedilla
 172  ETH    => chr(208),  # capital Eth, Icelandic
 173  Eacute => chr(201),  # capital E, acute accent
 174  Ecirc  => chr(202),  # capital E, circumflex accent
 175  Egrave => chr(200),  # capital E, grave accent
 176  Euml   => chr(203),  # capital E, dieresis or umlaut mark
 177  Iacute => chr(205),  # capital I, acute accent
 178  Icirc  => chr(206),  # capital I, circumflex accent
 179  Igrave => chr(204),  # capital I, grave accent
 180  Iuml   => chr(207),  # capital I, dieresis or umlaut mark
 181  Ntilde => chr(209),  # capital N, tilde
 182  Oacute => chr(211),  # capital O, acute accent
 183  Ocirc  => chr(212),  # capital O, circumflex accent
 184  Ograve => chr(210),  # capital O, grave accent
 185  Oslash => chr(216),  # capital O, slash
 186  Otilde => chr(213),  # capital O, tilde
 187  Ouml   => chr(214),  # capital O, dieresis or umlaut mark
 188  THORN  => chr(222),  # capital THORN, Icelandic
 189  Uacute => chr(218),  # capital U, acute accent
 190  Ucirc  => chr(219),  # capital U, circumflex accent
 191  Ugrave => chr(217),  # capital U, grave accent
 192  Uuml   => chr(220),  # capital U, dieresis or umlaut mark
 193  Yacute => chr(221),  # capital Y, acute accent
 194  aacute => chr(225),  # small a, acute accent
 195  acirc  => chr(226),  # small a, circumflex accent
 196  aelig  => chr(230),  # small ae diphthong (ligature)
 197  agrave => chr(224),  # small a, grave accent
 198  aring  => chr(229),  # small a, ring
 199  atilde => chr(227),  # small a, tilde
 200  auml   => chr(228),  # small a, dieresis or umlaut mark
 201  ccedil => chr(231),  # small c, cedilla
 202  eacute => chr(233),  # small e, acute accent
 203  ecirc  => chr(234),  # small e, circumflex accent
 204  egrave => chr(232),  # small e, grave accent
 205  eth    => chr(240),  # small eth, Icelandic
 206  euml   => chr(235),  # small e, dieresis or umlaut mark
 207  iacute => chr(237),  # small i, acute accent
 208  icirc  => chr(238),  # small i, circumflex accent
 209  igrave => chr(236),  # small i, grave accent
 210  iuml   => chr(239),  # small i, dieresis or umlaut mark
 211  ntilde => chr(241),  # small n, tilde
 212  oacute => chr(243),  # small o, acute accent
 213  ocirc  => chr(244),  # small o, circumflex accent
 214  ograve => chr(242),  # small o, grave accent
 215  oslash => chr(248),  # small o, slash
 216  otilde => chr(245),  # small o, tilde
 217  ouml   => chr(246),  # small o, dieresis or umlaut mark
 218  szlig  => chr(223),  # small sharp s, German (sz ligature)
 219  thorn  => chr(254),  # small thorn, Icelandic
 220  uacute => chr(250),  # small u, acute accent
 221  ucirc  => chr(251),  # small u, circumflex accent
 222  ugrave => chr(249),  # small u, grave accent
 223  uuml   => chr(252),  # small u, dieresis or umlaut mark
 224  yacute => chr(253),  # small y, acute accent
 225  yuml   => chr(255),  # small y, dieresis or umlaut mark
 226
 227  # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
 228  copy   => chr(169),  # copyright sign
 229  reg    => chr(174),  # registered sign
 230  nbsp   => chr(160),  # non breaking space
 231
 232  # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
 233  iexcl  => chr(161),
 234  cent   => chr(162),
 235  pound  => chr(163),
 236  curren => chr(164),
 237  yen    => chr(165),
 238  brvbar => chr(166),
 239  sect   => chr(167),
 240  uml    => chr(168),
 241  ordf   => chr(170),
 242  laquo  => chr(171),
 243 'not'   => chr(172),    # not is a keyword in perl
 244  shy    => chr(173),
 245  macr   => chr(175),
 246  deg    => chr(176),
 247  plusmn => chr(177),
 248  sup1   => chr(185),
 249  sup2   => chr(178),
 250  sup3   => chr(179),
 251  acute  => chr(180),
 252  micro  => chr(181),
 253  para   => chr(182),
 254  middot => chr(183),
 255  cedil  => chr(184),
 256  ordm   => chr(186),
 257  raquo  => chr(187),
 258  frac14 => chr(188),
 259  frac12 => chr(189),
 260  frac34 => chr(190),
 261  iquest => chr(191),
 262 'times' => chr(215),    # times is a keyword in perl
 263  divide => chr(247),
 264
 265  ( $] > 5.007 ? (
 266   'OElig;'    => chr(338),
 267   'oelig;'    => chr(339),
 268   'Scaron;'   => chr(352),
 269   'scaron;'   => chr(353),
 270   'Yuml;'     => chr(376),
 271   'fnof;'     => chr(402),
 272   'circ;'     => chr(710),
 273   'tilde;'    => chr(732),
 274   'Alpha;'    => chr(913),
 275   'Beta;'     => chr(914),
 276   'Gamma;'    => chr(915),
 277   'Delta;'    => chr(916),
 278   'Epsilon;'  => chr(917),
 279   'Zeta;'     => chr(918),
 280   'Eta;'      => chr(919),
 281   'Theta;'    => chr(920),
 282   'Iota;'     => chr(921),
 283   'Kappa;'    => chr(922),
 284   'Lambda;'   => chr(923),
 285   'Mu;'       => chr(924),
 286   'Nu;'       => chr(925),
 287   'Xi;'       => chr(926),
 288   'Omicron;'  => chr(927),
 289   'Pi;'       => chr(928),
 290   'Rho;'      => chr(929),
 291   'Sigma;'    => chr(931),
 292   'Tau;'      => chr(932),
 293   'Upsilon;'  => chr(933),
 294   'Phi;'      => chr(934),
 295   'Chi;'      => chr(935),
 296   'Psi;'      => chr(936),
 297   'Omega;'    => chr(937),
 298   'alpha;'    => chr(945),
 299   'beta;'     => chr(946),
 300   'gamma;'    => chr(947),
 301   'delta;'    => chr(948),
 302   'epsilon;'  => chr(949),
 303   'zeta;'     => chr(950),
 304   'eta;'      => chr(951),
 305   'theta;'    => chr(952),
 306   'iota;'     => chr(953),
 307   'kappa;'    => chr(954),
 308   'lambda;'   => chr(955),
 309   'mu;'       => chr(956),
 310   'nu;'       => chr(957),
 311   'xi;'       => chr(958),
 312   'omicron;'  => chr(959),
 313   'pi;'       => chr(960),
 314   'rho;'      => chr(961),
 315   'sigmaf;'   => chr(962),
 316   'sigma;'    => chr(963),
 317   'tau;'      => chr(964),
 318   'upsilon;'  => chr(965),
 319   'phi;'      => chr(966),
 320   'chi;'      => chr(967),
 321   'psi;'      => chr(968),
 322   'omega;'    => chr(969),
 323   'thetasym;' => chr(977),
 324   'upsih;'    => chr(978),
 325   'piv;'      => chr(982),
 326   'ensp;'     => chr(8194),
 327   'emsp;'     => chr(8195),
 328   'thinsp;'   => chr(8201),
 329   'zwnj;'     => chr(8204),
 330   'zwj;'      => chr(8205),
 331   'lrm;'      => chr(8206),
 332   'rlm;'      => chr(8207),
 333   'ndash;'    => chr(8211),
 334   'mdash;'    => chr(8212),
 335   'lsquo;'    => chr(8216),
 336   'rsquo;'    => chr(8217),
 337   'sbquo;'    => chr(8218),
 338   'ldquo;'    => chr(8220),
 339   'rdquo;'    => chr(8221),
 340   'bdquo;'    => chr(8222),
 341   'dagger;'   => chr(8224),
 342   'Dagger;'   => chr(8225),
 343   'bull;'     => chr(8226),
 344   'hellip;'   => chr(8230),
 345   'permil;'   => chr(8240),
 346   'prime;'    => chr(8242),
 347   'Prime;'    => chr(8243),
 348   'lsaquo;'   => chr(8249),
 349   'rsaquo;'   => chr(8250),
 350   'oline;'    => chr(8254),
 351   'frasl;'    => chr(8260),
 352   'euro;'     => chr(8364),
 353   'image;'    => chr(8465),
 354   'weierp;'   => chr(8472),
 355   'real;'     => chr(8476),
 356   'trade;'    => chr(8482),
 357   'alefsym;'  => chr(8501),
 358   'larr;'     => chr(8592),
 359   'uarr;'     => chr(8593),
 360   'rarr;'     => chr(8594),
 361   'darr;'     => chr(8595),
 362   'harr;'     => chr(8596),
 363   'crarr;'    => chr(8629),
 364   'lArr;'     => chr(8656),
 365   'uArr;'     => chr(8657),
 366   'rArr;'     => chr(8658),
 367   'dArr;'     => chr(8659),
 368   'hArr;'     => chr(8660),
 369   'forall;'   => chr(8704),
 370   'part;'     => chr(8706),
 371   'exist;'    => chr(8707),
 372   'empty;'    => chr(8709),
 373   'nabla;'    => chr(8711),
 374   'isin;'     => chr(8712),
 375   'notin;'    => chr(8713),
 376   'ni;'       => chr(8715),
 377   'prod;'     => chr(8719),
 378   'sum;'      => chr(8721),
 379   'minus;'    => chr(8722),
 380   'lowast;'   => chr(8727),
 381   'radic;'    => chr(8730),
 382   'prop;'     => chr(8733),
 383   'infin;'    => chr(8734),
 384   'ang;'      => chr(8736),
 385   'and;'      => chr(8743),
 386   'or;'       => chr(8744),
 387   'cap;'      => chr(8745),
 388   'cup;'      => chr(8746),
 389   'int;'      => chr(8747),
 390   'there4;'   => chr(8756),
 391   'sim;'      => chr(8764),
 392   'cong;'     => chr(8773),
 393   'asymp;'    => chr(8776),
 394   'ne;'       => chr(8800),
 395   'equiv;'    => chr(8801),
 396   'le;'       => chr(8804),
 397   'ge;'       => chr(8805),
 398   'sub;'      => chr(8834),
 399   'sup;'      => chr(8835),
 400   'nsub;'     => chr(8836),
 401   'sube;'     => chr(8838),
 402   'supe;'     => chr(8839),
 403   'oplus;'    => chr(8853),
 404   'otimes;'   => chr(8855),
 405   'perp;'     => chr(8869),
 406   'sdot;'     => chr(8901),
 407   'lceil;'    => chr(8968),
 408   'rceil;'    => chr(8969),
 409   'lfloor;'   => chr(8970),
 410   'rfloor;'   => chr(8971),
 411   'lang;'     => chr(9001),
 412   'rang;'     => chr(9002),
 413   'loz;'      => chr(9674),
 414   'spades;'   => chr(9824),
 415   'clubs;'    => chr(9827),
 416   'hearts;'   => chr(9829),
 417   'diams;'    => chr(9830),
 418  ) : ())
 419 );
 420
 421
 422 # Make the opposite mapping
 423 while (my($entity, $char) = each(%entity2char)) {
 424     $entity =~ s/;\z//;
 425     $char2entity{$char} = "&$entity;";
 426 }
 427 delete $char2entity{"'"};  # only one-way decoding
 428
 429 # Fill in missing entities
 430 for (0 .. 255) {
 431     next if exists $char2entity{chr($_)};
 432     $char2entity{chr($_)} = "&#$_;";
 433 }
 434
 435 my %subst;  # compiled encoding regexps
 436
 437 sub decode_entities_old
 438 {
 439     my $array;
 440     if (defined wantarray) {
 441         $array = [@_]; # copy
 442     } else {
 443         $array = \@_;  # modify in-place
 444     }
 445     my $c;
 446     for (@$array) {
 447         s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
 448         s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
 449         s/(&(\w+);?)/$entity2char{$2} || $1/eg;
 450     }
 451     wantarray ? @$array : $array->[0];
 452 }
 453
 454 sub encode_entities
 455 {
 456     return undef unless defined $_[0];
 457     my $ref;
 458     if (defined wantarray) {
 459         my $x = $_[0];
 460         $ref = \$x;     # copy
 461     } else {
 462         $ref = \$_[0];  # modify in-place
 463     }
 464     if (defined $_[1] and length $_[1]) {
 465         unless (exists $subst{$_[1]}) {
 466             # Because we can't compile regex we fake it with a cached sub
 467             my $chars = $_[1];
 468             $chars =~ s,(?<!\\)([]/]),\\$1,g;
 469             $chars =~ s,(?<!\\)\\\z,\\\\,;
 470             my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
 471             $subst{$_[1]} = eval $code;
 472             die( $@ . " while trying to turn range: \"$_[1]\"\n "
 473               . "into code: $code\n "
 474             ) if $@;
 475         }
 476         &{$subst{$_[1]}}($$ref);
 477     } else {
 478         # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
 479         $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
 480     }
 481     $$ref;
 482 }
 483
 484 sub encode_entities_numeric {
 485     local %char2entity;
 486     return &encode_entities;   # a goto &encode_entities wouldn't work
 487 }
 488
 489
 490 sub num_entity {
 491     sprintf "&#x%X;", ord($_[0]);
 492 }
 493
 494 # Set up aliases
 495 *encode = \&encode_entities;
 496 *encode_numeric = \&encode_entities_numeric;
 497 *encode_numerically = \&encode_entities_numeric;
 498 *decode = \&decode_entities;
 499
 500 1;