1 package HTML::Entities;
5 HTML::Entities - Encode or decode strings with HTML entities
11 $a = "Våre norske tegn bør æres";
13 encode_entities($a, "\200-\377");
17 $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
18 print encode_entities($input), "\n"
22 vis-&agrave;-vis Beyonc&eacute;&#39;s na&iuml;ve
23 papier-m&acirc;ch&eacute; r&eacute;sum&eacute;
27 This module deals with encoding and decoding of strings with HTML
28 character entities. The module provides the following functions:
32 =item decode_entities( $string, ... )
34 This routine replaces HTML entities found in the $string with the
35 corresponding Unicode character. Under perl 5.6 and earlier only
36 characters in the Latin-1 range are replaced. Unrecognized
37 entities are left alone.
39 If multiple strings are provided as arguments they are each decoded
40 separately and the same number of strings are returned.
42 If called in void context the arguments are decoded in-place.
44 This routine is exported by default.
46 =item _decode_entities( $string, \%entity2char )
48 =item _decode_entities( $string, \%entity2char, $expand_prefix )
50 This will in-place replace HTML entities in $string. The %entity2char
51 hash must be provided. Named entities not found in the %entity2char
52 hash are left alone. Numeric entities are expanded unless their value
55 The keys in %entity2char are the entity names to be expanded and their
56 values are what they should expand into. The values do not have to be
57 single character strings. If a key has ";" as suffix,
58 then occurrences in $string are only expanded if properly terminated
59 with ";". Entities without ";" will be expanded regardless of how
60 they are terminated for compatibility with how common browsers treat
61 entities in the Latin-1 range.
63 If $expand_prefix is TRUE then entities without trailing ";" in
64 %entity2char will even be expanded as a prefix of a longer
65 unrecognized name. The longest matching name in %entity2char will be
66 used. This is mainly present for compatibility with an MSIE
69 $string = "foo&nbspbar";
70 _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
71 print $string; # will print "foo bar"
73 This routine is exported by default.
75 =item encode_entities( $string )
77 =item encode_entities( $string, $unsafe_chars )
79 This routine replaces unsafe characters in $string with their entity
80 representation. A second argument can be given to specify which characters to
81 consider unsafe. The unsafe characters are specified using the regular
82 expression character class syntax (what you find within brackets in regular
85 The default set of characters to encode are control chars, high-bit chars, and
86 the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
87 for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
90 $encoded = encode_entities($input, '<>&"');
92 and this would only encode non-plain ascii:
94 $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
96 This routine is exported by default.
98 =item encode_entities_numeric( $string )
100 =item encode_entities_numeric( $string, $unsafe_chars )
102 This routine works just like encode_entities, except that the replacement
103 entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
104 example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
105 C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
107 This routine is I<not> exported by default. But you can always
108 export it with C<use HTML::Entities qw(encode_entities_numeric);>
109 or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
113 All these routines modify the string passed as the first argument, if
114 called in a void context. In scalar and list contexts, the encoded or
115 decoded string is returned (without changing the input string).
117 If you prefer not to import these routines into your namespace, you can
120 use HTML::Entities ();
121 $decoded = HTML::Entities::decode($a);
122 $encoded = HTML::Entities::encode($a);
123 $encoded = HTML::Entities::encode_numeric($a);
125 The module can also export the %char2entity and the %entity2char
126 hashes, which contain the mapping from all characters to the
127 corresponding entities (and vice versa, respectively).
131 Copyright 1995-2006 Gisle Aas. All rights reserved.
133 This library is free software; you can redistribute it and/or
134 modify it under the same terms as Perl itself.
139 use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
140 use vars qw(%entity2char %char2entity);
146 @EXPORT = qw(encode_entities decode_entities _decode_entities);
147 @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
150 sub Version { $VERSION; }  # Returns the package variable $VERSION (the value of the sub's last statement).
152 require HTML::Parser; # for fast XS implemented decode_entities
156 # Some normal chars that have special meaning in SGML context
157 amp => '&', # ampersand
158 'gt' => '>', # greater than
159 'lt' => '<', # less than
160 quot => '"', # double quote
161 apos => "'", # single quote
163 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
164 AElig => chr(198), # capital AE diphthong (ligature)
165 Aacute => chr(193), # capital A, acute accent
166 Acirc => chr(194), # capital A, circumflex accent
167 Agrave => chr(192), # capital A, grave accent
168 Aring => chr(197), # capital A, ring
169 Atilde => chr(195), # capital A, tilde
170 Auml => chr(196), # capital A, dieresis or umlaut mark
171 Ccedil => chr(199), # capital C, cedilla
172 ETH => chr(208), # capital Eth, Icelandic
173 Eacute => chr(201), # capital E, acute accent
174 Ecirc => chr(202), # capital E, circumflex accent
175 Egrave => chr(200), # capital E, grave accent
176 Euml => chr(203), # capital E, dieresis or umlaut mark
177 Iacute => chr(205), # capital I, acute accent
178 Icirc => chr(206), # capital I, circumflex accent
179 Igrave => chr(204), # capital I, grave accent
180 Iuml => chr(207), # capital I, dieresis or umlaut mark
181 Ntilde => chr(209), # capital N, tilde
182 Oacute => chr(211), # capital O, acute accent
183 Ocirc => chr(212), # capital O, circumflex accent
184 Ograve => chr(210), # capital O, grave accent
185 Oslash => chr(216), # capital O, slash
186 Otilde => chr(213), # capital O, tilde
187 Ouml => chr(214), # capital O, dieresis or umlaut mark
188 THORN => chr(222), # capital THORN, Icelandic
189 Uacute => chr(218), # capital U, acute accent
190 Ucirc => chr(219), # capital U, circumflex accent
191 Ugrave => chr(217), # capital U, grave accent
192 Uuml => chr(220), # capital U, dieresis or umlaut mark
193 Yacute => chr(221), # capital Y, acute accent
194 aacute => chr(225), # small a, acute accent
195 acirc => chr(226), # small a, circumflex accent
196 aelig => chr(230), # small ae diphthong (ligature)
197 agrave => chr(224), # small a, grave accent
198 aring => chr(229), # small a, ring
199 atilde => chr(227), # small a, tilde
200 auml => chr(228), # small a, dieresis or umlaut mark
201 ccedil => chr(231), # small c, cedilla
202 eacute => chr(233), # small e, acute accent
203 ecirc => chr(234), # small e, circumflex accent
204 egrave => chr(232), # small e, grave accent
205 eth => chr(240), # small eth, Icelandic
206 euml => chr(235), # small e, dieresis or umlaut mark
207 iacute => chr(237), # small i, acute accent
208 icirc => chr(238), # small i, circumflex accent
209 igrave => chr(236), # small i, grave accent
210 iuml => chr(239), # small i, dieresis or umlaut mark
211 ntilde => chr(241), # small n, tilde
212 oacute => chr(243), # small o, acute accent
213 ocirc => chr(244), # small o, circumflex accent
214 ograve => chr(242), # small o, grave accent
215 oslash => chr(248), # small o, slash
216 otilde => chr(245), # small o, tilde
217 ouml => chr(246), # small o, dieresis or umlaut mark
218 szlig => chr(223), # small sharp s, German (sz ligature)
219 thorn => chr(254), # small thorn, Icelandic
220 uacute => chr(250), # small u, acute accent
221 ucirc => chr(251), # small u, circumflex accent
222 ugrave => chr(249), # small u, grave accent
223 uuml => chr(252), # small u, dieresis or umlaut mark
224 yacute => chr(253), # small y, acute accent
225 yuml => chr(255), # small y, dieresis or umlaut mark
227 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
228 copy => chr(169), # copyright sign
229 reg => chr(174), # registered sign
230 nbsp => chr(160), # non breaking space
232 # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
243 'not' => chr(172), # not is a keyword in perl
262 'times' => chr(215), # times is a keyword in perl
266 'OElig;' => chr(338),
267 'oelig;' => chr(339),
268 'Scaron;' => chr(352),
269 'scaron;' => chr(353),
273 'tilde;' => chr(732),
274 'Alpha;' => chr(913),
276 'Gamma;' => chr(915),
277 'Delta;' => chr(916),
278 'Epsilon;' => chr(917),
281 'Theta;' => chr(920),
283 'Kappa;' => chr(922),
284 'Lambda;' => chr(923),
288 'Omicron;' => chr(927),
291 'Sigma;' => chr(931),
293 'Upsilon;' => chr(933),
297 'Omega;' => chr(937),
298 'alpha;' => chr(945),
300 'gamma;' => chr(947),
301 'delta;' => chr(948),
302 'epsilon;' => chr(949),
305 'theta;' => chr(952),
307 'kappa;' => chr(954),
308 'lambda;' => chr(955),
312 'omicron;' => chr(959),
315 'sigmaf;' => chr(962),
316 'sigma;' => chr(963),
318 'upsilon;' => chr(965),
322 'omega;' => chr(969),
323 'thetasym;' => chr(977),
324 'upsih;' => chr(978),
326 'ensp;' => chr(8194),
327 'emsp;' => chr(8195),
328 'thinsp;' => chr(8201),
329 'zwnj;' => chr(8204),
333 'ndash;' => chr(8211),
334 'mdash;' => chr(8212),
335 'lsquo;' => chr(8216),
336 'rsquo;' => chr(8217),
337 'sbquo;' => chr(8218),
338 'ldquo;' => chr(8220),
339 'rdquo;' => chr(8221),
340 'bdquo;' => chr(8222),
341 'dagger;' => chr(8224),
342 'Dagger;' => chr(8225),
343 'bull;' => chr(8226),
344 'hellip;' => chr(8230),
345 'permil;' => chr(8240),
346 'prime;' => chr(8242),
347 'Prime;' => chr(8243),
348 'lsaquo;' => chr(8249),
349 'rsaquo;' => chr(8250),
350 'oline;' => chr(8254),
351 'frasl;' => chr(8260),
352 'euro;' => chr(8364),
353 'image;' => chr(8465),
354 'weierp;' => chr(8472),
355 'real;' => chr(8476),
356 'trade;' => chr(8482),
357 'alefsym;' => chr(8501),
358 'larr;' => chr(8592),
359 'uarr;' => chr(8593),
360 'rarr;' => chr(8594),
361 'darr;' => chr(8595),
362 'harr;' => chr(8596),
363 'crarr;' => chr(8629),
364 'lArr;' => chr(8656),
365 'uArr;' => chr(8657),
366 'rArr;' => chr(8658),
367 'dArr;' => chr(8659),
368 'hArr;' => chr(8660),
369 'forall;' => chr(8704),
370 'part;' => chr(8706),
371 'exist;' => chr(8707),
372 'empty;' => chr(8709),
373 'nabla;' => chr(8711),
374 'isin;' => chr(8712),
375 'notin;' => chr(8713),
377 'prod;' => chr(8719),
379 'minus;' => chr(8722),
380 'lowast;' => chr(8727),
381 'radic;' => chr(8730),
382 'prop;' => chr(8733),
383 'infin;' => chr(8734),
390 'there4;' => chr(8756),
392 'cong;' => chr(8773),
393 'asymp;' => chr(8776),
395 'equiv;' => chr(8801),
400 'nsub;' => chr(8836),
401 'sube;' => chr(8838),
402 'supe;' => chr(8839),
403 'oplus;' => chr(8853),
404 'otimes;' => chr(8855),
405 'perp;' => chr(8869),
406 'sdot;' => chr(8901),
407 'lceil;' => chr(8968),
408 'rceil;' => chr(8969),
409 'lfloor;' => chr(8970),
410 'rfloor;' => chr(8971),
411 'lang;' => chr(9001),
412 'rang;' => chr(9002),
414 'spades;' => chr(9824),
415 'clubs;' => chr(9827),
416 'hearts;' => chr(9829),
417 'diams;' => chr(9830),
422 # Make the opposite mapping
423 while (my($entity, $char) = each(%entity2char)) {
425 $char2entity{$char} = "&$entity;";
427 delete $char2entity{"'"}; # only one-way decoding
429 # Fill in missing entities
431 next if exists $char2entity{chr($_)};
432 $char2entity{chr($_)} = "&#$_;";
435 my %subst; # compiled encoding regexps
437 sub decode_entities_old
440 if (defined wantarray) {
441 $array = [@_]; # copy
443 $array = \@_; # modify in-place
447 s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
448 s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
449 s/(&(\w+);?)/$entity2char{$2} || $1/eg;
451 wantarray ? @$array : $array->[0];
456 return undef unless defined $_[0];
458 if (defined wantarray) {
462 $ref = \$_[0]; # modify in-place
464 if (defined $_[1] and length $_[1]) {
465 unless (exists $subst{$_[1]}) {
466 # Because we can't compile regex we fake it with a cached sub
468 $chars =~ s,(?<!\\)([]/]),\\$1,g;
469 $chars =~ s,(?<!\\)\\\z,\\\\,;
470 my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
471 $subst{$_[1]} = eval $code;
472 die( $@ . " while trying to turn range: \"$_[1]\"\n "
473 . "into code: $code\n "
476 &{$subst{$_[1]}}($$ref);
478 # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
479 $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
484 sub encode_entities_numeric {
486 return &encode_entities; # a goto &encode_entities wouldn't work
491 sprintf "&#x%X;", ord($_[0]);
495 *encode = \&encode_entities;  # short alias: HTML::Entities::encode() is encode_entities()
496 *encode_numeric = \&encode_entities_numeric;  # short alias for encode_entities_numeric()
497 *encode_numerically = \&encode_entities_numeric;  # second spelling of the same alias
498 *decode = \&decode_entities;  # short alias: HTML::Entities::decode() is decode_entities()