diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
commit | 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch) | |
tree | 4734ccd522c71dd455879162006742002f8c1565 /t/entities.t | |
download | HTML-Parser-tarball-master.tar.gz |
HTML-Parser-3.71HEADHTML-Parser-3.71master
Diffstat (limited to 't/entities.t')
-rw-r--r-- | t/entities.t | 213 |
1 files changed, 213 insertions, 0 deletions
# t/entities.t -- regression tests for HTML::Entities
# (decode_entities, encode_entities, encode_entities_numeric).
#
# Source: diff --git a/t/entities.t b/t/entities.t
#         new file mode 100644, index 0000000..f12d2fd
#
# NOTE(review): this copy was rebuilt from a rendered diff in which every
# entity reference had been expanded to the character it names.  The
# references in the string literals and in the DATA section below were
# written out again from the expanded text and from the output format
# documented for HTML::Entities -- compare against the upstream file before
# relying on the exact spelling of an input string.
#
# Characters outside ASCII are written as \x escapes so the expectations do
# not depend on the encoding this file is saved in.

use strict;
use warnings;

use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);

use Test::More tests => 20;

# The decoded form of the Norwegian sample sentence used by several tests.
my $norwegian = "V\xE5re norske tegn b\xF8r \xE6res";

# --- in-place round trip (the functions are called in void context) --------

my $text = "V&aring;re norske tegn b&oslash;r &#230;res";

decode_entities($text);
is($text, $norwegian,
    "decode_entities() expands named and numeric references in place");

encode_entities($text);
is($text, "V&aring;re norske tegn b&oslash;r &aelig;res",
    "encode_entities() writes named references in place");

decode_entities($text);
encode_entities_numeric($text);
is($text, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res",
    "encode_entities_numeric() writes hexadecimal references in place");

# --- markup-significant characters; the return value is checked here -------

$text = "<&>\"'";
is(encode_entities($text), "&lt;&amp;&gt;&quot;&#39;",
    "encode_entities() escapes the markup characters");
is(encode_entities_numeric($text), "&#x3C;&#x26;&#x3E;&#x22;&#x27;",
    "encode_entities_numeric() escapes the markup characters");

# --- explicit "unsafe characters" argument ---------------------------------

$text = "abcdef";
is(encode_entities($text, 'a-c'), "&#97;&#98;&#99;def",
    "a character range is accepted as the unsafe set");

$text = "[24/7]\\";
is(encode_entities($text, '/'), "[24&#47;7]\\",
    "unsafe set: slash");
is(encode_entities($text, '\\/'), "[24&#47;7]\\",
    "unsafe set: escaped slash");
is(encode_entities($text, '\\'), "[24/7]&#92;",
    "unsafe set: lone backslash");
is(encode_entities($text, ']\\'), "[24/7&#93;&#92;",
    "unsafe set: closing bracket and backslash");

# --- See how well it does against rfc1866... --------------------------------

# Build two parallel strings from the <!ENTITY ...> declarations after
# __END__: $ent is the concatenation of "&name;" and $plain the concatenation
# of the characters those names stand for.
my $ent   = "";
my $plain = "";
while (my $line = <DATA>) {
    next unless $line =~ /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/;
    $ent   .= "&$1;";
    $plain .= chr($2);
}

$text = $ent;
decode_entities($text);
is($text, $plain, "every RFC 1866 entity decodes to its character");

# Try decoding when the ";" are left out
$text = $ent;
$text =~ s/;//g;
decode_entities($text);
is($text, $plain, "RFC 1866 entities decode without the closing ';'");

$text = $plain;
encode_entities($text);
is($text, $ent, "every RFC 1866 character encodes to its entity name");

{   # RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144

    my %hash = (
        "V&aring;re norske tegn b&oslash;r &#230;res" => $norwegian,
    );

    # The argument is a hash key taken straight from keys(); the eval
    # records whether the call completes without dying on it.
    my ($got, $eval_ok);
    $eval_ok = eval { $got = decode_entities((keys %hash)[0]); 1 };
    is($eval_ok, 1, "decode_entities() when processing a key as input");
    is($got, (values %hash)[0], "decode_entities() decodes a key properly");
}

# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
# Subject: HTML entities problem with 5.11
# To: libwww-perl@ics.uci.edu
# Date: Fri, 05 Sep 1997 16:56:55 +1000
# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
#
# Hi.  I've got a problem that has surfaced with the changes to
# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08).  It's happening
# in the process of encoding then decoding special entities.  Eg, what goes
# in as "abc&def&ghi" comes out as "abc&def;&ghi;".

is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;",
    "names that are not entities are left alone");

# Decoding of &apos;
is(decode_entities("&apos;"), "'", "&apos; decodes to an apostrophe");
is(encode_entities("'", "'"), "&#39;",
    "an apostrophe encodes to a numeric reference");

# Numeric references above Latin-1, written without the closing ';'.
is(decode_entities("Attention Home&#959&#969n&#1257rs...1&#1109t T&#1110&#1084e E&#957&#1257&#1075"),
    "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}",
    "numeric references above Latin-1 decode");

# An encoded ampersand followed by "amp;" must be decoded only once.
is(decode_entities("{&#38;amp;&#x26;amp;&amp; also &#x42f;&#339;}"),
    "{&amp;&amp;& also \x{42F}\x{153}}",
    "decoding is a single pass");

__END__
# Quoted from rfc1866.txt

14. Proposed Entities

   The HTML DTD references the "Added Latin 1" entity set, which only
   supplies named entities for a subset of the non-ASCII characters in
   [ISO-8859-1], namely the accented characters. The following entities
   should be supported so that all ISO 8859-1 characters may only be
   referenced symbolically. The names for these entities are taken from
   the appendixes of [SGML].

    <!ENTITY nbsp   CDATA "&#160;" -- no-break space -->
    <!ENTITY iexcl  CDATA "&#161;" -- inverted exclamation mark -->
    <!ENTITY cent   CDATA "&#162;" -- cent sign -->
    <!ENTITY pound  CDATA "&#163;" -- pound sterling sign -->
    <!ENTITY curren CDATA "&#164;" -- general currency sign -->
    <!ENTITY yen    CDATA "&#165;" -- yen sign -->
    <!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar -->
    <!ENTITY sect   CDATA "&#167;" -- section sign -->
    <!ENTITY uml    CDATA "&#168;" -- umlaut (dieresis) -->
    <!ENTITY copy   CDATA "&#169;" -- copyright sign -->
    <!ENTITY ordf   CDATA "&#170;" -- ordinal indicator, feminine -->
    <!ENTITY laquo  CDATA "&#171;" -- angle quotation mark, left -->
    <!ENTITY not    CDATA "&#172;" -- not sign -->
    <!ENTITY shy    CDATA "&#173;" -- soft hyphen -->
    <!ENTITY reg    CDATA "&#174;" -- registered sign -->
    <!ENTITY macr   CDATA "&#175;" -- macron -->
    <!ENTITY deg    CDATA "&#176;" -- degree sign -->
    <!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign -->
    <!ENTITY sup2   CDATA "&#178;" -- superscript two -->
    <!ENTITY sup3   CDATA "&#179;" -- superscript three -->
    <!ENTITY acute  CDATA "&#180;" -- acute accent -->
    <!ENTITY micro  CDATA "&#181;" -- micro sign -->
    <!ENTITY para   CDATA "&#182;" -- pilcrow (paragraph sign) -->
    <!ENTITY middot CDATA "&#183;" -- middle dot -->
    <!ENTITY cedil  CDATA "&#184;" -- cedilla -->
    <!ENTITY sup1   CDATA "&#185;" -- superscript one -->
    <!ENTITY ordm   CDATA "&#186;" -- ordinal indicator, masculine -->
    <!ENTITY raquo  CDATA "&#187;" -- angle quotation mark, right -->
    <!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter -->
    <!ENTITY frac12 CDATA "&#189;" -- fraction one-half -->
    <!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters -->
    <!ENTITY iquest CDATA "&#191;" -- inverted question mark -->
    <!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent -->
    <!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent -->
    <!ENTITY Acirc  CDATA "&#194;" -- capital A, circumflex accent -->



Berners-Lee & Connolly      Standards Track                    [Page 75]

RFC 1866            Hypertext Markup Language - 2.0        November 1995


    <!ENTITY Atilde CDATA "&#195;" -- capital A, tilde -->
    <!ENTITY Auml   CDATA "&#196;" -- capital A, dieresis or umlaut mark -->
    <!ENTITY Aring  CDATA "&#197;" -- capital A, ring -->
    <!ENTITY AElig  CDATA "&#198;" -- capital AE diphthong (ligature) -->
    <!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla -->
    <!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent -->
    <!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent -->
    <!ENTITY Ecirc  CDATA "&#202;" -- capital E, circumflex accent -->
    <!ENTITY Euml   CDATA "&#203;" -- capital E, dieresis or umlaut mark -->
    <!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent -->
    <!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent -->
    <!ENTITY Icirc  CDATA "&#206;" -- capital I, circumflex accent -->
    <!ENTITY Iuml   CDATA "&#207;" -- capital I, dieresis or umlaut mark -->
    <!ENTITY ETH    CDATA "&#208;" -- capital Eth, Icelandic -->
    <!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde -->
    <!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent -->
    <!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent -->
    <!ENTITY Ocirc  CDATA "&#212;" -- capital O, circumflex accent -->
    <!ENTITY Otilde CDATA "&#213;" -- capital O, tilde -->
    <!ENTITY Ouml   CDATA "&#214;" -- capital O, dieresis or umlaut mark -->
    <!ENTITY times  CDATA "&#215;" -- multiply sign -->
    <!ENTITY Oslash CDATA "&#216;" -- capital O, slash -->
    <!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent -->
    <!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent -->
    <!ENTITY Ucirc  CDATA "&#219;" -- capital U, circumflex accent -->
    <!ENTITY Uuml   CDATA "&#220;" -- capital U, dieresis or umlaut mark -->
    <!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent -->
    <!ENTITY THORN  CDATA "&#222;" -- capital THORN, Icelandic -->
    <!ENTITY szlig  CDATA "&#223;" -- small sharp s, German (sz ligature) -->
    <!ENTITY agrave CDATA "&#224;" -- small a, grave accent -->
    <!ENTITY aacute CDATA "&#225;" -- small a, acute accent -->
    <!ENTITY acirc  CDATA "&#226;" -- small a, circumflex accent -->
    <!ENTITY atilde CDATA "&#227;" -- small a, tilde -->
    <!ENTITY auml   CDATA "&#228;" -- small a, dieresis or umlaut mark -->
    <!ENTITY aring  CDATA "&#229;" -- small a, ring -->
    <!ENTITY aelig  CDATA "&#230;" -- small ae diphthong (ligature) -->
    <!ENTITY ccedil CDATA "&#231;" -- small c, cedilla -->
    <!ENTITY egrave CDATA "&#232;" -- small e, grave accent -->
    <!ENTITY eacute CDATA "&#233;" -- small e, acute accent -->
    <!ENTITY ecirc  CDATA "&#234;" -- small e, circumflex accent -->
    <!ENTITY euml   CDATA "&#235;" -- small e, dieresis or umlaut mark -->
    <!ENTITY igrave CDATA "&#236;" -- small i, grave accent -->
    <!ENTITY iacute CDATA "&#237;" -- small i, acute accent -->
    <!ENTITY icirc  CDATA "&#238;" -- small i, circumflex accent -->
    <!ENTITY iuml   CDATA "&#239;" -- small i, dieresis or umlaut mark -->
    <!ENTITY eth    CDATA "&#240;" -- small eth, Icelandic -->
    <!ENTITY ntilde CDATA "&#241;" -- small n, tilde -->
    <!ENTITY ograve CDATA "&#242;" -- small o, grave accent -->



Berners-Lee & Connolly      Standards Track                    [Page 76]

RFC 1866            Hypertext Markup Language - 2.0        November 1995


    <!ENTITY oacute CDATA "&#243;" -- small o, acute accent -->
    <!ENTITY ocirc  CDATA "&#244;" -- small o, circumflex accent -->
    <!ENTITY otilde CDATA "&#245;" -- small o, tilde -->
    <!ENTITY ouml   CDATA "&#246;" -- small o, dieresis or umlaut mark -->
    <!ENTITY divide CDATA "&#247;" -- divide sign -->
    <!ENTITY oslash CDATA "&#248;" -- small o, slash -->
    <!ENTITY ugrave CDATA "&#249;" -- small u, grave accent -->
    <!ENTITY uacute CDATA "&#250;" -- small u, acute accent -->
    <!ENTITY ucirc  CDATA "&#251;" -- small u, circumflex accent -->
    <!ENTITY uuml   CDATA "&#252;" -- small u, dieresis or umlaut mark -->
    <!ENTITY yacute CDATA "&#253;" -- small y, acute accent -->
    <!ENTITY thorn  CDATA "&#254;" -- small thorn, Icelandic -->
    <!ENTITY yuml   CDATA "&#255;" -- small y, dieresis or umlaut mark -->