HTML-Parser-3.71HEAD HTML-Parser-3.71 master

author: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
committer: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
commit: 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree: 4734ccd522c71dd455879162006742002f8c1565 /t/headparser.t
download: HTML-Parser-tarball-master.tar.gz
1 files changed, 200 insertions, 0 deletions
diff --git a/t/headparser.t b/t/headparser.t
new file mode 100644
index 0000000..1b4b810
--- /dev/null
+++ b/t/headparser.t
@@ -0,0 +1,200 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 17;
+
+{ package H;
+  sub new { bless {}, shift; }
+
+  sub header {
+     my $self = shift;
+     my $key  = uc(shift);
+     die if $key =~ /:/;
+     my $old = $self->{$key};
+     if (@_) { $self->{$key} = shift; }
+     $old;
+  }
+
+  sub push_header {
+     my($self, $k, $v) = @_;
+     $k = uc($k);
+     die if $k =~ /:/;
+     if (exists $self->{$k}) {
+        $self->{$k} = [ $self->{$k} ] unless ref $self->{$k};
+	push(@{$self->{$k}}, $v);
+     } else {
+	$self->{$k} = $v;
+     }
+  }
+
+  sub as_string {
+     my $self = shift;
+     my $str = "";
+     for (sort keys %$self) {
+         if (ref($self->{$_})) {
+            my $v;
+            for $v (@{$self->{$_}}) {
+	        $str .= "$_: $v\n";
+            }
+         } else {
+            $str .= "$_: $self->{$_}\n";
+         }
+     }
+     $str;
+  }
+}
+
+
+my $HTML = <<'EOT';
+
+<title>&Aring være eller &#229; ikke være</title>
+<meta http-equiv="Expires" content="Soon">
+<meta http-equiv="Foo" content="Bar">
+<meta name='twitter:card' content='photo' />
+<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas">
+
+<script>
+
+    ignore this
+
+</script>
+<noscript> ... and this </noscript>
+
+<object classid="foo">
+
+<base href="http://www.sn.no">
+<meta name="Keywords" content="test, test, test,...">
+<meta name="Keywords" content="more">
+<meta charset="ISO-8859-1"><!-- HTML 5 -->
+
+Dette er vanlig tekst.  Denne teksten definerer også slutten på
+&lt;head> delen av dokumentet.
+
+<style>
+
+   ignore this too
+
+</style>
+
+<isindex>
+
+Dette er også vanlig tekst som ikke skal blir parset i det hele tatt.
+
+EOT
+
+$| = 1;
+
+#$HTML::HeadParser::DEBUG = 1;
+require HTML::HeadParser;
+my $p = HTML::HeadParser->new( H->new );
+
+if ($p->parse($HTML)) {
+    fail("Need more data which should not happen");
+} else {
+    #diag $p->as_string;
+    pass();
+}
+
+like($p->header('Title'), qr/Å være eller å ikke være/);
+is($p->header('Expires'), 'Soon');
+is($p->header('Content-Base'), 'http://www.sn.no');
+is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']);
+is($p->header('X-Meta-Charset'), 'ISO-8859-1');
+is($p->header('X-Meta-Twitter-Card'), 'photo');
+like($p->header('Link'), qr/<mailto:gisle\@aas.no>/);
+
+# This header should not be present because the head ended
+ok(!$p->header('Isindex'));
+
+
+# Try feeding one char at a time
+my $expected = $p->as_string;
+my $nl = 1;
+$p = HTML::HeadParser->new(H->new);
+while ($HTML =~ /(.)/sg) {
+    #print STDERR '#' if $nl;
+    #print STDERR $1;
+    $nl = $1 eq "\n";
+    $p->parse($1) or last;
+}
+is($p->as_string, $expected);
+
+
+# Try reading it from a file
+my $file = "hptest$$.html";
+die "$file already exists" if -e $file;
+
+open(FILE, ">$file") or die "Can't create $file: $!";
+binmode(FILE);
+print FILE $HTML;
+print FILE "<p>This is more content...</p>\n" x 2000;
+print FILE "<title>Buuuh!</title>\n" x 200;
+close FILE or die "Can't close $file: $!";
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+is($p->header("Title"), "Å være eller å ikke være");
+
+
+# We got into an infinite loop on data without tags and no EOL.
+# This was actually a HTML::Parser bug.
+open(FILE, ">$file") or die "Can't create $file: $!";
+print FILE "Foo";
+close(FILE);
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+ok(!$p->as_string);
+
+SKIP: {
+  skip "Need Unicode support", 5 if $] < 5.008;
+
+  # Test that the Unicode BOM does not confuse us?
+  $p = HTML::HeadParser->new(H->new);
+  ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"));
+  $p->eof;
+
+  is($p->header("title"), "Hi <foo>");
+
+  $p = HTML::HeadParser->new(H->new);
+  $p->utf8_mode(1);
+  $p->parse(<<"EOT");  # example from http://rt.cpan.org/Ticket/Display.html?id=27522
+\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+ <head>
+ <title>
+Parkinson's disease</title>
+ <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications">
+ </meta>
+ \t
+\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" />
+\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" />
+\t
+\t </head>
+ <body>
+EOT
+  $p->eof;
+
+  is($p->header("title"), "Parkinson's disease");
+  is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"');
+
+  $p = HTML::HeadParser->new(H->new);
+  $p->utf8_mode(1);
+  $p->parse(<<"EOT");   # example from http://www.mjw.com.pl/
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
+\r
+<head profile="http://gmpg.org/xfn/11">\r
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
+\r
+<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa &#8211; MJW</title>\r
+<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r
+
+EOT
+    $p->eof;
+    is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW");
+}
author	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
committer	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
commit	2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree	4734ccd522c71dd455879162006742002f8c1565 /t/headparser.t
download	HTML-Parser-tarball-master.tar.gz