diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
commit | 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch) | |
tree | 4734ccd522c71dd455879162006742002f8c1565 /t/headparser.t | |
download | HTML-Parser-tarball-master.tar.gz |
HTML-Parser-3.71HEADHTML-Parser-3.71master
Diffstat (limited to 't/headparser.t')
-rw-r--r-- | t/headparser.t | 200 |
1 files changed, 200 insertions, 0 deletions
diff --git a/t/headparser.t b/t/headparser.t new file mode 100644 index 0000000..1b4b810 --- /dev/null +++ b/t/headparser.t @@ -0,0 +1,200 @@ +#!perl -w + +use strict; +use Test::More tests => 17; + +{ package H; + sub new { bless {}, shift; } + + sub header { + my $self = shift; + my $key = uc(shift); + die if $key =~ /:/; + my $old = $self->{$key}; + if (@_) { $self->{$key} = shift; } + $old; + } + + sub push_header { + my($self, $k, $v) = @_; + $k = uc($k); + die if $k =~ /:/; + if (exists $self->{$k}) { + $self->{$k} = [ $self->{$k} ] unless ref $self->{$k}; + push(@{$self->{$k}}, $v); + } else { + $self->{$k} = $v; + } + } + + sub as_string { + my $self = shift; + my $str = ""; + for (sort keys %$self) { + if (ref($self->{$_})) { + my $v; + for $v (@{$self->{$_}}) { + $str .= "$_: $v\n"; + } + } else { + $str .= "$_: $self->{$_}\n"; + } + } + $str; + } +} + + +my $HTML = <<'EOT'; + +<title>Å være eller å ikke være</title> +<meta http-equiv="Expires" content="Soon"> +<meta http-equiv="Foo" content="Bar"> +<meta name='twitter:card' content='photo' /> +<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas"> + +<script> + + ignore this + +</script> +<noscript> ... and this </noscript> + +<object classid="foo"> + +<base href="http://www.sn.no"> +<meta name="Keywords" content="test, test, test,..."> +<meta name="Keywords" content="more"> +<meta charset="ISO-8859-1"><!-- HTML 5 --> + +Dette er vanlig tekst. Denne teksten definerer også slutten på +<head> delen av dokumentet. + +<style> + + ignore this too + +</style> + +<isindex> + +Dette er også vanlig tekst som ikke skal blir parset i det hele tatt. + +EOT + +$| = 1; + +#$HTML::HeadParser::DEBUG = 1; +require HTML::HeadParser; +my $p = HTML::HeadParser->new( H->new ); + +if ($p->parse($HTML)) { + fail("Need more data which should not happen"); +} else { + #diag $p->as_string; + pass(); +} + +like($p->header('Title'), qr/Å være eller å ikke være/); +is($p->header('Expires'), 'Soon'); +is($p->header('Content-Base'), 'http://www.sn.no'); +is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']); +is($p->header('X-Meta-Charset'), 'ISO-8859-1'); +is($p->header('X-Meta-Twitter-Card'), 'photo'); +like($p->header('Link'), qr/<mailto:gisle\@aas.no>/); + +# This header should not be present because the head ended +ok(!$p->header('Isindex')); + + +# Try feeding one char at a time +my $expected = $p->as_string; +my $nl = 1; +$p = HTML::HeadParser->new(H->new); +while ($HTML =~ /(.)/sg) { + #print STDERR '#' if $nl; + #print STDERR $1; + $nl = $1 eq "\n"; + $p->parse($1) or last; +} +is($p->as_string, $expected); + + +# Try reading it from a file +my $file = "hptest$$.html"; +die "$file already exists" if -e $file; + +open(FILE, ">$file") or die "Can't create $file: $!"; +binmode(FILE); +print FILE $HTML; +print FILE "<p>This is more content...</p>\n" x 2000; +print FILE "<title>Buuuh!</title>\n" x 200; +close FILE or die "Can't close $file: $!"; + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +is($p->header("Title"), "Å være eller å ikke være"); + + +# We got into an infinite loop on data without tags and no EOL. +# This was actually a HTML::Parser bug. +open(FILE, ">$file") or die "Can't create $file: $!"; +print FILE "Foo"; +close(FILE); + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +ok(!$p->as_string); + +SKIP: { + skip "Need Unicode support", 5 if $] < 5.008; + + # Test that the Unicode BOM does not confuse us? + $p = HTML::HeadParser->new(H->new); + ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>")); + $p->eof; + + is($p->header("title"), "Hi <foo>"); + + $p = HTML::HeadParser->new(H->new); + $p->utf8_mode(1); + $p->parse(<<"EOT"); # example from http://rt.cpan.org/Ticket/Display.html?id=27522 +\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html> + <head> + <title> +Parkinson's disease</title> + <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications"> + </meta> + \t +\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" /> +\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" /> +\t +\t </head> + <body> +EOT + $p->eof; + + is($p->header("title"), "Parkinson's disease"); + is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"'); + + $p = HTML::HeadParser->new(H->new); + $p->utf8_mode(1); + $p->parse(<<"EOT"); # example from http://www.mjw.com.pl/ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r +\r +<head profile="http://gmpg.org/xfn/11">\r +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r +\r +<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa – MJW</title>\r +<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r + +EOT + $p->eof; + is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW"); +} |