# src/lxml/html/tests/test_feedparser_data.py

import os
import re
try:
    from rfc822 import Message
except ImportError:
    # Python 3
    from email import message_from_file as Message
import unittest
from lxml.tests.common_imports import doctest
from lxml.doctestcompare import LHTMLOutputChecker

from lxml.html.clean import clean, Cleaner

# Fixture directories, resolved relative to the directory of this module.
feed_dirs = list(
    os.path.join(os.path.dirname(__file__), data_dir)
    for data_dir in ('feedparser-data', 'hackers-org-data')
)

# A run of five or more dashes; fixture bodies are split on this pattern.
bar_re = re.compile(r"-----+")

class DummyInput:
    """Plain attribute bag: every keyword argument becomes an attribute."""

    def __init__(self, **kw):
        # Store all keyword arguments as instance attributes in one step.
        self.__dict__.update(kw)

class FeedTestCase(unittest.TestCase):
    """Test case driven by a single fixture file.

    The fixture starts with a header section (parsed by ``Message``)
    followed by a body.  The body holds the input markup and, after a
    separator line matched by ``bar_re``, the expected output.
    """

    def __init__(self, filename):
        """Remember the fixture path; the file is only read in ``parse()``."""
        self.filename = filename
        unittest.TestCase.__init__(self)

    def parse(self):
        """Read the fixture file and populate the test attributes.

        Sets ``description``, ``ignore``, ``options``, ``input`` and
        ``expect`` on the instance.  ``expect`` is ``None`` when the body
        contains no separator and therefore no expected-output section.

        Raises ``Exception`` if the file has no headers at all.
        """
        f = open(self.filename, 'r')
        try:
            headers = Message(f)
            c = f.read()
        finally:
            # Always release the file handle, even when header parsing or
            # reading the body fails.
            f.close()
        if not c.strip():
            # Nothing was left to read after the headers: the header parser
            # consumed the whole file (email.message_from_file does this),
            # so take the body from the parsed message instead.
            c = headers.get_payload()
        if not headers.keys():
            raise Exception(
                "File %s has no headers" % self.filename)
        self.description = headers['Description']
        self.ignore = headers.get('Ignore')
        # 'Options' is a comma-separated list; blank entries are dropped.
        self.options = [
            o.strip() for o in headers.get('Options', '').split(',')
            if o.strip()]
        parts = bar_re.split(c)
        self.input = parts[0].rstrip() + '\n'
        if parts[1:]:
            self.expect = parts[1].rstrip() + '\n'
        else:
            self.expect = None

    def runTest(self):
        """Run the fixture input through the cleaner and compare the result.

        Each option becomes a keyword argument: a leading ``-`` maps the
        remaining name to ``False``, any other name maps to ``True``.
        Unless the ``clean`` option is false, the keywords are passed to
        ``Cleaner`` and the input is cleaned; otherwise the input is
        compared unchanged.

        Raises ``AssertionError`` if the fixture has no expected output and
        ``Exception`` (carrying the checker's difference report) if the
        output does not match.
        """
        self.parse()
        if self.ignore:
            # We've marked this test to be ignored.
            return
        kw = {}
        for name in self.options:
            if name.startswith('-'):
                kw[name[1:]] = False
            else:
                kw[name] = True
        if kw.get('clean', True):
            transformed = Cleaner(**kw).clean_html(self.input)
        else:
            transformed = self.input
        if self.expect is None:
            # Raised explicitly (rather than via ``assert``) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(
                "No expected output in %s" % self.filename)
        checker = LHTMLOutputChecker()
        if not checker.check_output(self.expect, transformed, 0):
            result = checker.output_difference(
                DummyInput(want=self.expect), transformed, 0)
            raise Exception("\n"+result)

    def shortDescription(self):
        """Identify the test by the path of its fixture file."""
        return self.filename

def test_suite():
    """Build a suite with one ``FeedTestCase`` per ``.data`` fixture file.

    Every case is also executed once while the suite is being built; as
    soon as one raises, no further cases from that directory are added
    (the failing case itself stays in the suite).  Scanning then continues
    with the next directory in ``feed_dirs``.

    Returns the populated ``unittest.TestSuite``.
    """
    suite = unittest.TestSuite()
    for feed_dir in feed_dirs:
        # os.listdir() returns entries in arbitrary order; sort them so the
        # point at which the directory scan stops is reproducible.
        for entry in sorted(os.listdir(feed_dir)):
            path = os.path.join(feed_dir, entry)
            if not path.endswith('.data'):
                continue
            case = FeedTestCase(path)
            suite.addTests([case])
            # This is my lazy way of stopping on first error:
            try:
                case.runTest()
            except Exception:
                # Only ordinary errors stop the scan; KeyboardInterrupt and
                # SystemExit are allowed to propagate.
                break
    return suite