summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author scoder <none@none> 2009-10-17 02:11:42 +0200
committer scoder <none@none> 2009-10-17 02:11:42 +0200
commit a49509f5c24e2d8a7fda8d3feff46c200b97b3da (patch)
tree 1cb537c28d22ddaf7c70b8ce5bfb3756e4e3fc52 /src
parent 0878e377e3859cde852c87cda023a562a407cd95 (diff)
download python-lxml-a49509f5c24e2d8a7fda8d3feff46c200b97b3da.tar.gz
[svn r4237] r5292@delle: sbehnel | 2009-10-17 02:11:33 +0200
fix bug 449926: reverse URL iteration inside of text content to simplify replacements

--HG--
branch : trunk
Diffstat (limited to 'src')
-rw-r--r-- src/lxml/html/__init__.py 37
-rw-r--r-- src/lxml/html/tests/test_rewritelinks.txt 52
2 files changed, 80 insertions, 9 deletions
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index e66b1e86..e822fea6 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -308,6 +308,13 @@ class HtmlMixin(object):
Note: <base href> is *not* taken into account in any way. The
link you get is exactly the link in the document.
+
+ Note: multiple links inside of a single text string or
+ attribute value are returned in reversed order. This makes it
+ possible to replace or delete them from the text string value
+ based on their reported text positions. Otherwise, a
+ modification at one text position can change the positions of
+ links reported later on.
"""
link_attrs = defs.link_attrs
for el in self.iter():
@@ -347,15 +354,29 @@ class HtmlMixin(object):
## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
yield (el, 'value', el.get('value'), 0)
if tag == 'style' and el.text:
- for match in _css_url_re.finditer(el.text):
- url, start = _unquote_match(match.group(1), match.start(1))
- yield (el, None, url, start)
- for match in _css_import_re.finditer(el.text):
- yield (el, None, match.group(1), match.start(1))
+ urls = [
+ _unquote_match(match.group(1), match.start(1))
+ for match in _css_url_re.finditer(el.text)
+ ] + [
+ (match.group(1), match.start(1))
+ for match in _css_import_re.finditer(el.text)
+ ]
+ if urls:
+ # sort by start pos to bring both match sets back into order
+ urls = [ (start, url) for (url, start) in urls ]
+ urls.sort()
+ # reverse the list to report correct positions despite
+ # modifications
+ urls.reverse()
+ for start, url in urls:
+ yield (el, None, url, start)
if 'style' in attribs:
- for match in _css_url_re.finditer(attribs['style']):
- url, start = _unquote_match(match.group(1), match.start(1))
- yield (el, 'style', url, start)
+ urls = list(_css_url_re.finditer(attribs['style']))
+ if urls:
+ # return in reversed order to simplify in-place modifications
+ for match in urls[::-1]:
+ url, start = _unquote_match(match.group(1), match.start(1))
+ yield (el, 'style', url, start)
def rewrite_links(self, link_repl_func, resolve_base_href=True,
base_href=None):
diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
index 1ba2dc5f..43dd99da 100644
--- a/src/lxml/html/tests/test_rewritelinks.txt
+++ b/src/lxml/html/tests/test_rewritelinks.txt
@@ -1,3 +1,8 @@
+
+Setup::
+
+ >>> import lxml.html
+
We'll define a link translation function:
>>> base_href = 'http://old/base/path.html'
@@ -118,8 +123,8 @@ link)``, which is awkward to test here, so we'll make a printer::
... </table>
... </body></html>'''))
link href="style.css"
- style None="/bg.gif"@40
style None="/other-styles.css"@69
+ style None="/bg.gif"@40
script src="/js-funcs.js"
a href="/test.html"
a href="/other.html"
@@ -179,3 +184,48 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
</table>
</body>
</html>
+
+Check if we can replace multiple links inside of the same text string::
+
+ >>> html = lxml.html.fromstring ("""\
+ ... <html>
+ ... <head>
+ ... <title>Test</title>
+ ... <style type='text/css'>
+ ... .bg1 {
+ ... background: url(images/bg1.png);
+ ... }
+ ... .bg2 {
+ ... background: url(images/bg2.png);
+ ... }
+ ... </style>
+ ... </head>
+ ... <body>
+ ... <p>Hi</p>
+ ... </body>
+ ... </html>
+ ... """,
+ ... base_url = 'http://www.example.com/')
+
+ >>> html.make_links_absolute ()
+
+ >>> try: _unicode = unicode
+ ... except NameError: _unicode = str
+
+ >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
+ <html>
+ <head>
+ <title>Test</title>
+ <style type="text/css">
+ .bg1 {
+ background: url(http://www.example.com/images/bg1.png);
+ }
+ .bg2 {
+ background: url(http://www.example.com/images/bg2.png);
+ }
+ </style>
+ </head>
+ <body>
+ <p>Hi</p>
+ </body>
+ </html>