diff options
| author | scoder <none@none> | 2009-10-17 02:11:42 +0200 |
|---|---|---|
| committer | scoder <none@none> | 2009-10-17 02:11:42 +0200 |
| commit | a49509f5c24e2d8a7fda8d3feff46c200b97b3da (patch) | |
| tree | 1cb537c28d22ddaf7c70b8ce5bfb3756e4e3fc52 /src | |
| parent | 0878e377e3859cde852c87cda023a562a407cd95 (diff) | |
| download | python-lxml-a49509f5c24e2d8a7fda8d3feff46c200b97b3da.tar.gz | |
[svn r4237] r5292@delle: sbehnel | 2009-10-17 02:11:33 +0200
fix bug 449926: reverse URL iteration inside of text content to simplify replacements
--HG--
branch : trunk
Diffstat (limited to 'src')
| -rw-r--r-- | src/lxml/html/__init__.py | 37 | ||||
| -rw-r--r-- | src/lxml/html/tests/test_rewritelinks.txt | 52 |
2 files changed, 80 insertions, 9 deletions
```diff
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index e66b1e86..e822fea6 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -308,6 +308,13 @@ class HtmlMixin(object):
 
         Note: <base href> is *not* taken into account in any way. The
         link you get is exactly the link in the document.
+
+        Note: multiple links inside of a single text string or
+        attribute value are returned in reversed order. This makes it
+        possible to replace or delete them from the text string value
+        based on their reported text positions. Otherwise, a
+        modification at one text position can change the positions of
+        links reported later on.
         """
         link_attrs = defs.link_attrs
         for el in self.iter():
@@ -347,15 +354,29 @@ class HtmlMixin(object):
                     ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                     yield (el, 'value', el.get('value'), 0)
             if tag == 'style' and el.text:
-                for match in _css_url_re.finditer(el.text):
-                    url, start = _unquote_match(match.group(1), match.start(1))
-                    yield (el, None, url, start)
-                for match in _css_import_re.finditer(el.text):
-                    yield (el, None, match.group(1), match.start(1))
+                urls = [
+                    _unquote_match(match.group(1), match.start(1))
+                    for match in _css_url_re.finditer(el.text)
+                    ] + [
+                    (match.group(1), match.start(1))
+                    for match in _css_import_re.finditer(el.text)
+                    ]
+                if urls:
+                    # sort by start pos to bring both match sets back into order
+                    urls = [ (start, url) for (url, start) in urls ]
+                    urls.sort()
+                    # reverse the list to report correct positions despite
+                    # modifications
+                    urls.reverse()
+                    for start, url in urls:
+                        yield (el, None, url, start)
             if 'style' in attribs:
-                for match in _css_url_re.finditer(attribs['style']):
-                    url, start = _unquote_match(match.group(1), match.start(1))
-                    yield (el, 'style', url, start)
+                urls = list(_css_url_re.finditer(attribs['style']))
+                if urls:
+                    # return in reversed order to simplify in-place modifications
+                    for match in urls[::-1]:
+                        url, start = _unquote_match(match.group(1), match.start(1))
+                        yield (el, 'style', url, start)
 
     def rewrite_links(self, link_repl_func, resolve_base_href=True,
                       base_href=None):
diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
index 1ba2dc5f..43dd99da 100644
--- a/src/lxml/html/tests/test_rewritelinks.txt
+++ b/src/lxml/html/tests/test_rewritelinks.txt
@@ -1,3 +1,8 @@
+
+Setup::
+
+    >>> import lxml.html
+
 We'll define a link translation function:
 
     >>> base_href = 'http://old/base/path.html'
@@ -118,8 +123,8 @@ link)``, which is awkward to test here, so we'll make a printer::
     ... </table>
     ... </body></html>'''))
     link href="style.css"
-    style None="/bg.gif"@40
     style None="/other-styles.css"@69
+    style None="/bg.gif"@40
     script src="/js-funcs.js"
     a href="/test.html"
     a href="/other.html"
@@ -179,3 +184,48 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
      </table>
      </body>
     </html>
+
+Check if we can replace multiple links inside of the same text string::
+
+    >>> html = lxml.html.fromstring ("""\
+    ... <html>
+    ...   <head>
+    ...     <title>Test</title>
+    ...     <style type='text/css'>
+    ...       .bg1 {
+    ...         background: url(images/bg1.png);
+    ...       }
+    ...       .bg2 {
+    ...         background: url(images/bg2.png);
+    ...       }
+    ...     </style>
+    ...   </head>
+    ...   <body>
+    ...     <p>Hi</p>
+    ...   </body>
+    ... </html>
+    ... """,
+    ... base_url = 'http://www.example.com/')
+
+    >>> html.make_links_absolute ()
+
+    >>> try: _unicode = unicode
+    ... except NameError: _unicode = str
+
+    >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
+    <html>
+      <head>
+        <title>Test</title>
+        <style type="text/css">
+          .bg1 {
+            background: url(http://www.example.com/images/bg1.png);
+          }
+          .bg2 {
+            background: url(http://www.example.com/images/bg2.png);
+          }
+        </style>
+      </head>
+      <body>
+        <p>Hi</p>
+      </body>
+    </html>
```
