[svn r4237] r5292@delle: sbehnel | 2009-10-17 02:11:33 +0200

fix bug 449926: reverse URL iteration inside of text content to simplify replacements --HG-- branch : trunk
author: scoder <none@none> 2009-10-17 02:11:42 +0200
committer: scoder <none@none> 2009-10-17 02:11:42 +0200
commit: a49509f5c24e2d8a7fda8d3feff46c200b97b3da (patch)
tree: 1cb537c28d22ddaf7c70b8ce5bfb3756e4e3fc52 /src
parent: 0878e377e3859cde852c87cda023a562a407cd95 (diff)
download: python-lxml-a49509f5c24e2d8a7fda8d3feff46c200b97b3da.tar.gz
2 files changed, 80 insertions, 9 deletions
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index e66b1e86..e822fea6 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -308,6 +308,13 @@ class HtmlMixin(object):
 
         Note: <base href> is *not* taken into account in any way.  The
         link you get is exactly the link in the document.
+
+        Note: multiple links inside of a single text string or
+        attribute value are returned in reverse order of position,
+        i.e. the last link first.  This makes it possible to replace
+        or delete them in the string based on their reported text
+        positions.  Otherwise, a modification at one position would
+        shift the positions of the links reported after it.
         """
         link_attrs = defs.link_attrs
         for el in self.iter():
@@ -347,15 +354,29 @@ class HtmlMixin(object):
                     ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                     yield (el, 'value', el.get('value'), 0)
             if tag == 'style' and el.text:
-                for match in _css_url_re.finditer(el.text):
-                    url, start = _unquote_match(match.group(1), match.start(1))
-                    yield (el, None, url, start)
-                for match in _css_import_re.finditer(el.text):
-                    yield (el, None, match.group(1), match.start(1))
+                urls = [
+                    _unquote_match(match.group(1), match.start(1))
+                    for match in _css_url_re.finditer(el.text)
+                    ] + [
+                    (match.group(1), match.start(1))
+                    for match in _css_import_re.finditer(el.text)
+                    ]
+                if urls:
+                    # sort by start pos to bring both match sets back into order
+                    urls = [ (start, url) for (url, start) in urls ]
+                    urls.sort()
+                    # reverse the list to report correct positions despite
+                    # modifications
+                    urls.reverse()
+                    for start, url in urls:
+                        yield (el, None, url, start)
             if 'style' in attribs:
-                for match in _css_url_re.finditer(attribs['style']):
-                    url, start = _unquote_match(match.group(1), match.start(1))
-                    yield (el, 'style', url, start)
+                urls = list(_css_url_re.finditer(attribs['style']))
+                if urls:
+                    # return in reversed order to simplify in-place modifications
+                    for match in urls[::-1]:
+                        url, start = _unquote_match(match.group(1), match.start(1))
+                        yield (el, 'style', url, start)
 
     def rewrite_links(self, link_repl_func, resolve_base_href=True,
                       base_href=None):
diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
index 1ba2dc5f..43dd99da 100644
--- a/src/lxml/html/tests/test_rewritelinks.txt
+++ b/src/lxml/html/tests/test_rewritelinks.txt
@@ -1,3 +1,8 @@
+
+Setup::
+
+    >>> import lxml.html
+
 We'll define a link translation function:
 
     >>> base_href = 'http://old/base/path.html'
@@ -118,8 +123,8 @@ link)``, which is awkward to test here, so we'll make a printer::
     ...   </table>
     ...  </body></html>'''))
     link href="style.css"
-    style None="/bg.gif"@40
     style None="/other-styles.css"@69
+    style None="/bg.gif"@40
     script src="/js-funcs.js"
     a href="/test.html"
     a href="/other.html"
@@ -179,3 +184,48 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
       </table>
      </body>
     </html>
+
+Check if we can replace multiple links inside of the same text string::
+
+    >>> html = lxml.html.fromstring ("""\
+    ... <html>
+    ...   <head>
+    ...      <title>Test</title>
+    ...      <style type='text/css'>
+    ...        .bg1 {
+    ...            background: url(images/bg1.png);
+    ...        }
+    ...        .bg2 {
+    ...            background: url(images/bg2.png);
+    ...        }
+    ...      </style>
+    ...   </head>
+    ...   <body>
+    ...      <p>Hi</p>
+    ...   </body>
+    ... </html>
+    ... """,
+    ... base_url = 'http://www.example.com/')
+
+    >>> html.make_links_absolute ()
+
+    >>> try: _unicode = unicode
+    ... except NameError: _unicode = str
+
+    >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
+    <html>
+      <head>
+        <title>Test</title>
+        <style type="text/css">
+          .bg1 {
+            background: url(http://www.example.com/images/bg1.png);
+          }
+          .bg2 {
+            background: url(http://www.example.com/images/bg2.png);
+          }
+        </style>
+      </head>
+      <body>
+        <p>Hi</p>
+      </body>
+    </html>
author	scoder <none@none>	2009-10-17 02:11:42 +0200
committer	scoder <none@none>	2009-10-17 02:11:42 +0200
commit	a49509f5c24e2d8a7fda8d3feff46c200b97b3da (patch)
tree	1cb537c28d22ddaf7c70b8ce5bfb3756e4e3fc52 /src
parent	0878e377e3859cde852c87cda023a562a407cd95 (diff)
download	python-lxml-a49509f5c24e2d8a7fda8d3feff46c200b97b3da.tar.gz