diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-03-14 21:31:09 +0200 |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-03-14 21:31:09 +0200 |
commit | 923baea9f921e829ece677e32c45a1a91acb3bef (patch) | |
tree | f503062af1d35b86f71102e2e7aaddb0aa7923a5 | |
parent | 2556c8388c950cd5d41d57251c1471c7bed3bb4b (diff) | |
download | cpython-git-923baea9f921e829ece677e32c45a1a91acb3bef.tar.gz |
Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().
-rw-r--r-- | Lib/urllib.py | 32 | ||||
-rw-r--r-- | Lib/urlparse.py | 42 | ||||
-rw-r--r-- | Misc/NEWS | 2 |
3 files changed, 58 insertions, 18 deletions
```diff
diff --git a/Lib/urllib.py b/Lib/urllib.py
index 33641a5700..f9655f9e88 100644
--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@@ -28,6 +28,7 @@ import os
 import time
 import sys
 import base64
+import re
 
 from urlparse import urljoin as basejoin
 
@@ -1198,22 +1199,35 @@ def splitvalue(attr):
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a + b, chr(int(a + b, 16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def unquote_plus(s):
     """unquote('%7e/abc+def') -> '~/abc def'"""
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index f370ce3bdc..4ce982e8fd 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -28,6 +28,8 @@ test_urlparse.py provides a good indicator of parsing behavior.
 
 """
 
+import re
+
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
            "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
 
@@ -311,6 +313,15 @@ def urldefrag(url):
     else:
         return url, ''
 
+try:
+    unicode
+except NameError:
+    def _is_unicode(x):
+        return 0
+else:
+    def _is_unicode(x):
+        return isinstance(x, unicode)
+
 # unquote method for parse_qs and parse_qsl
 # Cannot use directly from urllib as it would create a circular reference
 # because urllib uses urlparse methods (urljoin).  If you update this function,
@@ -319,22 +330,35 @@ def urldefrag(url):
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a+b, chr(int(a+b,16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
     """Parse a query given as a string argument.
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -214,6 +214,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().
+
 - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
   a failure while decoding empty object literals when object_pairs_hook was
   specified.
```