From e3207bd63bcf365a1f91b7c3e75a4b3354435501 Mon Sep 17 00:00:00 2001 From: Dirley Rodrigues Date: Mon, 4 Feb 2013 11:30:58 -0200 Subject: Improve external links finder to not yield duplicate links. --HG-- branch : distribute extra : rebase_source : 78e932fca32ee0ee1f50794cf998f4e7db78131b --- setuptools/package_index.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'setuptools/package_index.py') diff --git a/setuptools/package_index.py b/setuptools/package_index.py index 0ee21e3b..4393c83a 100755 --- a/setuptools/package_index.py +++ b/setuptools/package_index.py @@ -139,20 +139,26 @@ REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I) def find_external_links(url, page): """Find rel="homepage" and rel="download" links in `page`, yielding URLs""" + seen = set() for match in REL.finditer(page): tag, rel = match.groups() rels = map(str.strip, rel.lower().split(',')) if 'homepage' in rels or 'download' in rels: for match in HREF.finditer(tag): - yield urlparse.urljoin(url, htmldecode(match.group(1))) + url = urlparse.urljoin(url, htmldecode(match.group(1))) + if not url in seen: + yield url for tag in ("Home Page", "Download URL"): pos = page.find(tag) if pos!=-1: match = HREF.search(page,pos) if match: - yield urlparse.urljoin(url, htmldecode(match.group(1))) + url = urlparse.urljoin(url, htmldecode(match.group(1))) + if not url in seen: + yield url + user_agent = "Python-urllib/%s distribute/%s" % ( sys.version[:3], require('distribute')[0].version -- cgit v1.2.1 From 116420fe62842f18f2d37de46c2177028231755a Mon Sep 17 00:00:00 2001 From: Dirley Rodrigues Date: Mon, 4 Feb 2013 11:39:28 -0200 Subject: avoid naming problems --HG-- branch : distribute extra : rebase_source : 29eeb99013055b8d27cad7f7e8898d06a865b188 --- setuptools/package_index.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'setuptools/package_index.py') diff --git a/setuptools/package_index.py b/setuptools/package_index.py index 4393c83a..984feef4 100755 --- a/setuptools/package_index.py +++ b/setuptools/package_index.py @@ -139,25 +139,25 @@ REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I) def find_external_links(url, page): """Find rel="homepage" and rel="download" links in `page`, yielding URLs""" - seen = set() + seen_links = set() for match in REL.finditer(page): tag, rel = match.groups() rels = map(str.strip, rel.lower().split(',')) if 'homepage' in rels or 'download' in rels: for match in HREF.finditer(tag): - url = urlparse.urljoin(url, htmldecode(match.group(1))) - if not url in seen: - yield url + link = urlparse.urljoin(url, htmldecode(match.group(1))) + if not link in seen_links: + yield link for tag in ("Home Page", "Download URL"): pos = page.find(tag) if pos!=-1: match = HREF.search(page,pos) if match: - url = urlparse.urljoin(url, htmldecode(match.group(1))) - if not url in seen: - yield url + link = urlparse.urljoin(url, htmldecode(match.group(1))) + if not link in seen_links: + yield link user_agent = "Python-urllib/%s distribute/%s" % ( -- cgit v1.2.1 From d7ba7ce3c4ce2427d78cb2393bd062aa3a8496b4 Mon Sep 17 00:00:00 2001 From: Dirley Rodrigues Date: Mon, 4 Feb 2013 11:54:53 -0200 Subject: actually filter the links --HG-- branch : distribute extra : rebase_source : cb6e3497e1f8594181f10110cbc833bd6c81f89e --- setuptools/package_index.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'setuptools/package_index.py') diff --git a/setuptools/package_index.py b/setuptools/package_index.py index 984feef4..8974a647 100755 --- a/setuptools/package_index.py +++ b/setuptools/package_index.py @@ -148,6 +148,7 @@ def find_external_links(url, page): for match in HREF.finditer(tag): link = urlparse.urljoin(url, htmldecode(match.group(1))) if not link in seen_links: + seen_links.add(link) yield link for tag in ("Home Page", "Download URL"): @@ -157,6 +158,7 @@ def find_external_links(url, page): if match: link = urlparse.urljoin(url, htmldecode(match.group(1))) if not link in seen_links: + seen_links.add(link) yield link -- cgit v1.2.1 From b8327d7f646141415beb30acd39ce8840ebc708b Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Fri, 24 May 2013 10:36:51 -0400 Subject: Add unique_everseen from Python 2.7 docs --HG-- branch : distribute --- setuptools/package_index.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'setuptools/package_index.py') diff --git a/setuptools/package_index.py b/setuptools/package_index.py index 8974a647..e542f586 100755 --- a/setuptools/package_index.py +++ b/setuptools/package_index.py @@ -1,5 +1,6 @@ """PyPI and direct package downloading""" import sys, os.path, re, urlparse, urllib, urllib2, shutil, random, socket, cStringIO +import itertools import base64 import httplib from pkg_resources import * @@ -134,6 +135,24 @@ def interpret_distro_name(location, basename, metadata, platform = platform ) +# From Python 2.7 docs +def unique_everseen(iterable, key=None): + "List unique elements, preserving order. Remember all elements ever seen." + # unique_everseen('AAAABBBCCDAABBB') --> A B C D + # unique_everseen('ABBCcAD', str.lower) --> A B C D + seen = set() + seen_add = seen.add + if key is None: + for element in itertools.ifilterfalse(seen.__contains__, iterable): + seen_add(element) + yield element + else: + for element in iterable: + k = key(element) + if k not in seen: + seen_add(k) + yield element + REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I) # this line is here to fix emacs' cruddy broken syntax highlighting -- cgit v1.2.1 From 3b9a57a0c80ee11995fbe937e7dbeca3d83ec10a Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Fri, 24 May 2013 11:02:05 -0400 Subject: Use a wrapper to ensure unique values on find_external_links. Factors out uniqueness test into a re-usable decorator and simplifies the body of find_external_links. --HG-- branch : distribute --- setuptools/package_index.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'setuptools/package_index.py') diff --git a/setuptools/package_index.py b/setuptools/package_index.py index e542f586..0a3f9e05 100755 --- a/setuptools/package_index.py +++ b/setuptools/package_index.py @@ -12,6 +12,8 @@ except ImportError: from md5 import md5 from fnmatch import translate +from .py24compat import wraps + EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$') HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I) # this is here to fix emacs' cruddy broken syntax highlighting @@ -153,32 +155,36 @@ def unique_everseen(iterable, key=None): seen_add(k) yield element +def unique_values(func): + """ + Wrap a function returning an iterable such that the resulting iterable + only ever yields unique items. + """ + @wraps(func) + def wrapper(*args, **kwargs): + return unique_everseen(func(*args, **kwargs)) + return wrapper + REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I) # this line is here to fix emacs' cruddy broken syntax highlighting +@unique_values def find_external_links(url, page): """Find rel="homepage" and rel="download" links in `page`, yielding URLs""" - seen_links = set() for match in REL.finditer(page): tag, rel = match.groups() rels = map(str.strip, rel.lower().split(',')) if 'homepage' in rels or 'download' in rels: for match in HREF.finditer(tag): - link = urlparse.urljoin(url, htmldecode(match.group(1))) - if not link in seen_links: - seen_links.add(link) - yield link + yield urlparse.urljoin(url, htmldecode(match.group(1))) for tag in ("Home Page", "Download URL"): pos = page.find(tag) if pos!=-1: match = HREF.search(page,pos) if match: - link = urlparse.urljoin(url, htmldecode(match.group(1))) - if not link in seen_links: - seen_links.add(link) - yield link + yield urlparse.urljoin(url, htmldecode(match.group(1))) user_agent = "Python-urllib/%s distribute/%s" % ( -- cgit v1.2.1