diff options
author | Kenneth Reitz <me@kennethreitz.com> | 2012-10-16 13:39:00 -0700 |
---|---|---|
committer | Kenneth Reitz <me@kennethreitz.com> | 2012-10-16 13:39:00 -0700 |
commit | e3c51e4182c54ffc494a7327e69004342844b0a4 (patch) | |
tree | 6f8a68415b5adc4109cd63b56d134e48535288b3 | |
parent | 706f88fedd7c526bc17ec9341a75ace1486d0a60 (diff) | |
download | python-requests-cache.tar.gz |
brain dumpcache
-rw-r--r-- | httpcache/README.rst | 103 | ||||
-rw-r--r-- | httpcache/conftest.py | 41 | ||||
-rw-r--r-- | httpcache/httpcache/__init__.py | 229 | ||||
-rw-r--r-- | httpcache/httpcache/cache.py | 155 | ||||
-rw-r--r-- | httpcache/httpcache/tests/__init__.py | 0 | ||||
-rw-r--r-- | httpcache/httpcache/tests/integration/test_caching_conditions.py | 44 | ||||
-rw-r--r-- | httpcache/httpcache/tests/server.py | 36 | ||||
-rw-r--r-- | httpcache/httpcache/tests/test_cache.py | 76 | ||||
-rw-r--r-- | httpcache/httpcache/tests/test_cache_control.py | 167 | ||||
-rw-r--r-- | httpcache/httpcache/tests/test_cache_invalidation.py | 39 | ||||
-rw-r--r-- | httpcache/httpcache/tests/test_max_age.py | 50 | ||||
-rw-r--r-- | httpcache/setup.py | 18 | ||||
m--------- | py-reqcache | 0 | ||||
-rw-r--r-- | requests/cache.py | 150 | ||||
-rw-r--r-- | requests/sessions.py | 8 | ||||
-rw-r--r-- | t.py | 26 |
16 files changed, 1133 insertions, 9 deletions
diff --git a/httpcache/README.rst b/httpcache/README.rst new file mode 100644 index 00000000..f64cd3d8 --- /dev/null +++ b/httpcache/README.rst @@ -0,0 +1,103 @@ +=========== + httpcache +=========== + +Httpcache is a port of the caching algorithms in httplib2_ for use with +requests_ session object. + +It was written because httplib2's better support for caching is often +mitigated by its lack of threadsafety. The same is true of requests in +terms of caching. + + +Usage +===== + +NOTE: Eventually, my hope is that this module can be integrated directly +into requests. That said, I've had minimal exposure to requests, so I +expect the initial implementation to be rather un-requests-like in +terms of its API. Suggestions and patches welcome! + +Here is the basic usage: :: + + import requests + + from httpcache import CacheControl + + + sess = requests.session() + cached_sess = CacheControl(sess) + + response = cached_sess.get('http://google.com') + +If the URL contains any caching based headers, it will cache the +result in a simple dictionary. + +Below is the implementation of the DictCache, the default cache +backend. It is extremely simple and shows how you would implement some +other cache backend: :: + + from httpcache.cache import BaseCache + + + class DictCache(BaseCache): + + def __init__(self, init_dict=None): + self.data = init_dict or {} + + def get(self, key): + return self.data.get(key, None) + + def set(self, key, value): + self.data.update({key: value}) + + def delete(self, key): + self.data.pop(key) + + + +See? Really simple. + + +Design +====== + +The CacheControl object's main task is to wrap the GET call of the +session object. The caching takes place by examining the request to +see if it should try to use the cache. For example, if the request +includes a 'no-cache' or 'max-age=0' Cache-Control header, it will not +try to cache the request. If there is a cached value and its value +has been deemed fresh, then it will return the cached response. 
+ +If the request cannot be cached, the actual request is performed. At +this point we then analyze the response and see if we should add it to +the cache. For example, if the response contains a 'max-age=3600' in +the 'Cache-Control' header, it will cache the response before +returning it to the caller. + + +Tests +===== + +The tests are all in httpcache/tests and are runnable by py.test. + + +TODO +==== + + [ ]- Better integration with requests + [ ]- ETags / if-* header support + [ ]- Tests that run a server from the stdlib + + +Disclaimers +=========== + +Httpcache is brand new and maybe totally broken. I have some tests and +it is a pretty direct port of httplib2 caching, which I've found to be +very reliable. With that in mind, it hasn't been used in a production +environment just yet. If you check it out and find bugs, let me know. + + +.. _httplib2: http://code.google.com/p/httplib2/ +.. _requests: http://docs.python-requests.org/ diff --git a/httpcache/conftest.py b/httpcache/conftest.py new file mode 100644 index 00000000..2fe30f8a --- /dev/null +++ b/httpcache/conftest.py @@ -0,0 +1,41 @@ +import os +import py.test +import subprocess +import requests +import time + + +class TestServer(object): + + def start(self): + base = os.path.abspath(os.path.dirname(__file__)) + server_file = os.path.join(base, 'httpcache', 'tests', 'server.py') + cmd = ['python', server_file] + + kw = {} + if not os.environ.get('TEST_SERVER_OUTPUT'): + kw = {'stdout': subprocess.PIPE, + 'stderr': subprocess.STDOUT} + self.proc = subprocess.Popen(cmd, **kw) + url = 'http://localhost:8080' + up = None + while not up: + try: + up = requests.get(url) + except requests.ConnectionError: + time.sleep(1) + + def stop(self): + self.proc.terminate() + + +def pytest_namespace(): + return dict(server=TestServer()) + + +def pytest_configure(config): + py.test.server.start() + + +def pytest_unconfigure(config): + py.test.server.stop() diff --git a/httpcache/httpcache/__init__.py 
b/httpcache/httpcache/__init__.py new file mode 100644 index 00000000..7d3503fb --- /dev/null +++ b/httpcache/httpcache/__init__.py @@ -0,0 +1,229 @@ +""" +A caching wrapper for the requests session. +""" +import re +import email +import calendar +import time + +import requests +from cache import DictCache + + +URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") + + +def parse_uri(uri): + """Parses a URI using the regex given in Appendix B of RFC 3986. + + (scheme, authority, path, query, fragment) = parse_uri(uri) + """ + groups = URI.match(uri).groups() + return (groups[1], groups[3], groups[4], groups[6], groups[8]) + + +def _parse_cache_control(headers): + """ + Parse the cache control headers returning a dictionary with values + for the different directives. + """ + retval = {} + if 'cache-control' in headers: + parts = headers['cache-control'].split(',') + parts_with_args = [ + tuple([x.strip().lower() for x in part.split("=", 1)]) + for part in parts if -1 != part.find("=")] + parts_wo_args = [(name.strip().lower(), 1) + for name in parts if -1 == name.find("=")] + retval = dict(parts_with_args + parts_wo_args) + return retval + + +def urlnorm(uri): + (scheme, authority, path, query, fragment) = parse_uri(uri) + if not scheme or not authority: + raise Exception("Only absolute URIs are allowed. uri = %s" % uri) + authority = authority.lower() + scheme = scheme.lower() + if not path: + path = "/" + # Could do syntax based normalization of the URI before + # computing the digest. See Section 6.2.2 of Std 66. 
+ request_uri = query and "?".join([path, query]) or path + scheme = scheme.lower() + defrag_uri = scheme + "://" + authority + request_uri + return scheme, authority, request_uri, defrag_uri + + +class CacheControl(object): + + def __init__(self, session, cache=None): + self.session = session + self.cache = cache or DictCache() + + def __getattr__(self, key): + if hasattr(self.session, key): + return getattr(self.session, key) + raise AttributeError('%s not found' % key) + + def cache_url(self, url): + scheme, authority, request_uri, defrag_uri = urlnorm(url) + return defrag_uri + + def cached_request(self, *args, **kw): + """ + See if we should use a cached response. We are looking for + client conditions such as no-cache and testing our cached + value to see if we should use it or not. + + This is taken almost directly from httplib2._entry_disposition + """ + req = requests.Request(*args, **kw) + cache_url = self.cache_url(req.full_url) + + cc = _parse_cache_control(req.headers) + + # non-caching states + no_cache = False + if 'no-cache' in cc: no_cache = True + if 'max-age' in cc and cc['max-age'] == 0: no_cache = True + + # see if it is in the cache anyways + in_cache = self.cache.get(cache_url) + if no_cache or not in_cache: + return False + + # It is in the cache, so lets see if it is going to be + # fresh enough + resp = self.cache.get(cache_url) + now = time.time() + date = calendar.timegm( + email.Utils.parsedate_tz(resp.headers['date'])) + current_age = max(0, now - date) + + resp_cc = _parse_cache_control(resp.headers) + + # determine freshness + freshness_lifetime = 0 + if 'max-age' in resp_cc: + try: + freshness_lifetime = int(resp_cc['max-age']) + except ValueError: + pass + elif 'expires' in resp.headers: + expires = email.Utils.parsedate_tz(resp.headers['expires']) + if expires != None: + expire_time = calendar.timegm(expires) - date + freshness_lifetime = max(0, expire_time) + + # determine if we are setting freshness limit in the req + if 
'max-age' in cc: + try: + freshness_lifetime = int(cc['max-age']) + except ValueError: + freshness_lifetime = 0 + + if 'min-fresh' in cc: + try: + min_fresh = int(cc['min-fresh']) + except ValueError: + min_fresh = 0 + # adjust our current age by our min fresh + current_age += min_fresh + + # see how fresh we actually are + fresh = (freshness_lifetime > current_age) + + if fresh: + # make sure we set the from_cache to true + resp.from_cache = True + return resp + + # we're not fresh, clean out the junk + self.cache.delete(cache_url) + + # return the original handler + return False + + def cache_response(self, resp): + """ + Algorithm for caching requests + """ + + # From httplib2: Don't cache 206's since we aren't going to + # handle byte range requests + if resp.status_code not in [200, 203]: + return + + cc_req = _parse_cache_control(resp.request.headers) + cc = _parse_cache_control(resp.headers) + + cache_url = self.cache_url(resp.request.full_url) + + # Delete it from the cache if we happen to have it stored there + no_store = cc.get('no-store') or cc_req.get('no-store') + if no_store and self.cache.get(cache_url): + self.cache.delete(cache_url) + + # Add to the cache if the response headers demand it. If there + # is no date header then we can't do anything about expiring + # the cache. + if 'date' in resp.headers: + + # cache when there is a max-age > 0 + if cc and cc.get('max-age'): + if int(cc['max-age']) > 0: + self.cache.set(cache_url, resp) + + # If the request can expire, it means we should cache it + # in the meantime. + elif 'expires' in resp.headers: + if int(resp.headers['expires']) > 0: + self.cache.set(cache_url, resp) + + def from_cache(f): + """ + A decorator that allows using a cached response. 
+ """ + def cached_handler(self, *args, **kw): + # If we have a cached response use it + cached_response = self.cached_request(*args, **kw) + if cached_response: + return cached_response + + # Else return original function's response + return f(self, *args, **kw) + return cached_handler + + def invalidates_cache(f): + """ + A decorator for marking methods that can invalidate the cache. + """ + + def invalidating_handler(self, *args, **kw): + resp = f(self, *args, **kw) + if resp.ok: + cache_url = self.cache_url(resp.request.full_url) + self.cache.delete(cache_url) + return resp + return invalidating_handler + + @from_cache + def get(self, url, headers=None, *args, **kw): + resp = self.session.get(url, headers=headers, *args, **kw) + # We set this primarily for testing + resp.from_cache = False + + # See if we need to cache the response + self.cache_response(resp) + + # actually return the repsonse + return resp + + @invalidates_cache + def put(self, *args, **kw): + return self.session.put(*args, **kw) + + @invalidates_cache + def delete(self, *args, **kw): + return self.session.delete(*args, **kw) diff --git a/httpcache/httpcache/cache.py b/httpcache/httpcache/cache.py new file mode 100644 index 00000000..276cb0bd --- /dev/null +++ b/httpcache/httpcache/cache.py @@ -0,0 +1,155 @@ +""" +The cache object API for implementing caches. The default is just a +dictionary, which in turns means it is not threadsafe for writing. 
+""" +import os +import re +import time + +from contextlib import contextmanager +from threading import Lock +from cPickle import dump, load +from hashlib import md5 + + +class BaseCache(object): + + def get(self, key): + raise NotImplemented() + + def set(self, key, value): + raise NotImplemented() + + def delete(self, key): + raise NotImplemented() + + +class DictCache(BaseCache): + + def __init__(self, init_dict=None): + self.data = init_dict or {} + + def get(self, key): + return self.data.get(key, None) + + def set(self, key, value): + self.data.update({key: value}) + + def delete(self, key): + if key in self.data: + self.data.pop(key) + + +# Cache filename construction (original borrowed from Venus +# http://intertwingly.net/code/venus/) +re_url_scheme = re.compile(r'^\w+://') +re_slash = re.compile(r'[?/:|]+') + + +def safename(filename): + """Return a filename suitable for the cache. + + Strips dangerous and common characters to create a filename we + can use to store the cache in. + """ + + try: + if re_url_scheme.match(filename): + if isinstance(filename, str): + filename = filename.decode('utf-8') + filename = filename.encode('idna') + else: + filename = filename.encode('idna') + except UnicodeError: + pass + if isinstance(filename, unicode): + filename = filename.encode('utf-8') + filemd5 = md5(filename).hexdigest() + filename = re_url_scheme.sub("", filename) + filename = re_slash.sub(",", filename) + + # limit length of filename + if len(filename) > 200: + filename = filename[:200] + return ",".join((filename, filemd5)) + + +@contextmanager +def filelock(key, mode, timeout=None, interval=.1): + """ + A simple context manager that creates a temporary file for + locking a specific cache entry. 
+ + This was inspired pretty directly by: + http://amix.dk/blog/post/19531 + """ + lockfile = '%s.lock' % key + if timeout: + iterations = int(timeout / interval) + locked = os.path.exists(lockfile) + while locked: + error = FileCacheLockedException(lockfile, key) + if timeout: + # it is already locked, but we have a timeout if we + # want to try and block until it is available + time.sleep(interval) + iterations -= 1 + locked = os.path.exists(lockfile) + if not iterations: + raise error + else: + # it is locked and we don't have a timeout, so raise + # the error + error + + with open(lockfile, 'w+') as file_lock: + file_lock.write('1') + with open(key, mode) as opened_file: + yield opened_file + os.remove(lockfile) + + +class FileCacheLockedException(Exception): + + def __init__(self, lockfile, fname): + self.lockfile, self.fname = lockfile, fname + msg = '"%s" is locked. Try removing "%s"' % (lockfile, fname) + super(FileCacheLockedException, self).__init__(msg) + + +class FileCache(BaseCache): + """ + A simple threadsafe file based cache. + + The file cache is a port of httplib2's directory cache. The only + difference is that it uses a lock when writing the file and + pickles the value. The pickling is used b/c we are using + requests.Response objects. 
+ """ + + def __init__(self, cache, worker=None): + self.cache = cache + self.safe = safename + if not os.path.exists(cache): + os.makedirs(self.cache) + self.lock = Lock() + + def get(self, key): + retval = None + cache_full_path = os.path.join(self.cache, safename(key)) + try: + with open(cache_full_path, "rb") as f: + retval = load(f) + except (IOError, EOFError): + pass + return retval + + def set(self, key, value): + cache_full_path = os.path.join(self.cache, safename(key)) + with filelock(cache_full_path, "wb", timeout=1) as f: + dump(value, f) + + def delete(self, key): + cache_full_path = os.path.join(self.cache, safename(key)) + if os.path.exists(cache_full_path): + os.remove(cache_full_path) diff --git a/httpcache/httpcache/tests/__init__.py b/httpcache/httpcache/tests/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/httpcache/httpcache/tests/__init__.py diff --git a/httpcache/httpcache/tests/integration/test_caching_conditions.py b/httpcache/httpcache/tests/integration/test_caching_conditions.py new file mode 100644 index 00000000..59d35567 --- /dev/null +++ b/httpcache/httpcache/tests/integration/test_caching_conditions.py @@ -0,0 +1,44 @@ +from httpcache import CacheControl +import requests + + +class TestCachingConditions(object): + + def test_no_caching_directives(self): + url = 'http://localhost:8080/' + s = requests.Session() + c = CacheControl(s, {}) + r = c.get(url) + + assert r + assert r.content == 'foo' + assert not c.cache.get(url) + + def test_cache_max_age(self): + url = 'http://localhost:8080/max_age/' + s = requests.Session() + c = CacheControl(s, {}) + r = c.get(url) + assert c.cache.get(url) + assert c.cache.get(url) == r + + def test_cache_no_cache(self): + url = 'http://localhost:8080/no_cache/' + s = requests.Session() + c = CacheControl(s, {}) + c.get(url) + assert not c.cache.get(url) + + def test_cache_must_revalidate(self): + url = 'http://localhost:8080/must_revalidate/' + s = requests.Session() + c 
= CacheControl(s, {}) + c.get(url) + assert not c.cache.get(url) + + def test_cache_no_store(self): + url = 'http://localhost:8080/no_store/' + s = requests.Session() + c = CacheControl(s, {}) + c.get(url) + assert not c.cache.get(url) diff --git a/httpcache/httpcache/tests/server.py b/httpcache/httpcache/tests/server.py new file mode 100644 index 00000000..140c3f1d --- /dev/null +++ b/httpcache/httpcache/tests/server.py @@ -0,0 +1,36 @@ +from __future__ import print_function +import cherrypy + + +class CacheTestingServer(object): + + def index(self): + return 'foo' + index.exposed = True + + def max_age(self, value=None): + age = 'max-age=%s' % (value or 300) + cherrypy.response.headers['Cache-Control'] = age + return 'max age' + max_age.exposed = True + + def no_cache(self): + cherrypy.response.headers['Cache-Control'] = 'no-cache' + return 'no cache' + no_cache.exposed = True + + def must_revalidate(self): + cherrypy.response.headers['Cache-Control'] = 'must-revalidate' + return 'must revalidate' + must_revalidate.exposed = True + + def no_store(self): + cherrypy.response.headers['Cache-Control'] = 'no-store' + return 'no store' + no_store.exposed = True + + +if __name__ == '__main__': + cherrypy.tree.mount(CacheTestingServer(), '/') + cherrypy.engine.start() + cherrypy.engine.block() diff --git a/httpcache/httpcache/tests/test_cache.py b/httpcache/httpcache/tests/test_cache.py new file mode 100644 index 00000000..4efbc5c4 --- /dev/null +++ b/httpcache/httpcache/tests/test_cache.py @@ -0,0 +1,76 @@ +""" +Tests for our caches +""" +import os + +from threading import Thread +# from multiprocessing import Process +from httpcache.cache import FileCache, safename, filelock +from httpcache.cache import FileCacheLockedException + + +class RandomWriter(Thread): + def __init__(self, cache): + super(RandomWriter, self).__init__() + self.key = 'threading_test' + self.cache = cache + + def run(self): + for x in range(0, 5): + value = x * 1000 + self.cache.set(self.key, 
value) + assert self.cache.get(self.key) == value + + +class TestFileCache(object): + + cache_dir = 'test_file_cache' + + def test_set_get_delete(self): + fc = FileCache(self.cache_dir) + fc.set('foo', 'bar') + assert fc.get('foo') == 'bar' + fc.delete('foo') + assert fc.get('foo') == None + + def test_setting_with_multiple_threads(self): + fc = FileCache(self.cache_dir) + w1 = RandomWriter(fc) + w2 = RandomWriter(fc) + w1.start() + w2.start() + w1.join() + w2.join() + assert fc.get(w1.key) == 4000 + + def test_locked_raises_exception(self): + key, value = 'locked', {'foo': 'bar'} + fc = FileCache(self.cache_dir) + lockfile = os.path.join(self.cache_dir, + '%s.lock' % safename(key)) + with open(lockfile, 'w+') as lock: + lock.write('1') + assert os.path.exists(lockfile) + try: + fc.set(key, value) + assert False + except FileCacheLockedException: + assert True + os.remove(lockfile) + fc.set(key, value) + assert fc.get(key) == value + assert not os.path.exists(lockfile) + + def test_filelock_timeout(self): + fc = FileCache(self.cache_dir) + fname = safename('locked') + lockfile = '%s.lock' % fname + with open(lockfile, 'w+') as lock: + lock.write('1') + try: + with filelock(fname, 'w+', .5): + assert False + except FileCacheLockedException: + assert True + os.remove(lockfile) + assert not os.path.exists(lockfile) diff --git a/httpcache/httpcache/tests/test_cache_control.py b/httpcache/httpcache/tests/test_cache_control.py new file mode 100644 index 00000000..c06fabbb --- /dev/null +++ b/httpcache/httpcache/tests/test_cache_control.py @@ -0,0 +1,167 @@ +""" +Unit tests that verify our caching methods work correctly. 
+""" +import mock +import datetime +import time + +from httpcache import CacheControl +from httpcache.cache import DictCache + + +TIME_FMT = "%a, %d %b %Y %H:%M:%S" + + +class TestCacheControlResponse(object): + url = 'http://url.com/' + + def req(self, headers=None): + headers = headers or {} + return mock.Mock(full_url=self.url, headers=headers) + + def resp(self, headers=None): + headers = headers or {} + return mock.Mock(status_code=200, + headers=headers, + request=self.req()) + + def cache(self): + return CacheControl(mock.Mock(), mock.MagicMock()) + + def test_no_cache_non_20x_response(self): + c = self.cache() + + # No caching without some extra headers, so we add them + now = datetime.datetime.utcnow().strftime(TIME_FMT) + resp = self.resp({'cache-control': 'max-age=3600', + 'date': now}) + + no_cache_codes = [201, 300, 400, 500] + for code in no_cache_codes: + resp.status_code = code + c.cache_response(resp) + assert not c.cache.set.called + + # this should work b/c the resp is 20x + resp.status_code = 203 + c.cache_response(resp) + assert c.cache.set.called + + def test_no_cache_with_no_date(self): + c = self.cache() + + # No date header which makes our max-age pointless + resp = self.resp({'cache-control': 'max-age=3600'}) + c.cache_response(resp) + + assert not c.cache.set.called + + def test_cache_response_no_cache_control(self): + c = self.cache() + resp = self.resp() + c.cache_response(resp) + + assert not c.cache.set.called + + def test_cache_response_cache_max_age(self): + c = self.cache() + + now = datetime.datetime.utcnow().strftime(TIME_FMT) + resp = self.resp({'cache-control': 'max-age=3600', + 'date': now}) + c.cache_response(resp) + c.cache.set.assert_called_with(self.url, resp) + + def test_cache_repsonse_no_store(self): + resp = mock.Mock() + cache = DictCache({self.url: resp}) + c = CacheControl(resp, cache) + + cache_url = c.cache_url(self.url) + + resp = self.resp({'cache-control': 'no-store'}) + assert c.cache.get(cache_url) + + 
c.cache_response(resp) + assert not c.cache.get(cache_url) + + +class TestCacheControlRequest(object): + + url = 'http://foo.com' + + def test_cache_request_no_cache(self): + c = CacheControl(mock.Mock()) + hdrs = {'cache-control': 'no-cache'} + resp = c.cached_request(self.url, headers=hdrs) + assert not resp + + def test_cache_request_pragma_no_cache(self): + c = CacheControl(mock.Mock()) + hdrs = {'pragma': 'no-cache'} + resp = c.cached_request(self.url, headers=hdrs) + assert not resp + + def test_cache_request_no_store(self): + c = CacheControl(mock.Mock()) + hdrs = {'cache-control': 'no-store'} + resp = c.cached_request(self.url, headers=hdrs) + assert not resp + + def test_cache_request_max_age_0(self): + c = CacheControl(mock.Mock()) + hdrs = {'cache-control': 'max-age=0'} + resp = c.cached_request(self.url, headers=hdrs) + assert not resp + + def test_cache_request_not_in_cache(self): + c = CacheControl(mock.Mock()) + resp = c.cached_request(self.url) + assert not resp + + def test_cache_request_fresh_max_age(self): + now = datetime.datetime.utcnow().strftime(TIME_FMT) + resp = mock.Mock(headers={'cache-control': 'max-age=3600', + 'date': now}) + + # NOTE: httplib2 uses its own algorithm for finding the + # "defrag_uri" in order to use it for creating a cache key. It + # seems to append the trailing slash, which I'm pretty sure is + # b/c of the auto directory rules. I'm trusting it is correct. 
+ cache = DictCache({self.url + '/': resp}) + c = CacheControl(mock.Mock(), cache) + r = c.cached_request(self.url) + assert r == resp + + def test_cache_request_unfresh_max_age(self): + earlier = time.time() - 3700 + now = datetime.datetime.fromtimestamp(earlier).strftime(TIME_FMT) + + resp = mock.Mock(headers={'cache-control': 'max-age=3600', + 'date': now}) + cache = DictCache({self.url: resp}) + c = CacheControl(mock.Mock(), cache) + r = c.cached_request(self.url) + assert not r + + def test_cache_request_fresh_expires(self): + later = datetime.timedelta(days=1) + expires = (datetime.datetime.utcnow() + later).strftime(TIME_FMT) + now = datetime.datetime.utcnow().strftime(TIME_FMT) + resp = mock.Mock(headers={'expires': expires, + 'date': now}) + cache = DictCache({self.url + '/': resp}) + c = CacheControl(mock.Mock, cache) + r = c.cached_request(self.url) + assert r == resp + + def test_cache_request_unfresh_expires(self): + later = datetime.timedelta(days=-1) + expires = (datetime.datetime.utcnow() + later).strftime(TIME_FMT) + now = datetime.datetime.utcnow().strftime(TIME_FMT) + resp = mock.Mock(headers={'expires': expires, + 'date': now}) + cache = DictCache({self.url: resp}) + c = CacheControl(mock.Mock, cache) + r = c.cached_request(self.url) + assert not r diff --git a/httpcache/httpcache/tests/test_cache_invalidation.py b/httpcache/httpcache/tests/test_cache_invalidation.py new file mode 100644 index 00000000..017a3d12 --- /dev/null +++ b/httpcache/httpcache/tests/test_cache_invalidation.py @@ -0,0 +1,39 @@ +""" +When resources are known to be updated via HTTP (ie PUT, DELETE), we +should invalidate them in our cache. 
+""" +import mock + +from httpcache import CacheControl +from httpcache.cache import DictCache + + +class TestInvalidations(object): + + url = 'http://foo.com/bar/' + + def resp(self): + req = mock.Mock(full_url=self.url) + return mock.Mock(request=req) + + def test_put_invalidates_cache(self): + # Prep our cache + resp = self.resp() + cache = DictCache({self.url: resp}) + session = mock.Mock(put=mock.Mock(return_value=resp)) + c = CacheControl(session, cache) + + c.put(self.url) + + assert not c.cache.get(self.url) + + def test_delete_invalidates_cache(self): + # Prep our cache + resp = self.resp() + cache = DictCache({self.url: resp}) + session = mock.Mock(delete=mock.Mock(return_value=resp)) + c = CacheControl(session, cache) + + c.delete(self.url) + + assert not c.cache.get(self.url) diff --git a/httpcache/httpcache/tests/test_max_age.py b/httpcache/httpcache/tests/test_max_age.py new file mode 100644 index 00000000..f33f501b --- /dev/null +++ b/httpcache/httpcache/tests/test_max_age.py @@ -0,0 +1,50 @@ +from __future__ import print_function +from .. import CacheControl +import requests + + +class TestMaxAge(object): + + def test_client_max_age_0(self): + """ + Making sure when the client uses max-age=0 we don't get a + cached copy even though we're still fresh. + """ + url = 'http://localhost:8080/max_age/' + s = requests.Session() + c = CacheControl(s, {}) + print('first request') + r = c.get(url) + cache_url = c.cache_url(url) + assert c.cache.get(cache_url) == r + + print('second request') + r = c.get(url, headers={'Cache-Control': 'max-age=0'}) + + # don't remove from the cache + assert c.cache.get(cache_url) + assert r.from_cache == False + + def test_client_max_age_3600(self): + """ + Verify we get a cached value when the client has a + reasonable max-age value. 
+ """ + # prep our cache + url = 'http://localhost:8080/max_age/' + s = requests.Session() + c = CacheControl(s, {}) + r = c.get(url) + cache_url = c.cache_url(url) + assert c.cache.get(cache_url) == r + + # request that we don't want a new one unless + r = c.get(url, headers={'Cache-Control': 'max-age=3600'}) + assert r.from_cache == True + + # now lets grab one that forces a new request b/c the cache + # has expired. To do that we'll inject a new time value. + resp = c.cache.get(cache_url) + resp.headers['date'] = 'Tue, 15 Nov 1994 08:12:31 GMT' + r = c.get(url) + assert r.from_cache == False diff --git a/httpcache/setup.py b/httpcache/setup.py new file mode 100644 index 00000000..8d188634 --- /dev/null +++ b/httpcache/setup.py @@ -0,0 +1,18 @@ +import setuptools + +setup_params = dict( + name='HTTPCache', + version='0.5', + author="Eric Larson", + author_email="eric@ionrock.org", + url="https://bitbucket.org/elarson/httpcache", + packages=setuptools.find_packages(), + tests_requires=[ + 'py.test', + 'cherrypy', + ], +) + + +if __name__ == '__main__': + setuptools.setup(**setup_params) diff --git a/py-reqcache b/py-reqcache new file mode 160000 +Subproject 5b4f3e5217c2d1b32818bf4d8121a0a243e67db diff --git a/requests/cache.py b/requests/cache.py index 5be1dfbe..373e53c3 100644 --- a/requests/cache.py +++ b/requests/cache.py @@ -7,7 +7,11 @@ requests.cache Requests caching layer. 
""" + +import time import hashlib +import calendar +import email from .packages.cachecore import SimpleCache DEFAULT_CACHE = SimpleCache @@ -25,20 +29,16 @@ def expand_cache(c): if c is False: return Cache(backend=False) -def request_hash(r, type=''): - """Returns a SHA256(type-method-url) for cache keys.""" - - s = '{0}-{1}-{2}'.format(type, r.request.method, r.request.full_url) - return hashlib.sha256(s).hexdigest() class Cache(object): """A Cache session.""" - def __init__(self, backend=None, conditional=None, content=True): + def __init__(self, backend=None, conditional=None, content=True, handler=None): self.conditional = None self.content = None + self.handler = handler # Default to backend if True. if not backend is None: @@ -54,3 +54,141 @@ class Cache(object): if content is not True: self.content = content + if handler is None: + self.handler = CacheHandler() + + + +class CacheHandler(object): + + + @staticmethod + def request_hash(r, type=''): + """Returns a SHA256(type-method-url) for cache keys.""" + + s = '{0}-{1}-{2}'.format(type, r.request.method, r.request.full_url) + return hashlib.sha256(s).hexdigest() + + + @staticmethod + def _parse_cache_control(headers): + """ + Parse the cache control headers returning a dictionary with values + for the different directives. 
+ """ + retval = {} + if 'cache-control' in headers: + parts = headers['cache-control'].split(',') + parts_with_args = [ + tuple([x.strip().lower() for x in part.split("=", 1)]) + for part in parts if -1 != part.find("=")] + parts_wo_args = [(name.strip().lower(), 1) + for name in parts if -1 == name.find("=")] + retval = dict(parts_with_args + parts_wo_args) + return retval + + def cache_request(self, r, cache): + """See if we should use a cached response.""" + + return True + cache_hash = self.request_hash(r) + cc = self._parse_cache_control(r.headers) + + # non-caching states + no_cache = False + if 'no-cache' in cc: + no_cache = True + if 'max-age' in cc and cc['max-age'] == 0: + no_cache = True + + # see if it is in the cache anyways + in_cache = cache.get(cache_hash) + if no_cache or not in_cache: + return False + + # It is in the cache, so lets see if it is going to be + # fresh enough + resp = cache.get(cache_hash) + now = time.time() + date = calendar.timegm( + email.Utils.parsedate_tz(resp.headers['date'])) + current_age = max(0, now - date) + + resp_cc = self._parse_cache_control(resp.headers) + + + # determine freshness + freshness_lifetime = 0 + if 'max-age' in resp_cc: + try: + freshness_lifetime = int(resp_cc['max-age']) + except ValueError: + pass + elif 'expires' in resp.headers: + expires = email.Utils.parsedate_tz(resp.headers['expires']) + if expires != None: + expire_time = calendar.timegm(expires) - date + freshness_lifetime = max(0, expire_time) + + # determine if we are setting freshness limit in the req + if 'max-age' in cc: + try: + freshness_lifetime = int(cc['max-age']) + except ValueError: + freshness_lifetime = 0 + + if 'min-fresh' in cc: + try: + min_fresh = int(cc['min-fresh']) + except ValueError: + min_fresh = 0 + # adjust our current age by our min fresh + current_age += min_fresh + + # see how fresh we actually are + fresh = (freshness_lifetime > current_age) + + if fresh: + # make sure we set the from_cache to true + 
resp.from_cache = True + return resp + + # we're not fresh, clean out the junk + cache.delete(cache_hash) + + # return the original handler + return False + + + def cache_response(self, r, cache): + """Algorithm for caching requests.""" + + if r.status_code not in [200, 203]: + return + + cc_req = self._parse_cache_control(r.request.headers) + cc = self._parse_cache_control(r.headers) + + # cache_url = self.cache_url(r.request.full_url) + cache_hash = self.request_hash(r) + + # Delete it from the cache if we happen to have it stored there + no_store = cc.get('no-store') or cc_req.get('no-store') + if no_store and cache.get(cache_hash): + cache.delete(cache_hash) + + # Add to the cache if the response headers demand it. If there + # is no date header then we can't do anything about expiring + # the cache. + if 'date' in r.headers: + + # cache when there is a max-age > 0 + if cc and cc.get('max-age'): + if int(cc['max-age']) > 0: + cache.set(cache_hash, r) + + # If the request can expire, it means we should cache it + # in the meantime. + elif 'expires' in r.headers: + if int(r.headers['expires']) > 0: + cache.set(cache_hash, r) diff --git a/requests/sessions.py b/requests/sessions.py index 5e827543..f6416777 100644 --- a/requests/sessions.py +++ b/requests/sessions.py @@ -11,6 +11,7 @@ requests (cookies, auth, proxies). 
from copy import deepcopy from .compat import cookielib +from .cache import expand_cache from .cookies import cookiejar_from_dict, remove_cookie_by_name from .defaults import defaults from .models import Request @@ -81,10 +82,11 @@ class Session(object): self.prefetch = prefetch self.verify = verify self.cert = cert - self.cache = cache + self.cache = expand_cache(cache) - from .cache import ReqCache - self.cache = ReqCache("test", "memory") + + # from .cache import ReqCache + # self.cache = ReqCache("test", "memory") for (k, v) in list(defaults.items()): self.config.setdefault(k, deepcopy(v)) @@ -0,0 +1,26 @@ +# mycache = ReqCache("test", "memory") +import requests +s = requests.session() + +r = s.get('http://github.com') +print r.__dict__.get('from_cache') + +r = s.get('http://github.com') +print r.__dict__.get('from_cache') + + +r = s.get('http://github.com') +print r.__dict__.get('from_cache') + +r = s.get('http://github.com') +print r.__dict__.get('from_cache') + +r = s.get('http://github.com') +print r.__dict__.get('from_cache') + +r = s.get('http://github.com') +print r.__dict__.get('from_cache') +# r = requests.get('http://github.com', hooks=mycache.hooks) + +# r = requests.get('http://github.com', hooks=mycache.hooks) +# explain_cache_result(r) |