diff options
Diffstat (limited to 'numpy/lib/_datasource.py')
-rw-r--r-- | numpy/lib/_datasource.py | 443 |
1 files changed, 443 insertions, 0 deletions
diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py new file mode 100644 index 000000000..06aae85d8 --- /dev/null +++ b/numpy/lib/_datasource.py @@ -0,0 +1,443 @@ +"""A file interface for handling local and remote data files. +The goal of datasource is to abstract some of the file system operations when +dealing with data files so the researcher doesn't have to know all the +low-level details. Through datasource, a researcher can obtain and use a +file with one function call, regardless of location of the file. + +DataSource is meant to augment standard python libraries, not replace them. +It should work seemlessly with standard file IO operations and the os module. + +DataSource files can originate locally or remotely: + +- local files : '/home/guido/src/local/data.txt' +- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' + +DataSource files can also be compressed or uncompressed. Currently only gzip +and bz2 are supported. + +Example: + + >>> # Create a DataSource, use os.curdir (default) for local storage. + >>> ds = datasource.DataSource() + >>> + >>> # Open a remote file. + >>> # DataSource downloads the file, stores it locally in: + >>> # './www.google.com/index.html' + >>> # opens the file and returns a file object. + >>> fp = ds.open('http://www.google.com/index.html') + >>> + >>> # Use the file as you normally would + >>> fp.read() + >>> fp.close() + +""" + +__docformat__ = "restructuredtext en" + +import bz2 +import gzip +import os +import tempfile +from shutil import rmtree +from urllib2 import urlopen, URLError +from urlparse import urlparse + + +# TODO: .zip support, .tar support? +_file_openers = {".gz":gzip.open, ".bz2":bz2.BZ2File, None:file} + + +def open(path, mode='r', destpath=os.curdir): + """Open ``path`` with ``mode`` and return the file object. + + If ``path`` is an URL, it will be downloaded, stored in the DataSource + directory and opened from there. + + *Parameters*: + + path : {string} + + mode : {string}, optional + + destpath : {string}, optional + Destination directory where URLs will be downloaded and stored. + + *Returns*: + + file object + + """ + + ds = DataSource(destpath) + return ds.open(path, mode) + + +class DataSource (object): + """A generic data source file (file, http, ftp, ...). + + DataSources could be local files or remote files/URLs. The files may + also be compressed or uncompressed. DataSource hides some of the low-level + details of downloading the file, allowing you to simply pass in a valid + file path (or URL) and obtain a file object. + + *Methods*: + + - exists : test if the file exists locally or remotely + - abspath : get absolute path of the file in the DataSource directory + - open : open the file + + *Example URL DataSource*:: + + # Initialize DataSource with a local directory, default is os.curdir. + ds = DataSource('/home/guido') + + # Open remote file. + # File will be downloaded and opened from here: + # /home/guido/site/xyz.txt + ds.open('http://fake.xyz.web/site/xyz.txt') + + *Example using DataSource for temporary files*:: + + # Initialize DataSource with 'None' for the local directory. + ds = DataSource(None) + + # Open local file. + # Opened file exists in a temporary directory like: + # /tmp/tmpUnhcvM/foobar.txt + # Temporary directories are deleted when the DataSource is deleted. + ds.open('/home/guido/foobar.txt') + + *Notes*: + BUG : URLs require a scheme string ('http://') to be used. + www.google.com will fail. + + >>> repos.exists('www.google.com/index.html') + False + + >>> repos.exists('http://www.google.com/index.html') + True + + """ + + def __init__(self, destpath=os.curdir): + """Create a DataSource with a local path at destpath.""" + if destpath: + self._destpath = os.path.abspath(destpath) + self._istmpdest = False + else: + self._destpath = tempfile.mkdtemp() + self._istmpdest = True + + def __del__(self): + # Remove temp directories + if self._istmpdest: + rmtree(self._destpath) + + def _iszip(self, filename): + """Test if the filename is a zip file by looking at the file extension. + """ + fname, ext = os.path.splitext(filename) + return ext in _file_openers.keys() + + def _iswritemode(self, mode): + """Test if the given mode will open a file for writing.""" + + # Currently only used to test the bz2 files. + _writemodes = ("w", "+") + for c in mode: + if c in _writemodes: + return True + return False + + def _splitzipext(self, filename): + """Split zip extension from filename and return filename. + + *Returns*: + base, zip_ext : {tuple} + + """ + + if self._iszip(filename): + return os.path.splitext(filename) + else: + return filename, None + + def _possible_names(self, filename): + """Return a tuple containing compressed filename variations.""" + names = [filename] + if not self._iszip(filename): + for zipext in _file_openers.keys(): + if zipext: + names.append(filename+zipext) + return names + + def _isurl(self, path): + """Test if path is a net location. Tests the scheme and netloc.""" + + # BUG : URLs require a scheme string ('http://') to be used. + # www.google.com will fail. + # Should we prepend the scheme for those that don't have it and + # test that also? Similar to the way we append .gz and test for + # for compressed versions of files. + + scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path) + return bool(scheme and netloc) + + def _cache(self, path): + """Cache the file specified by path. + + Creates a copy of the file in the datasource cache. + + """ + + upath = self.abspath(path) + + # ensure directory exists + if not os.path.exists(os.path.dirname(upath)): + os.makedirs(os.path.dirname(upath)) + + # TODO: Doesn't handle compressed files! + if self._isurl(path): + try: + openedurl = urlopen(path) + file(upath, 'w').write(openedurl.read()) + except URLError: + raise URLError("URL not found: ", path) + else: + try: + # TODO: Why not just copy the file with shutils.copyfile? + fp = file(path, 'r') + file(upath, 'w').write(fp.read()) + except IOError: + raise IOError("File not found: ", path) + return upath + + def _findfile(self, path): + """Searches for ``path`` and returns full path if found. + + If path is an URL, _findfile will cache a local copy and return + the path to the cached file. + If path is a local file, _findfile will return a path to that local + file. + + The search will include possible compressed versions of the file and + return the first occurence found. + + """ + + # Build list of possible local file paths + if not self._isurl(path): + # Valid local paths + filelist = self._possible_names(path) + # Paths in self._destpath + filelist += self._possible_names(self.abspath(path)) + else: + # Cached URLs in self._destpath + filelist = self._possible_names(self.abspath(path)) + # Remote URLs + filelist = filelist + self._possible_names(path) + + for name in filelist: + if self.exists(name): + if self._isurl(name): + name = self._cache(name) + return name + return None + + def abspath(self, path): + """Return absolute path of ``path`` in the DataSource directory. + + If ``path`` is an URL, the ``abspath`` will be either the location + the file exists locally or the location it would exist when opened + using the ``open`` method. + + The functionality is idential to os.path.abspath. + + *Parameters*: + + path : {string} + Can be a local file or a remote URL. + + *Returns*: + + Complete path, rooted in the DataSource destination directory. + + *See Also*: + + `open` : Method that downloads and opens files. + + """ + + # TODO: This should be more robust. Handles case where path includes + # the destpath, but not other sub-paths. Failing case: + # path = /home/guido/datafile.txt + # destpath = /home/alex/ + # upath = self.abspath(path) + # upath == '/home/alex/home/guido/datafile.txt' + + # handle case where path includes self._destpath + splitpath = path.split(self._destpath, 2) + if len(splitpath) > 1: + path = splitpath[1] + scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path) + return os.path.join(self._destpath, netloc, upath.strip(os.sep)) + + def exists(self, path): + """Test if ``path`` exists. + + Test if ``path`` exists as (and in this order): + + - a local file. + - a remote URL that have been downloaded and stored locally in the + DataSource directory. + - a remote URL that has not been downloaded, but is valid and + accessible. + + *Parameters*: + + path : {string} + Can be a local file or a remote URL. + + *Returns*: + + boolean + + *See Also*: + + `abspath` + + *Notes* + + When ``path`` is an URL, ``exist`` will return True if it's either + stored locally in the DataSource directory, or is a valid remote + URL. DataSource does not discriminate between to two, the file + is accessible if it exists in either location. + + """ + + # Test local path + if os.path.exists(path): + return True + + # Test cached url + upath = self.abspath(path) + if os.path.exists(upath): + return True + + # Test remote url + if self._isurl(path): + try: + netfile = urlopen(path) + del(netfile) + return True + except URLError: + return False + return False + + def open(self, path, mode='r'): + """Open ``path`` with ``mode`` and return the file object. + + If ``path`` is an URL, it will be downloaded, stored in the DataSource + directory and opened from there. + + *Parameters*: + + path : {string} + + mode : {string}, optional + + + *Returns*: + + file object + + """ + + # TODO: There is no support for opening a file for writing which + # doesn't exist yet (creating a file). Should there be? + + # TODO: Add a ``subdir`` parameter for specifying the subdirectory + # used to store URLs in self._destpath. + + if self._isurl(path) and self._iswritemode(mode): + raise ValueError("URLs are not writeable") + + # NOTE: _findfile will fail on a new file opened for writing. + found = self._findfile(path) + if found: + _fname, ext = self._splitzipext(found) + if ext == 'bz2': + mode.replace("+", "") + return _file_openers[ext](found, mode=mode) + else: + raise IOError("%s not found." % path) + + +class Repository (DataSource): + """A data Repository where multiple DataSource's share a base URL/directory. + + Repository extends DataSource by prepending a base URL (or directory) to + all the files it handles. Use a Repository when you will be working with + multiple files from one base URL. Initialize the Respository with the + base URL, then refer to each file by it's filename only. + + *Methods*: + + - exists : test if the file exists locally or remotely + - abspath : get absolute path of the file in the DataSource directory + - open : open the file + + *Toy example*:: + + # Analyze all files in the repository. + repos = Repository('/home/user/data/dir/') + for filename in filelist: + fp = repos.open(filename) + fp.analyze() + fp.close() + + # Similarly you could use a URL for a repository. + repos = Repository('http://www.xyz.edu/data') + + """ + + def __init__(self, baseurl, destpath=os.curdir): + """Create a Repository with a shared url or directory of baseurl.""" + DataSource.__init__(self, destpath=destpath) + self._baseurl = baseurl + + def __del__(self): + DataSource.__del__(self) + + def _fullpath(self, path): + """Return complete path for path. Prepends baseurl if necessary.""" + splitpath = path.split(self._baseurl, 2) + if len(splitpath) == 1: + result = os.path.join(self._baseurl, path) + else: + result = path # path contains baseurl already + return result + + def _findfile(self, path): + """Extend DataSource method to prepend baseurl to ``path``.""" + return DataSource._findfile(self, self._fullpath(path)) + + def abspath(self, path): + """Extend DataSource method to prepend baseurl to ``path``.""" + return DataSource.abspath(self, self._fullpath(path)) + + def exists(self, path): + """Extend DataSource method to prepend baseurl to ``path``.""" + return DataSource.exists(self, self._fullpath(path)) + + def open(self, path, mode='r'): + """Extend DataSource method to prepend baseurl to ``path``.""" + return DataSource.open(self, self._fullpath(path), mode) + + def listdir(self): + '''List files in the source Repository.''' + if self._isurl(self._baseurl): + raise NotImplementedError, \ + "Directory listing of URLs, not supported yet." + else: + return os.listdir(self._baseurl) |