summaryrefslogtreecommitdiff
path: root/numpy/lib/_datasource.py
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/lib/_datasource.py')
-rw-r--r--numpy/lib/_datasource.py443
1 files changed, 443 insertions, 0 deletions
diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
new file mode 100644
index 000000000..06aae85d8
--- /dev/null
+++ b/numpy/lib/_datasource.py
@@ -0,0 +1,443 @@
+"""A file interface for handling local and remote data files.
+The goal of datasource is to abstract some of the file system operations when
+dealing with data files so the researcher doesn't have to know all the
+low-level details. Through datasource, a researcher can obtain and use a
+file with one function call, regardless of location of the file.
+
+DataSource is meant to augment standard python libraries, not replace them.
+It should work seamlessly with standard file IO operations and the os module.
+
+DataSource files can originate locally or remotely:
+
+- local files : '/home/guido/src/local/data.txt'
+- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
+
+DataSource files can also be compressed or uncompressed. Currently only gzip
+and bz2 are supported.
+
+Example:
+
+ >>> # Create a DataSource, use os.curdir (default) for local storage.
+ >>> ds = datasource.DataSource()
+ >>>
+ >>> # Open a remote file.
+ >>> # DataSource downloads the file, stores it locally in:
+ >>> # './www.google.com/index.html'
+ >>> # opens the file and returns a file object.
+ >>> fp = ds.open('http://www.google.com/index.html')
+ >>>
+ >>> # Use the file as you normally would
+ >>> fp.read()
+ >>> fp.close()
+
+"""
+
+__docformat__ = "restructuredtext en"
+
+import bz2
+import gzip
+import os
+import tempfile
+from shutil import rmtree
+from urllib2 import urlopen, URLError
+from urlparse import urlparse
+
+
+# Maps a compression extension to its opener (None: not compressed). TODO: .zip support, .tar support?
+_file_openers = {".gz":gzip.open, ".bz2":bz2.BZ2File, None:file}
+
+
def open(path, mode='r', destpath=os.curdir):
    """Open ``path`` with ``mode`` and return the file object.

    Convenience wrapper that builds a one-off DataSource rooted at
    ``destpath`` and hands the request to its ``open`` method.  If ``path``
    is an URL, it will be downloaded, stored in the DataSource directory
    and opened from there.

    *Parameters*:

        path : {string}
            Local file path or URL of the file to open.

        mode : {string}, optional
            Mode passed through to ``DataSource.open``.

        destpath : {string}, optional
            Destination directory where URLs will be downloaded and stored.

    *Returns*:

        file object

    """
    return DataSource(destpath).open(path, mode)
+
+
class DataSource(object):
    """A generic data source file (file, http, ftp, ...).

    DataSources could be local files or remote files/URLs.  The files may
    also be compressed or uncompressed.  DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    *Methods*:

        - exists : test if the file exists locally or remotely
        - abspath : get absolute path of the file in the DataSource directory
        - open : open the file

    *Example URL DataSource*::

        # Initialize DataSource with a local directory, default is os.curdir.
        ds = DataSource('/home/guido')

        # Open remote file.
        # File will be downloaded and opened from here:
        #     /home/guido/fake.xyz.web/site/xyz.txt
        ds.open('http://fake.xyz.web/site/xyz.txt')

    *Example using DataSource for temporary files*::

        # Initialize DataSource with 'None' for the local directory.
        ds = DataSource(None)

        # Open remote file.
        # The downloaded copy is stored in a temporary directory like:
        #     /tmp/tmpUnhcvM/fake.xyz.web/site/xyz.txt
        # Temporary directories are deleted when the DataSource is deleted.
        ds.open('http://fake.xyz.web/site/xyz.txt')

    *Notes*:
        BUG : URLs require a scheme string ('http://') to be used.
              www.google.com will fail::

                  ds.exists('www.google.com/index.html')          # False
                  ds.exists('http://www.google.com/index.html')   # True

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath.

        A false ``destpath`` (such as ``None``) selects a freshly created
        temporary directory, which is removed when the DataSource is deleted.

        """
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  getattr() guards instances whose
        # __init__ never got as far as setting the flag.
        if getattr(self, '_istmpdest', False):
            rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.
        """
        fname, ext = os.path.splitext(filename)
        return ext in _file_openers

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Any mode string containing 'w', 'a' or '+' counts as a write mode.

        """
        _writemodes = ("w", "a", "+")
        for c in mode:
            if c in _writemodes:
                return True
        return False

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        *Returns*:
            base, zip_ext : {tuple}
                ``zip_ext`` keeps its leading period (e.g. '.gz') and is
                None when the filename has no known compression extension.

        """
        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list starts with ``filename`` itself; if it has no known
        compression extension, one entry per supported extension follows.

        """
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers:
                if zipext:
                    names.append(filename + zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _sanitize_relative_path(self, path):
        """Return ``path`` as a relative path with no upward references.

        Empty, current-directory and parent-directory components are
        dropped, so joining the result onto a base directory cannot yield a
        location outside that directory.

        """
        parts = []
        for part in path.replace(os.sep, '/').split('/'):
            if part and part not in (os.curdir, os.pardir):
                parts.append(part)
        return os.sep.join(parts)

    def _store(self, source, upath):
        """Copy the open file-like object ``source`` into the file ``upath``.

        The destination is written in binary mode and always closed.

        """
        import shutil  # local import: the module only imports rmtree

        # ``file`` is used because this module defines its own ``open``.
        target = file(upath, 'wb')
        try:
            shutil.copyfileobj(source, target)
        finally:
            target.close()

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache.

        *Returns*:

            Path of the cached copy (``self.abspath(path)``).

        *Raises*:

            URLError if ``path`` is an URL that cannot be opened.
            IOError if ``path`` is a local file that cannot be copied.

        """
        upath = self.abspath(path)

        # ensure directory exists
        destdir = os.path.dirname(upath)
        if not os.path.exists(destdir):
            os.makedirs(destdir)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            try:
                openedurl = urlopen(path)
                try:
                    self._store(openedurl, upath)
                finally:
                    openedurl.close()
            except URLError:
                raise URLError("URL not found: %s" % path)
        else:
            # Copying a file onto itself would truncate it before reading.
            if os.path.abspath(path) == upath:
                return upath
            try:
                fp = file(path, 'rb')
                try:
                    self._store(fp, upath)
                finally:
                    fp.close()
            except IOError:
                raise IOError("File not found: %s" % path)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return
        the path to the cached file.
        If path is a local file, _findfile will return a path to that local
        file.

        The search will include possible compressed versions of the file and
        return the first occurrence found, or None when nothing matches.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """Return absolute path of ``path`` in the DataSource directory.

        If ``path`` is an URL, the ``abspath`` will be either the location
        the file exists locally or the location it would exist when opened
        using the ``open`` method.  The URL's network location becomes the
        first directory below the DataSource directory.

        Parent-directory ('..') components in ``path`` are discarded so the
        result always stays below the DataSource directory.

        *Parameters*:

            path : {string}
                Can be a local file or a remote URL.

        *Returns*:

            Complete path, rooted in the DataSource destination directory.

        *See Also*:

            `open` : Method that downloads and opens files.

        """

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def exists(self, path):
        """Test if ``path`` exists.

        Test if ``path`` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          DataSource directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        *Parameters*:

            path : {string}
                Can be a local file or a remote URL.

        *Returns*:

            boolean

        *See Also*:

            `abspath`

        *Notes*

            When ``path`` is an URL, ``exists`` will return True if it's
            either stored locally in the DataSource directory, or is a valid
            remote URL.  DataSource does not discriminate between the two,
            the file is accessible if it exists in either location.

        """

        # Test local path
        if os.path.exists(path):
            return True

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                netfile = urlopen(path)
            except URLError:
                return False
            # Only reachability matters; release the connection right away.
            netfile.close()
            return True
        return False

    def open(self, path, mode='r'):
        """Open ``path`` with ``mode`` and return the file object.

        If ``path`` is an URL, it will be downloaded, stored in the DataSource
        directory and opened from there.

        *Parameters*:

            path : {string}
                Local file path or URL of the file to open.

            mode : {string}, optional
                Mode to open the file with.  URLs cannot be opened with a
                write mode.

        *Returns*:

            file object

        *Raises*:

            ValueError if ``path`` is an URL and ``mode`` is a write mode.
            IOError if ``path`` cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise IOError("%s not found." % path)

        _fname, ext = self._splitzipext(found)
        # The extension keeps its leading period, and str.replace returns a
        # new string, so the result has to be assigned back.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode)
+
+
class Repository(DataSource):
    """A data Repository where multiple DataSource's share a base URL/directory.

    Repository extends DataSource by prepending a base URL (or directory) to
    all the files it handles.  Use a Repository when you will be working with
    multiple files from one base URL.  Initialize the Repository with the
    base URL, then refer to each file by its filename only.

    *Methods*:

        - exists : test if the file exists locally or remotely
        - abspath : get absolute path of the file in the DataSource directory
        - open : open the file
        - listdir : list the files in a local base directory

    *Toy example*::

        # Analyze all files in the repository.
        repos = Repository('/home/user/data/dir/')
        for filename in filelist:
            fp = repos.open(filename)
            fp.analyze()
            fp.close()

        # Similarly you could use a URL for a repository.
        repos = Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary.

        ``path`` is returned unchanged when it already contains the base
        URL; otherwise the two are combined with ``os.path.join``.

        """
        # A membership test also copes with an empty baseurl, for which
        # str.split would raise ValueError (empty separator).
        if self._baseurl in path:
            return path  # path contains baseurl already
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return DataSource._findfile(self, self._fullpath(path))

    def abspath(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return DataSource.abspath(self, self._fullpath(path))

    def exists(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return DataSource.exists(self, self._fullpath(path))

    def open(self, path, mode='r'):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return DataSource.open(self, self._fullpath(path), mode)

    def listdir(self):
        """List files in the source Repository.

        *Raises*:

            NotImplementedError if the base of the Repository is an URL.

        """
        if self._isurl(self._baseurl):
            # Call form of raise: valid on every Python version, unlike the
            # ``raise Class, "message"`` statement form.
            raise NotImplementedError(
                "Directory listing of URLs, not supported yet.")
        return os.listdir(self._baseurl)