summaryrefslogtreecommitdiff
path: root/magic
diff options
context:
space:
mode:
authorAdam Hupp <adam@hupp.org>2017-12-03 23:04:32 -0800
committerAdam Hupp <adam@hupp.org>2017-12-03 23:04:32 -0800
commit9ae12462c15100941435acf4eb9caaf5de5dddf9 (patch)
tree3cdb39db5aba276af4d6156fca13a63266a7e4b1 /magic
parenta96081edc65ddcf20599b3fa1ef72eb2c55f1055 (diff)
downloadpython-magic-9ae12462c15100941435acf4eb9caaf5de5dddf9.tar.gz
Convert to a package so we can add more files
Diffstat (limited to 'magic')
-rw-r--r--magic/__init__.py426
1 files changed, 426 insertions, 0 deletions
diff --git a/magic/__init__.py b/magic/__init__.py
new file mode 100644
index 0000000..3b351e2
--- /dev/null
+++ b/magic/__init__.py
@@ -0,0 +1,426 @@
+"""
+magic is a wrapper around the libmagic file identification library.
+
+See README for more information.
+
+Usage:
+
+>>> import magic
+>>> magic.from_file("testdata/test.pdf")
+'PDF document, version 1.2'
+>>> magic.from_file("testdata/test.pdf", mime=True)
+'application/pdf'
+>>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
+'PDF document, version 1.2'
+>>>
+
+"""
+
+import sys
+import glob
+import ctypes
+import ctypes.util
+import threading
+
+from ctypes import c_char_p, c_int, c_size_t, c_void_p, byref, POINTER
+
+
+class MagicException(Exception):
+    """
+    Error raised when a libmagic call reports a failure.
+
+    The text passed to the constructor is stored on `message` as well as
+    in the regular exception arguments.
+    """
+
+    def __init__(self, message):
+        self.message = message
+        super(MagicException, self).__init__(message)
+
+
+class Magic(object):
+    """
+    Magic is a wrapper around the libmagic C library.
+
+    Every instance opens its own libmagic cookie using flags derived from
+    the constructor arguments. from_buffer, from_file and from_descriptor
+    hold a per-instance lock while they call into libmagic; setparam and
+    getparam do not take that lock.
+    """
+
+    def __init__(self, mime=False, magic_file=None, mime_encoding=False,
+                 keep_going=False, uncompress=False, raw=False, extension=False):
+        """
+        Create a new libmagic wrapper.
+
+        mime - if True, mimetypes are returned instead of textual descriptions
+        mime_encoding - if True, codec is returned
+        magic_file - use a mime database other than the system default
+        keep_going - don't stop at the first match, keep going
+        uncompress - Try to look inside compressed files.
+        raw - Do not try to decode "non-printable" chars.
+        extension - Print a slash-separated list of valid extensions for the file type found.
+
+        Raises MagicException if loading the database fails, and
+        NotImplementedError if `extension` is requested but the loaded
+        libmagic does not report a version of at least 524.
+        """
+
+        # assigned first so that __del__ finds the attribute even when a
+        # later step of the constructor raises
+        self.cookie = None
+        self.flags = MAGIC_NONE
+        if mime:
+            self.flags |= MAGIC_MIME_TYPE
+        if mime_encoding:
+            self.flags |= MAGIC_MIME_ENCODING
+        if keep_going:
+            self.flags |= MAGIC_CONTINUE
+        if uncompress:
+            self.flags |= MAGIC_COMPRESS
+        if raw:
+            self.flags |= MAGIC_RAW
+        if extension:
+            self.flags |= MAGIC_EXTENSION
+
+        self.cookie = magic_open(self.flags)
+        self.lock = threading.Lock()
+
+        magic_load(self.cookie, magic_file)
+
+        # MAGIC_EXTENSION was added in 523 or 524, so bail if
+        # it doesn't appear to be available
+        if extension and (not _has_version or version() < 524):
+            raise NotImplementedError('MAGIC_EXTENSION is not supported in this version of libmagic')
+
+        # For https://github.com/ahupp/python-magic/issues/190
+        # libmagic has fixed internal limits that some files exceed, causing
+        # an error. We can avoid this (at least for the sample file given)
+        # by bumping the limit up. It's not clear if this is a general solution
+        # or whether other internal limits should be increased, but given
+        # the lack of other reports I'll assume this is rare.
+        if _has_param:
+            try:
+                self.setparam(MAGIC_PARAM_NAME_MAX, 64)
+            except MagicException:
+                # some versions of libmagic fail this call,
+                # so rather than fail hard just use default behavior
+                pass
+
+    def from_buffer(self, buf):
+        """
+        Identify the contents of `buf`
+
+        On python3 a str (or str subclass) is encoded to utf-8 bytes
+        before it is handed to libmagic.
+        """
+        with self.lock:
+            try:
+                # if we're on python3, convert buf to bytes
+                # otherwise this string is passed as wchar*
+                # which is not what libmagic expects
+                # (isinstance so that str subclasses are converted too)
+                if isinstance(buf, str) and str != bytes:
+                    buf = buf.encode('utf-8', errors='replace')
+                return maybe_decode(magic_buffer(self.cookie, buf))
+            except MagicException as e:
+                return self._handle509Bug(e)
+
+    def from_file(self, filename):
+        """
+        Identify the contents of the file named `filename`
+
+        The file is opened (and immediately closed) first so that a
+        missing or unreadable file raises the usual open() error.
+        """
+        # raise FileNotFoundException or IOError if the file does not exist
+        with open(filename):
+            pass
+        with self.lock:
+            try:
+                return maybe_decode(magic_file(self.cookie, filename))
+            except MagicException as e:
+                return self._handle509Bug(e)
+
+    def from_descriptor(self, fd):
+        """
+        Identify the contents of the file descriptor `fd`
+        """
+        with self.lock:
+            try:
+                return maybe_decode(magic_descriptor(self.cookie, fd))
+            except MagicException as e:
+                return self._handle509Bug(e)
+
+    def _handle509Bug(self, e):
+        """
+        Turn a message-less failure in mime mode into a generic mimetype.
+
+        Returns "application/octet-stream" when `e` has no message and
+        this instance was created with MAGIC_MIME_TYPE; otherwise
+        re-raises `e`.
+        """
+        # libmagic 5.09 has a bug where it might fail to identify the
+        # mimetype of a file and returns null from magic_file (and
+        # likely _buffer), but also does not return an error message.
+        if e.message is None and (self.flags & MAGIC_MIME_TYPE):
+            return "application/octet-stream"
+        else:
+            raise e
+
+    def setparam(self, param, val):
+        """
+        Set the libmagic parameter `param` (a MAGIC_PARAM_* value) to `val`
+        """
+        return magic_setparam(self.cookie, param, val)
+
+    def getparam(self, param):
+        """
+        Return the current value of the libmagic parameter `param`
+        """
+        return magic_getparam(self.cookie, param)
+
+    def __del__(self):
+        # no _thread_check here because there can be no other
+        # references to this object at this point.
+
+        # during shutdown magic_close may have been cleared already so
+        # make sure it exists before using it.
+
+        # the self.cookie check should be unnecessary and was an
+        # incorrect fix for a threading problem, however I'm leaving
+        # it in because it's harmless and I'm slightly afraid to
+        # remove it.
+
+        # getattr because __del__ also runs for objects whose __init__
+        # body never started (e.g. called with an unknown keyword), in
+        # which case the attribute does not exist at all.
+        cookie = getattr(self, 'cookie', None)
+        if cookie and magic_close:
+            magic_close(cookie)
+            self.cookie = None
+
+# Magic objects shared by the module-level helpers below, keyed by the
+# `mime` argument they were created with.
+_instances = {}
+
+
+def _get_magic_type(mime):
+    """
+    Return the shared Magic object for `mime`, creating and caching it
+    the first time it is requested.
+    """
+    cached = _instances.get(mime)
+    if cached is not None:
+        return cached
+    cached = Magic(mime=mime)
+    _instances[mime] = cached
+    return cached
+
+
+def from_file(filename, mime=False):
+    """
+    Identify the file named `filename` using the shared Magic object.
+
+    The result is the mimetype when mime=True and a human readable
+    description otherwise.
+
+    >>> magic.from_file("testdata/test.pdf", mime=True)
+    'application/pdf'
+    """
+    return _get_magic_type(mime).from_file(filename)
+
+
+def from_buffer(buffer, mime=False):
+    """
+    Identify the binary string `buffer` using the shared Magic object.
+
+    The result is the mimetype when mime=True and a human readable
+    description otherwise.
+
+    >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
+    'PDF document, version 1.2'
+    """
+    return _get_magic_type(mime).from_buffer(buffer)
+
+
+def from_descriptor(fd, mime=False):
+    """
+    Identify the file descriptor `fd` using the shared Magic object.
+
+    The result is the mimetype when mime=True and a human readable
+    description otherwise.
+
+    >>> f = open("testdata/test.pdf")
+    >>> magic.from_descriptor(f.fileno())
+    'PDF document, version 1.2'
+    """
+    return _get_magic_type(mime).from_descriptor(fd)
+
+
+# Locate and load the libmagic shared library. The search first asks
+# ctypes.util.find_library for a list of known names and then, if that
+# produced nothing loadable, tries a per-platform list of file names/paths.
+libmagic = None
+# Let's try to find magic or magic1
+dll = ctypes.util.find_library('magic') \
+    or ctypes.util.find_library('magic1') \
+    or ctypes.util.find_library('cygmagic-1') \
+    or ctypes.util.find_library('libmagic-1') \
+    or ctypes.util.find_library('msys-magic-1')  # for MSYS2
+
+# necessary because find_library returns None if it doesn't find the library
+if dll:
+    try:
+        libmagic = ctypes.CDLL(dll)
+    except OSError:
+        # the name was found but could not be loaded; keep going so the
+        # platform fallbacks below still get a chance and a failure ends
+        # in ImportError rather than OSError
+        libmagic = None
+
+if not libmagic or not libmagic._name:
+    windows_dlls = ['magic1.dll', 'cygmagic-1.dll', 'libmagic-1.dll', 'msys-magic-1.dll']
+    platform_to_lib = {'darwin': ['/opt/local/lib/libmagic.dylib',
+                                  '/usr/local/lib/libmagic.dylib'] +
+                       # Assumes there will only be one version installed
+                       glob.glob('/usr/local/Cellar/libmagic/*/lib/libmagic.dylib'),  # flake8:noqa
+                       'win32': windows_dlls,
+                       'cygwin': windows_dlls,
+                       'linux': ['libmagic.so.1'],  # fallback for some Linuxes (e.g. Alpine) where library search does not work # flake8:noqa
+                       }
+    # sys.platform may carry a suffix on linux (hence startswith)
+    platform = 'linux' if sys.platform.startswith('linux') else sys.platform
+    for dll in platform_to_lib.get(platform, []):
+        try:
+            libmagic = ctypes.CDLL(dll)
+            break
+        except OSError:
+            pass
+
+if not libmagic or not libmagic._name:
+    # It is better to raise an ImportError since we are importing magic module
+    raise ImportError('failed to find libmagic. Check your installation')
+
+# C type used for the libmagic cookie in the bindings below
+magic_t = ctypes.c_void_p
+
+
+def errorcheck_null(result, func, args):
+    """
+    ctypes errcheck hook for calls that signal failure with a null result.
+
+    Returns `result` unchanged unless it is None, in which case the error
+    text for the cookie in args[0] is fetched with magic_error and raised
+    as a MagicException.
+    """
+    if result is not None:
+        return result
+    raise MagicException(magic_error(args[0]))
+
+
+def errorcheck_negative_one(result, func, args):
+    """
+    ctypes errcheck hook for calls that signal failure by returning -1.
+
+    Returns `result` unchanged unless it equals -1, in which case the
+    error text for the cookie in args[0] is fetched with magic_error and
+    raised as a MagicException.
+    """
+    if result != -1:
+        return result
+    raise MagicException(magic_error(args[0]))
+
+
+# Always hand back the native str type: on python2 that is the byte
+# string itself (decoding would produce unicode), on python3 it is the
+# decoded text.
+def maybe_decode(s):
+    """
+    Return `s` as a native str, decoding it from utf-8 on python3.
+    """
+    if str != bytes:
+        # backslashreplace here because sometimes libmagic will return metadata in the charset
+        # of the file, which is unknown to us (e.g the title of a Word doc)
+        return s.decode('utf-8', 'backslashreplace')
+    return s
+
+
+def coerce_filename(filename):
+    """
+    Convert a text filename to utf-8 bytes for passing to libmagic.
+
+    None and values that are not text (e.g. bytes) are returned unchanged.
+    """
+    if filename is None:
+        return None
+
+    # ctypes will implicitly convert unicode strings to bytes with
+    # .encode('ascii'). If you use the filesystem encoding
+    # then you'll get inconsistent behavior (crashes) depending on the user's
+    # LANG environment variable
+    if sys.version_info[0] <= 2:
+        text_type = unicode
+    else:
+        text_type = str
+
+    if not isinstance(filename, text_type):
+        return filename
+    return filename.encode('utf-8', 'surrogateescape')
+
+
+# Raw ctypes bindings for the cookie lifecycle and error reporting.
+# None of these install an errcheck hook, so callers see the raw result.
+
+# takes an int of flags, returns a cookie (magic_t)
+magic_open = libmagic.magic_open
+magic_open.restype = magic_t
+magic_open.argtypes = [c_int]
+
+# takes a cookie, returns nothing
+magic_close = libmagic.magic_close
+magic_close.restype = None
+magic_close.argtypes = [magic_t]
+
+# takes a cookie, returns a C string; used by the errcheck hooks to
+# build the MagicException message
+magic_error = libmagic.magic_error
+magic_error.restype = c_char_p
+magic_error.argtypes = [magic_t]
+
+# takes a cookie, returns an int
+magic_errno = libmagic.magic_errno
+magic_errno.restype = c_int
+magic_errno.argtypes = [magic_t]
+
+# raw binding; a null result is turned into MagicException by the hook
+_magic_file = libmagic.magic_file
+_magic_file.argtypes = [magic_t, c_char_p]
+_magic_file.restype = c_char_p
+_magic_file.errcheck = errorcheck_null
+
+
+def magic_file(cookie, filename):
+    """
+    Call libmagic's magic_file on `filename` after converting it with
+    coerce_filename, and return the raw result.
+    """
+    encoded = coerce_filename(filename)
+    return _magic_file(cookie, encoded)
+
+
+# raw binding; a null result is turned into MagicException by the hook
+_magic_buffer = libmagic.magic_buffer
+_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
+_magic_buffer.restype = c_char_p
+_magic_buffer.errcheck = errorcheck_null
+
+
+def magic_buffer(cookie, buf):
+    """
+    Call libmagic's magic_buffer on `buf`, passing len(buf) as its size,
+    and return the raw result.
+    """
+    size = len(buf)
+    return _magic_buffer(cookie, buf, size)
+
+
+# raw binding; a null result is turned into MagicException by the hook
+_magic_descriptor = libmagic.magic_descriptor
+_magic_descriptor.argtypes = [magic_t, c_int]
+_magic_descriptor.restype = c_char_p
+_magic_descriptor.errcheck = errorcheck_null
+
+
+def magic_descriptor(cookie, fd):
+    """
+    Call libmagic's magic_descriptor on the file descriptor `fd` and
+    return the raw result.
+    """
+    described = _magic_descriptor(cookie, fd)
+    return described
+
+
+# raw binding; a -1 result is turned into MagicException by the hook
+_magic_load = libmagic.magic_load
+_magic_load.argtypes = [magic_t, c_char_p]
+_magic_load.restype = c_int
+_magic_load.errcheck = errorcheck_negative_one
+
+
+def magic_load(cookie, filename):
+    """
+    Call libmagic's magic_load with `filename` converted by
+    coerce_filename (None is passed through as None).
+    """
+    encoded = coerce_filename(filename)
+    return _magic_load(cookie, encoded)
+
+
+# Raw ctypes bindings exposed without wrappers or errcheck hooks; each
+# returns the C int result unchanged.
+
+# takes a cookie and an int of flags
+magic_setflags = libmagic.magic_setflags
+magic_setflags.restype = c_int
+magic_setflags.argtypes = [magic_t, c_int]
+
+# takes a cookie and a C string
+magic_check = libmagic.magic_check
+magic_check.restype = c_int
+magic_check.argtypes = [magic_t, c_char_p]
+
+# takes a cookie and a C string
+magic_compile = libmagic.magic_compile
+magic_compile.restype = c_int
+magic_compile.argtypes = [magic_t, c_char_p]
+
+# The parameter API is only bound when the loaded library exports both
+# entry points.
+_has_param = (hasattr(libmagic, 'magic_setparam') and
+              hasattr(libmagic, 'magic_getparam'))
+if _has_param:
+    _magic_setparam = libmagic.magic_setparam
+    _magic_setparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
+    _magic_setparam.restype = c_int
+    _magic_setparam.errcheck = errorcheck_negative_one
+
+    _magic_getparam = libmagic.magic_getparam
+    _magic_getparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
+    _magic_getparam.restype = c_int
+    _magic_getparam.errcheck = errorcheck_negative_one
+
+
+def magic_setparam(cookie, param, val):
+    """
+    Set parameter `param` on `cookie` to `val` (passed as a size_t).
+
+    Raises NotImplementedError when the library lacks the parameter API.
+    """
+    if _has_param:
+        boxed = c_size_t(val)
+        return _magic_setparam(cookie, param, byref(boxed))
+    raise NotImplementedError("magic_setparam not implemented")
+
+
+def magic_getparam(cookie, param):
+    """
+    Return the value of parameter `param` on `cookie`.
+
+    Raises NotImplementedError when the library lacks the parameter API.
+    """
+    if _has_param:
+        boxed = c_size_t()
+        _magic_getparam(cookie, param, byref(boxed))
+        return boxed.value
+    raise NotImplementedError("magic_getparam not implemented")
+
+# magic_version is only bound when the loaded library exports it.
+_has_version = hasattr(libmagic, "magic_version")
+if _has_version:
+    magic_version = libmagic.magic_version
+    magic_version.argtypes = []
+    magic_version.restype = c_int
+
+
+def version():
+    """
+    Return the int reported by libmagic's magic_version.
+
+    Raises NotImplementedError when the library does not export it.
+    """
+    if _has_version:
+        return magic_version()
+    raise NotImplementedError("magic_version not implemented")
+
+# Flag bits accepted by magic_open / magic_setflags; combine with |.
+MAGIC_NONE = 0x000000 # No flags
+MAGIC_DEBUG = 0x000001 # Turn on debugging
+MAGIC_SYMLINK = 0x000002 # Follow symlinks
+MAGIC_COMPRESS = 0x000004 # Check inside compressed files
+MAGIC_DEVICES = 0x000008 # Look at the contents of devices
+MAGIC_MIME_TYPE = 0x000010 # Return a mime string
+MAGIC_MIME_ENCODING = 0x000400 # Return the MIME encoding
+# TODO: should be
+# MAGIC_MIME = MAGIC_MIME_TYPE | MAGIC_MIME_ENCODING
+# (as written it has the same value as MAGIC_MIME_TYPE)
+MAGIC_MIME = 0x000010 # Return a mime string
+MAGIC_EXTENSION = 0x1000000 # Return a /-separated list of extensions
+
+MAGIC_CONTINUE = 0x000020 # Return all matches
+MAGIC_CHECK = 0x000040 # Print warnings to stderr
+MAGIC_PRESERVE_ATIME = 0x000080 # Restore access time on exit
+MAGIC_RAW = 0x000100 # Don't translate unprintable chars
+MAGIC_ERROR = 0x000200 # Handle ENOENT etc as real errors
+
+MAGIC_NO_CHECK_COMPRESS = 0x001000 # Don't check for compressed files
+MAGIC_NO_CHECK_TAR = 0x002000 # Don't check for tar files
+MAGIC_NO_CHECK_SOFT = 0x004000 # Don't check magic entries
+MAGIC_NO_CHECK_APPTYPE = 0x008000 # Don't check application type
+MAGIC_NO_CHECK_ELF = 0x010000 # Don't check for elf details
+MAGIC_NO_CHECK_ASCII = 0x020000 # Don't check for ascii files
+MAGIC_NO_CHECK_TROFF = 0x040000 # Don't check ascii/troff
+MAGIC_NO_CHECK_FORTRAN = 0x080000 # Don't check ascii/fortran
+MAGIC_NO_CHECK_TOKENS = 0x100000 # Don't check ascii/tokens
+
+# Parameter ids for magic_setparam / magic_getparam.
+MAGIC_PARAM_INDIR_MAX = 0 # Recursion limit for indirect magic
+MAGIC_PARAM_NAME_MAX = 1 # Use count limit for name/use magic
+MAGIC_PARAM_ELF_PHNUM_MAX = 2 # Max ELF program sections processed
+MAGIC_PARAM_ELF_SHNUM_MAX = 3 # Max ELF sections processed
+MAGIC_PARAM_ELF_NOTES_MAX = 4 # Max ELF notes processed
+MAGIC_PARAM_REGEX_MAX = 5 # Length limit for regex searches
+MAGIC_PARAM_BYTES_MAX = 6 # Max number of bytes to read from file