76 files changed, 1895 insertions, 1018 deletions
diff --git a/numpy/core/SConscript b/numpy/core/SConscript
index 46087610c..952544a25 100644
--- a/numpy/core/SConscript
+++ b/numpy/core/SConscript
@@ -429,7 +429,9 @@ env.Install('$distutils_installdir/lib/npy-pkg-config', mlib_ini)
 env.Install('$distutils_installdir/lib/npy-pkg-config', npymath_ini)
 
 # npysort core lib
-npysort_src = [env.GenerateFromTemplate(pjoin('src', 'npysort', 'sort.c.src'))]
+npysort_src = [env.GenerateFromTemplate(pjoin('src', 'npysort', 'quicksort.c.src')),
+               env.GenerateFromTemplate(pjoin('src', 'npysort', 'mergesort.c.src')),
+               env.GenerateFromTemplate(pjoin('src', 'npysort', 'heapsort.c.src'))]
 env.StaticExtLibrary("npysort", npysort_src)
 env.Prepend(LIBS=["npysort"])
 env.Prepend(LIBPATH=["."])
diff --git a/numpy/core/bento.info b/numpy/core/bento.info
index 04401f991..bf8ae3a81 100644
--- a/numpy/core/bento.info
+++ b/numpy/core/bento.info
@@ -10,7 +10,9 @@ Library:
             src/npymath/halffloat.c
     CompiledLibrary: npysort
         Sources:
-            src/npysort/sort.c.src
+            src/npysort/quicksort.c.src,
+            src/npysort/mergesort.c.src,
+            src/npysort/heapsort.c.src
     Extension: multiarray
         Sources:
             src/multiarray/multiarraymodule_onefile.c
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index 99ea072ca..5bdb33c94 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -10,4 +10,4 @@
 # PyArray_CountNonzero, PyArray_NewLikeArray and PyArray_MatrixProduct2.
 0x00000006 = e61d5dc51fa1c6459328266e215d6987
 # Version 7 (NumPy 1.7) improved datetime64, misc utilities.
-0x00000007 = 1768b6c404a3d5a2a6bfe7c68f89e3aa
+0x00000007 = e396ba3912dcf052eaee1b0b203a7724
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index fd2b9628e..c49c3c346 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -324,6 +324,10 @@ multiarray_funcs_api = {
     'PyArray_DebugPrint':                   285,
     'PyArray_FailUnlessWriteable':          286,
     'PyArray_SetUpdateIfCopyBase':          287,
+    'PyDataMem_NEW':                        288,
+    'PyDataMem_FREE':                       289,
+    'PyDataMem_RENEW':                      290,
+    'PyDataMem_SetEventHook':               291,
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 3e89079dd..d73f1313c 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -937,7 +937,7 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
 
     In NumPy 1.7, it continues to return a copy of the diagonal, but depending
     on this fact is deprecated. Writing to the resulting array continues to
-    work as it used to, but a DeprecationWarning will be issued.
+    work as it used to, but a FutureWarning will be issued.
 
     In NumPy 1.8, it will switch to returning a read-only view on the original
     array. Attempting to write to the resulting array will produce an error.
diff --git a/numpy/core/include/numpy/ndarrayobject.h b/numpy/core/include/numpy/ndarrayobject.h
index 74943d535..f00dd7744 100644
--- a/numpy/core/include/numpy/ndarrayobject.h
+++ b/numpy/core/include/numpy/ndarrayobject.h
@@ -229,8 +229,10 @@ PyArray_XDECREF_ERR(PyArrayObject *arr)
 
 #if PY_VERSION_HEX >= 0x02050000
 #define DEPRECATE(msg) PyErr_WarnEx(PyExc_DeprecationWarning,msg,1)
+#define DEPRECATE_FUTUREWARNING(msg) PyErr_WarnEx(PyExc_FutureWarning,msg,1)
 #else
 #define DEPRECATE(msg) PyErr_Warn(PyExc_DeprecationWarning,msg)
+#define DEPRECATE_FUTUREWARNING(msg) PyErr_Warn(PyExc_FutureWarning,msg)
 #endif
 
 
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 9605c20f8..13483396b 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -327,10 +327,7 @@ struct NpyAuxData_tag {
    * allocated.
    */
 
-  /* Data buffer */
-#define PyDataMem_NEW(size) ((char *)malloc(size))
-#define PyDataMem_FREE(ptr)  free(ptr)
-#define PyDataMem_RENEW(ptr,size) ((char *)realloc(ptr,size))
+  /* Data buffer - PyDataMem_NEW/FREE/RENEW are in multiarraymodule.c */
 
 #define NPY_USE_PYMEM 1
 
@@ -1714,6 +1711,13 @@ typedef struct {
                            */
 } PyArrayInterface;
 
+/*
+ * This is a function for hooking into the PyDataMem_NEW/FREE/RENEW functions.
+ * See the documentation for PyDataMem_SetEventHook.
+ */
+typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size,
+                                       void *user_data);
+
 #if !(defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API))
 #include "npy_deprecated_api.h"
 #endif
diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h
index d3d5404f5..4b9d0ea32 100644
--- a/numpy/core/include/numpy/npy_3kcompat.h
+++ b/numpy/core/include/numpy/npy_3kcompat.h
@@ -137,17 +137,6 @@ PyUnicode_Concat2(PyObject **left, PyObject *right)
     *left = newobj;
 }
 
-
-/*
- * Accessing items of ob_base
- */
-
-#if (PY_VERSION_HEX < 0x02060000)
-#define Py_TYPE(o)    (((PyObject*)(o))->ob_type)
-#define Py_REFCNT(o)  (((PyObject*)(o))->ob_refcnt)
-#define Py_SIZE(o)    (((PyVarObject*)(o))->ob_size)
-#endif
-
 /*
  * PyFile_* compatibility
  */
@@ -204,7 +193,7 @@ npy_PyFile_Dup(PyObject *file, char *mode)
         fclose(handle);
         return NULL;
     }
-    fseek(handle, pos, SEEK_SET);
+    npy_fseek(handle, pos, SEEK_SET);
     return handle;
 }
 
@@ -215,11 +204,11 @@ static NPY_INLINE int
 npy_PyFile_DupClose(PyObject *file, FILE* handle)
 {
     PyObject *ret;
-    long position;
-    position = ftell(handle);
+    Py_ssize_t position;
+    position = npy_ftell(handle);
     fclose(handle);
 
-    ret = PyObject_CallMethod(file, "seek", "li", position, 0);
+    ret = PyObject_CallMethod(file, "seek", NPY_SSIZE_T_PYFMT "i", position, 0);
     if (ret == NULL) {
         return -1;
     }
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 4f895e0a7..7fca7e220 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -16,6 +16,15 @@
         #define NPY_INLINE
 #endif
 
+/* Enable 64 bit file position support on win-amd64. Ticket #1660 */
+#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400)
+    #define npy_fseek _fseeki64
+    #define npy_ftell _ftelli64
+#else
+    #define npy_fseek fseek
+    #define npy_ftell ftell
+#endif
+
 /* enums for detected endianness */
 enum {
         NPY_CPU_UNKNOWN_ENDIAN,
@@ -48,9 +57,7 @@ typedef Py_uintptr_t npy_uintp;
     #define PY_SSIZE_T_MIN INT_MIN
   #endif
 #define NPY_SSIZE_T_PYFMT "i"
-#undef PyIndex_Check
 #define constchar const char
-#define PyIndex_Check(op) 0
 #else
 #define NPY_SSIZE_T_PYFMT "n"
 #define constchar char
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2d36f5ee1..a1000ae0b 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -671,7 +671,10 @@ def configuration(parent_package='',top_path=None):
 
     # This library is created for the build but it is not installed
     config.add_library('npysort',
-            sources = [join('src', 'npysort', 'sort.c.src')])
+            sources = [join('src', 'npysort', 'quicksort.c.src'),
+                       join('src', 'npysort', 'mergesort.c.src'),
+                       join('src', 'npysort', 'heapsort.c.src')])
+
 
     #######################################################################
     #                        multiarray module                            #
diff --git a/numpy/core/src/dummymodule.c b/numpy/core/src/dummymodule.c
index eca6edb54..1d48709e9 100644
--- a/numpy/core/src/dummymodule.c
+++ b/numpy/core/src/dummymodule.c
@@ -9,7 +9,7 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 
 #include <Python.h>
-#include <numpy/npy_3kcompat.h>
+#include <npy_pycompat.h>
 
 static struct PyMethodDef methods[] = {
     {NULL, NULL, 0, NULL}
diff --git a/numpy/core/src/multiarray/array_assign.c b/numpy/core/src/multiarray/array_assign.c
index 03f81f21c..9c1685c16 100644
--- a/numpy/core/src/multiarray/array_assign.c
+++ b/numpy/core/src/multiarray/array_assign.c
@@ -16,7 +16,7 @@
 #include <numpy/ndarraytypes.h>
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "shape.h"
 
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index 98c2b19ff..4f462cb05 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -15,7 +15,7 @@
 #include <numpy/ndarraytypes.h>
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "convert_datatype.h"
 #include "methods.h"
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index c59e6752d..57f8b9074 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -15,7 +15,7 @@
 #include <numpy/ndarraytypes.h>
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "convert_datatype.h"
 #include "methods.h"
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 3cba466dd..e8bc6b7b6 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -32,7 +32,7 @@ maintainer email:  oliphant.travis@ieee.org
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 
@@ -698,7 +698,7 @@ array_might_be_written(PyArrayObject *obj)
         "release -- see numpy.diagonal docs for details. The quick fix is\n"
         "to make an explicit copy (e.g., do arr.diagonal().copy()).";
     if (PyArray_FLAGS(obj) & NPY_ARRAY_WARN_ON_WRITE) {
-        if (DEPRECATE(msg) < 0) {
+        if (DEPRECATE_FUTUREWARNING(msg) < 0) {
             return -1;
         }
         /* Only warn once per array */
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index c446619b6..ad45cdb9e 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -7,7 +7,7 @@
 #define _MULTIARRAYMODULE
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 #include "numpy/npy_math.h"
 #include "numpy/halffloat.h"
 
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index f9a7dddc0..530edbb1a 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -9,7 +9,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "buffer.h"
 #include "numpyos.h"
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index d8d02ef0c..618a8714c 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -8,7 +8,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "number.h"
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 9281a6d84..f62b37b4d 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -6,7 +6,7 @@
 #include "numpy/arrayobject.h"
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "usertypes.h"
 
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index ec759b85b..914bc274d 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -9,7 +9,7 @@
 #include "numpy/arrayobject.h"
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "arraytypes.h"
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index 042aea07b..8b646e59b 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -9,7 +9,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "arrayobject.h"
 #include "mapping.h"
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 6fdb10bb2..de7468c51 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -9,7 +9,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "scalartypes.h"
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index c9cdffb23..ae29add6e 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -11,7 +11,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "ctors.h"
@@ -3000,41 +3000,21 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea
 
     if (num < 0) {
         int fail = 0;
-
-#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400)
-        /* Workaround Win64 fwrite() bug. Ticket #1660 */
-        start = (npy_intp )_ftelli64(fp);
-        if (start < 0) {
-            fail = 1;
-        }
-        if (_fseeki64(fp, 0, SEEK_END) < 0) {
-            fail = 1;
-        }
-        numbytes = (npy_intp) _ftelli64(fp);
-        if (numbytes < 0) {
-            fail = 1;
-        }
-        numbytes -= start;
-        if (_fseeki64(fp, start, SEEK_SET) < 0) {
-            fail = 1;
-        }
-#else
-        start = (npy_intp)ftell(fp);
+        start = (npy_intp) npy_ftell(fp);
         if (start < 0) {
             fail = 1;
         }
-        if (fseek(fp, 0, SEEK_END) < 0) {
+        if (npy_fseek(fp, 0, SEEK_END) < 0) {
             fail = 1;
         }
-        numbytes = (npy_intp) ftell(fp);
+        numbytes = (npy_intp) npy_ftell(fp);
         if (numbytes < 0) {
             fail = 1;
         }
         numbytes -= start;
-        if (fseek(fp, start, SEEK_SET) < 0) {
+        if (npy_fseek(fp, start, SEEK_SET) < 0) {
             fail = 1;
         }
-#endif
         if (fail) {
             PyErr_SetString(PyExc_IOError,
                             "could not seek in file");
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index 55fb3e7ed..6f70fc2ff 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -18,7 +18,7 @@
 #include <numpy/arrayobject.h>
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpy/arrayscalars.h"
 #include "methods.h"
diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c
index aa38594cb..331e10496 100644
--- a/numpy/core/src/multiarray/datetime_busday.c
+++ b/numpy/core/src/multiarray/datetime_busday.c
@@ -15,7 +15,7 @@
 #include <numpy/arrayobject.h>
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpy/arrayscalars.h"
 #include "lowlevel_strided_loops.h"
diff --git a/numpy/core/src/multiarray/datetime_busdaycal.c b/numpy/core/src/multiarray/datetime_busdaycal.c
index b5b74dd14..82eea9f20 100644
--- a/numpy/core/src/multiarray/datetime_busdaycal.c
+++ b/numpy/core/src/multiarray/datetime_busdaycal.c
@@ -16,7 +16,7 @@
 #include <numpy/arrayobject.h>
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpy/arrayscalars.h"
 #include "lowlevel_strided_loops.h"
diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c
index cfeabc783..32009bb79 100644
--- a/numpy/core/src/multiarray/datetime_strings.c
+++ b/numpy/core/src/multiarray/datetime_strings.c
@@ -17,7 +17,7 @@
 #include <numpy/arrayobject.h>
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpy/arrayscalars.h"
 #include "methods.h"
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 5090dc39c..596e9b837 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -11,7 +11,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "_datetime.h"
 #include "common.h"
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index b99dd5ff4..138275c4c 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -19,7 +19,7 @@
 #include <numpy/arrayobject.h>
 #include <numpy/npy_cpu.h>
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "convert_datatype.h"
 #include "_datetime.h"
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 03def17b4..2c59533f8 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -16,7 +16,7 @@
 #define _MULTIARRAYMODULE
 #include <numpy/arrayobject.h>
 #include <numpy/halffloat.h>
-#include <numpy/npy_3kcompat.h>
+#include <npy_pycompat.h>
 
 #include <ctype.h>
 
diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c
index 6e5455c01..faaf264e0 100644
--- a/numpy/core/src/multiarray/flagsobject.c
+++ b/numpy/core/src/multiarray/flagsobject.c
@@ -11,7 +11,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index 51caba240..32212ebc4 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -10,7 +10,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "scalartypes.h"
diff --git a/numpy/core/src/multiarray/hashdescr.c b/numpy/core/src/multiarray/hashdescr.c
index 71bc2adac..bff266415 100644
--- a/numpy/core/src/multiarray/hashdescr.c
+++ b/numpy/core/src/multiarray/hashdescr.c
@@ -6,7 +6,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "hashdescr.h"
 
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 977580d70..8c80c26f1 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -11,7 +11,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "arrayobject.h"
@@ -19,6 +19,7 @@
 #include "lowlevel_strided_loops.h"
 
 #include "item_selection.h"
+#include "npy_sort.h"
 
 /*NUMPY_API
  * Take
@@ -890,7 +891,7 @@ _new_argsort(PyArrayObject *op, int axis, NPY_SORTKIND which)
 static PyArrayObject *global_obj;
 
 static int
-qsortCompare (const void *a, const void *b)
+sortCompare (const void *a, const void *b)
 {
     return PyArray_DESCR(global_obj)->f->compare(a,b,global_obj);
 }
@@ -954,7 +955,9 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     PyArrayObject *ap = NULL, *store_arr = NULL;
     char *ip;
     int i, n, m, elsize, orign;
-    int axis_orig=axis;
+    int res = 0;
+    int axis_orig = axis;
+    int (*sort)(void *, size_t, size_t, npy_comparator);
 
     n = PyArray_NDIM(op);
     if ((n == 0) || (PyArray_SIZE(op) == 1)) {
@@ -975,13 +978,30 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     if (PyArray_DESCR(op)->f->sort[which] != NULL) {
         return _new_sort(op, axis, which);
     }
-    if ((which != NPY_QUICKSORT)
-        || PyArray_DESCR(op)->f->compare == NULL) {
+
+    if (PyArray_DESCR(op)->f->compare == NULL) {
         PyErr_SetString(PyExc_TypeError,
-                        "desired sort not supported for this type");
+                "type does not have compare function");
         return -1;
     }
 
+    switch (which) {
+        case NPY_QUICKSORT :
+            sort = npy_quicksort;
+            break;
+        case NPY_HEAPSORT :
+            sort = npy_heapsort;
+            break;
+        case NPY_MERGESORT :
+            sort = npy_mergesort;
+            break;
+        default:
+            PyErr_SetString(PyExc_TypeError,
+                    "requested sort kind is not supported");
+            goto fail;
+    }
+
+
     SWAPAXES2(op);
 
     ap = (PyArrayObject *)PyArray_FromAny((PyObject *)op,
@@ -1001,13 +1021,26 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     store_arr = global_obj;
     global_obj = ap;
     for (ip = PyArray_DATA(ap), i = 0; i < n; i++, ip += elsize*m) {
-        qsort(ip, m, elsize, qsortCompare);
+        res = sort(ip, m, elsize, sortCompare);
+        if (res < 0) {
+            break;
+        }
     }
     global_obj = store_arr;
 
     if (PyErr_Occurred()) {
         goto fail;
     }
+    else if (res == -NPY_ENOMEM) {
+        PyErr_NoMemory();
+        goto fail;
+    }
+    else if (res == -NPY_ECOMP) {
+        PyErr_SetString(PyExc_TypeError,
+                "sort comparison failed");
+        goto fail;
+    }
+
 
  finish:
     Py_DECREF(ap);  /* Should update op if needed */
@@ -1045,6 +1078,8 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     npy_intp i, j, n, m, orign;
     int argsort_elsize;
     char *store_ptr;
+    int res = 0;
+    int (*sort)(void *, size_t, size_t, npy_comparator);
 
     n = PyArray_NDIM(op);
     if ((n == 0) || (PyArray_SIZE(op) == 1)) {
@@ -1071,14 +1106,32 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
         return (PyObject *)ret;
     }
 
-    if ((which != NPY_QUICKSORT) || PyArray_DESCR(op2)->f->compare == NULL) {
+    if (PyArray_DESCR(op2)->f->compare == NULL) {
         PyErr_SetString(PyExc_TypeError,
-                        "requested sort not available for type");
+                "type does not have compare function");
         Py_DECREF(op2);
         op = NULL;
         goto fail;
     }
 
+    switch (which) {
+        case NPY_QUICKSORT :
+            sort = npy_quicksort;
+            break;
+        case NPY_HEAPSORT :
+            sort = npy_heapsort;
+            break;
+        case NPY_MERGESORT :
+            sort = npy_mergesort;
+            break;
+        default:
+            PyErr_SetString(PyExc_TypeError,
+                    "requested sort kind is not supported");
+            Py_DECREF(op2);
+            op = NULL;
+            goto fail;
+    }
+
     /* ap will contain the reference to op2 */
     SWAPAXES(ap, op2);
     op = (PyArrayObject *)PyArray_ContiguousFromAny((PyObject *)ap,
@@ -1109,11 +1162,27 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
         for (j = 0; j < m; j++) {
             ip[j] = j;
         }
-        qsort((char *)ip, m, sizeof(npy_intp), argsort_static_compare);
+        res = sort((char *)ip, m, sizeof(npy_intp), argsort_static_compare);
+        if (res < 0) {
+            break;
+        }
     }
     global_data = store_ptr;
     global_obj = store;
 
+    if (PyErr_Occurred()) {
+        goto fail;
+    }
+    else if (res == -NPY_ENOMEM) {
+        PyErr_NoMemory();
+        goto fail;
+    }
+    else if (res == -NPY_ECOMP) {
+        PyErr_SetString(PyExc_TypeError,
+                "sort comparison failed");
+        goto fail;
+    }
+
  finish:
     Py_DECREF(op);
     SWAPBACK(op, ret);
@@ -1123,7 +1192,6 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     Py_XDECREF(op);
     Py_XDECREF(ret);
     return NULL;
-
 }
 
 
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index af0d0d3c2..2e464a3db 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -9,7 +9,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "arrayobject.h"
 #include "iterators.h"
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index c57e1ccc1..78818b31d 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -151,7 +151,7 @@ static void
 #else
 
         /* unaligned copy and swap */
-        memcpy(dst, src, @elsize@);
+        memmove(dst, src, @elsize@);
 #  if @is_swap@ == 1
         @swap@@elsize@(dst);
 #  elif @is_swap@ == 2
@@ -239,7 +239,7 @@ _strided_to_strided(char *dst, npy_intp dst_stride,
                         NpyAuxData *NPY_UNUSED(data))
 {
     while (N > 0) {
-        memcpy(dst, src, src_itemsize);
+        memmove(dst, src, src_itemsize);
         dst += dst_stride;
         src += src_stride;
         --N;
@@ -255,7 +255,7 @@ _swap_strided_to_strided(char *dst, npy_intp dst_stride,
     char *a, *b, c;
 
     while (N > 0) {
-        memcpy(dst, src, src_itemsize);
+        memmove(dst, src, src_itemsize);
         /* general in-place swap */
         a = dst;
         b = dst + src_itemsize - 1;
@@ -281,7 +281,7 @@ _swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
     npy_intp itemsize_half = src_itemsize / 2;
 
     while (N > 0) {
-        memcpy(dst, src, src_itemsize);
+        memmove(dst, src, src_itemsize);
         /* general in-place swap */
         a = dst;
         b = dst + itemsize_half - 1;
@@ -312,7 +312,7 @@ _contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
                         npy_intp N, npy_intp src_itemsize,
                         NpyAuxData *NPY_UNUSED(data))
 {
-    memcpy(dst, src, src_itemsize*N);
+    memmove(dst, src, src_itemsize*N);
 }
 
 
@@ -802,7 +802,7 @@ static void
         src_value = *((_TYPE1 *)src);
 #  endif
 #else
-        memcpy(&src_value, src, sizeof(src_value));
+        memmove(&src_value, src, sizeof(src_value));
 #endif
 
 /* Do the cast */
@@ -838,7 +838,7 @@ static void
         *((_TYPE2 *)dst) = dst_value;
 #  endif
 #else
-        memcpy(dst, &dst_value, sizeof(dst_value));
+        memmove(dst, &dst_value, sizeof(dst_value));
 #endif
 
 #if @contig@
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 7abb3ff7f..cdefb9982 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -9,7 +9,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "iterators.h"
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 93d246671..c4147eff2 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -10,7 +10,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "ctors.h"
diff --git a/numpy/core/src/multiarray/multiarray_tests.c.src b/numpy/core/src/multiarray/multiarray_tests.c.src
index 15d4c871e..0768ec0ca 100644
--- a/numpy/core/src/multiarray/multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/multiarray_tests.c.src
@@ -2,7 +2,7 @@
 #include <Python.h>
 #include "numpy/arrayobject.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 /*
  * TODO:
@@ -375,6 +375,59 @@ clean_ax:
     return NULL;
 }
 
+/* PyDataMem_SetHook tests */
+static int malloc_free_counts[2];
+static PyDataMem_EventHookFunc *old_hook = NULL;
+static void *old_data;
+
+static void test_hook(void *old, void *new, size_t size, void *user_data)
+{
+    int* counters = (int *) user_data;
+    if (old == NULL) {
+        counters[0]++; /* malloc counter */
+    }
+    if (size == 0) {
+        counters[1]++; /* free counter */
+    }
+}
+
+static PyObject*
+test_pydatamem_seteventhook_start(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    malloc_free_counts[0] = malloc_free_counts[1] = 0;
+    old_hook = PyDataMem_SetEventHook(test_hook, (void *) malloc_free_counts, &old_data);
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
+static PyObject*
+test_pydatamem_seteventhook_end(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    PyDataMem_EventHookFunc *my_hook;
+    void *my_data;
+
+    my_hook = PyDataMem_SetEventHook(old_hook, old_data, &my_data);
+    if ((my_hook != test_hook) || (my_data != (void *) malloc_free_counts)) {
+        PyErr_SetString(PyExc_ValueError,
+                        "hook/data was not the expected test hook");
+        return NULL;
+    }
+
+    if (malloc_free_counts[0] == 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "malloc count is zero after test");
+        return NULL;
+    }
+    if (malloc_free_counts[1] == 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "free count is zero after test");
+        return NULL;
+    }
+
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
 static PyMethodDef Multiarray_TestsMethods[] = {
     {"test_neighborhood_iterator",
         test_neighborhood_iterator,
@@ -382,6 +435,12 @@ static PyMethodDef Multiarray_TestsMethods[] = {
     {"test_neighborhood_iterator_oob",
         test_neighborhood_iterator_oob,
         METH_VARARGS, NULL},
+    {"test_pydatamem_seteventhook_start",
+        test_pydatamem_seteventhook_start,
+        METH_NOARGS, NULL},
+    {"test_pydatamem_seteventhook_end",
+        test_pydatamem_seteventhook_end,
+        METH_NOARGS, NULL},
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 1ab7823ad..1b9be3cd9 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -27,7 +27,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
@@ -3430,6 +3430,102 @@ test_interrupt(PyObject *NPY_UNUSED(self), PyObject *args)
     return PyInt_FromLong(a);
 }
 
+/* malloc/free/realloc hook */
+NPY_NO_EXPORT PyDataMem_EventHookFunc *_PyDataMem_eventhook;
+NPY_NO_EXPORT void *_PyDataMem_eventhook_user_data;
+
+/*NUMPY_API
+ * Sets the allocation event hook for numpy array data.
+ * Takes a PyDataMem_EventHookFunc *, which has the signature:
+ *        void hook(void *old, void *new, size_t size, void *user_data).
+ *   Also takes a void *user_data, and void **old_data.
+ *
+ * Returns a pointer to the previous hook or NULL.  If old_data is
+ * non-NULL, the previous user_data pointer will be copied to it.
+ *
+ * If not NULL, hook will be called at the end of each PyDataMem_NEW/FREE/RENEW:
+ *   result = PyDataMem_NEW(size)        -> (*hook)(NULL, result, size, user_data)
+ *   PyDataMem_FREE(ptr)                 -> (*hook)(ptr, NULL, 0, user_data)
+ *   result = PyDataMem_RENEW(ptr, size) -> (*hook)(ptr, result, size, user_data)
+ *
+ * When the hook is called, the GIL will be held by the calling
+ * thread.  The hook should be written to be reentrant, if it performs
+ * operations that might cause new allocation events (such as the
+ * creation/descruction numpy objects, or creating/destroying Python
+ * objects which might cause a gc)
+ */
+NPY_NO_EXPORT PyDataMem_EventHookFunc *
+PyDataMem_SetEventHook(PyDataMem_EventHookFunc *newhook,
+                       void *user_data, void **old_data)
+{
+    PyGILState_STATE gilstate = PyGILState_Ensure();
+    PyDataMem_EventHookFunc *temp = _PyDataMem_eventhook;
+    _PyDataMem_eventhook = newhook;
+    if (old_data != NULL) {
+        *old_data = _PyDataMem_eventhook_user_data;
+    }
+    _PyDataMem_eventhook_user_data = user_data;
+    PyGILState_Release(gilstate);
+    return temp;
+}
+
+/*NUMPY_API
+ * Allocates memory for array data.
+ */
+NPY_NO_EXPORT void *
+PyDataMem_NEW(size_t size)
+{
+    void *result;
+
+    result = malloc(size);
+    if (_PyDataMem_eventhook != NULL) {
+        PyGILState_STATE gilstate = PyGILState_Ensure();
+        if (_PyDataMem_eventhook != NULL) {
+            (*_PyDataMem_eventhook)(NULL, result, size,
+                                    _PyDataMem_eventhook_user_data);
+        }
+        PyGILState_Release(gilstate);
+    }
+    return (char *)result;
+}
+
+/*NUMPY_API
+ * Free memory for array data.
+ */
+NPY_NO_EXPORT void
+PyDataMem_FREE(void *ptr)
+{
+    free(ptr);
+    if (_PyDataMem_eventhook != NULL) {
+        PyGILState_STATE gilstate = PyGILState_Ensure();
+        if (_PyDataMem_eventhook != NULL) {
+            (*_PyDataMem_eventhook)(ptr, NULL, 0,
+                                    _PyDataMem_eventhook_user_data);
+        }
+        PyGILState_Release(gilstate);
+    }
+}
+
+/*NUMPY_API
+ * Reallocate/resize memory for array data.
+ */
+NPY_NO_EXPORT void *
+PyDataMem_RENEW(void *ptr, size_t size)
+{
+    void *result;
+
+    result = realloc(ptr, size);
+    if (_PyDataMem_eventhook != NULL) {
+        PyGILState_STATE gilstate = PyGILState_Ensure();
+        if (_PyDataMem_eventhook != NULL) {
+            (*_PyDataMem_eventhook)(ptr, result, size,
+                                    _PyDataMem_eventhook_user_data);
+        }
+        PyGILState_Release(gilstate);
+    }
+    return (char *)result;
+}
+
 static struct PyMethodDef array_module_methods[] = {
     {"_get_ndarray_c_version",
         (PyCFunction)array__get_ndarray_c_version,
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index bc4d01ff9..1251baa6e 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -17,7 +17,7 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #include <numpy/arrayobject.h>
-#include <numpy/npy_3kcompat.h>
+#include <npy_pycompat.h>
 #include "convert_datatype.h"
 
 #include "lowlevel_strided_loops.h"
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index d6029e814..fd88cdbc7 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -13,11 +13,8 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #include <numpy/arrayobject.h>
-#include <numpy/npy_3kcompat.h>
-
 #include "npy_config.h"
-
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 typedef struct NewNpyArrayIterObject_tag NewNpyArrayIterObject;
 
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 4cbf80df9..c37a232f4 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -9,7 +9,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "number.h"
 
diff --git a/numpy/core/src/multiarray/numpymemoryview.c b/numpy/core/src/multiarray/numpymemoryview.c
index eda28e1af..991321a8c 100644
--- a/numpy/core/src/multiarray/numpymemoryview.c
+++ b/numpy/core/src/multiarray/numpymemoryview.c
@@ -16,7 +16,7 @@
 #include "numpy/arrayscalars.h"
 
 #include "npy_config.h"
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpymemoryview.h"
 
diff --git a/numpy/core/src/multiarray/numpyos.c b/numpy/core/src/multiarray/numpyos.c
index 9307a2473..cef170108 100644
--- a/numpy/core/src/multiarray/numpyos.c
+++ b/numpy/core/src/multiarray/numpyos.c
@@ -11,7 +11,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 /*
  * From the C99 standard, section 7.19.6: The exponent always contains at least
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index 2ba907f59..b067cd948 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -14,7 +14,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 static void
 _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype);
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index dd0e0158b..00c71f913 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -11,7 +11,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "ctors.h"
 #include "descriptor.h"
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 276937be1..e547071cc 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -13,7 +13,7 @@
 #include "numpy/halffloat.h"
 #include "numpy/arrayscalars.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "npy_config.h"
 #include "mapping.h"
diff --git a/numpy/core/src/multiarray/sequence.c b/numpy/core/src/multiarray/sequence.c
index 5a43f0565..e5f74251e 100644
--- a/numpy/core/src/multiarray/sequence.c
+++ b/numpy/core/src/multiarray/sequence.c
@@ -9,7 +9,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "common.h"
 #include "mapping.h"
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 3ed46eac4..067232632 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -11,7 +11,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "ctors.h"
 
diff --git a/numpy/core/src/multiarray/ucsnarrow.c b/numpy/core/src/multiarray/ucsnarrow.c
index 406b776bc..b0afdc6d9 100644
--- a/numpy/core/src/multiarray/ucsnarrow.c
+++ b/numpy/core/src/multiarray/ucsnarrow.c
@@ -12,7 +12,7 @@
 
 #include "npy_config.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 /*
  * Functions only needed on narrow builds of Python for converting back and
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index 92c8a1243..f69abcc6b 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -34,7 +34,7 @@ maintainer email:  oliphant.travis@ieee.org
 
 #include "common.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "usertypes.h"
 
diff --git a/numpy/core/src/npysort/heapsort.c.src b/numpy/core/src/npysort/heapsort.c.src
new file mode 100644
index 000000000..6a8c79f7a
--- /dev/null
+++ b/numpy/core/src/npysort/heapsort.c.src
@@ -0,0 +1,341 @@
+/* -*- c -*- */
+
+/*
+ * The purpose of this module is to add faster sort functions
+ * that are type-specific.  This is done by altering the
+ * function table for the builtin descriptors.
+ *
+ * These sorting functions are copied almost directly from numarray
+ * with a few modifications (complex comparisons compare the imaginary
+ * part if the real parts are equal, for example), and the names
+ * are changed.
+ *
+ * The original sorting code is due to Charles R. Harris who wrote
+ * it for numarray.
+ */
+
+/*
+ * Quick sort is usually the fastest, but the worst case scenario can
+ * be slower than the merge and heap sorts.  The merge sort requires
+ * extra memory and so for large arrays may not be useful.
+ *
+ * The merge sort is *stable*, meaning that equal components
+ * are unmoved from their entry versions, so it can be used to
+ * implement lexigraphic sorting on multiple keys.
+ *
+ * The heap sort is included for completeness.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <stdlib.h>
+#include "npy_sort.h"
+#include "npysort_common.h"
+
+#define NOT_USED NPY_UNUSED(unused)
+#define PYA_QS_STACK 100
+#define SMALL_QUICKSORT 15
+#define SMALL_MERGESORT 20
+#define SMALL_STRING 16
+
+
+/*
+ *****************************************************************************
+ **                            NUMERIC SORTS                                **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ *         LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ *         longlong, ulonglong, half, float, double, longdouble,
+ *         cfloat, cdouble, clongdouble#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ *         npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ *         npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ *         npy_cdouble, npy_clongdouble#
+ */
+
+int
+heapsort_@suff@(@type@ *start, npy_intp n, void *NOT_USED)
+{
+    @type@ tmp, *a;
+    npy_intp i,j,l;
+
+    /* The array needs to be offset by one for heapsort indexing */
+    a = start - 1;
+
+    for (l = n>>1; l > 0; --l) {
+        tmp = a[l];
+        for (i = l, j = l<<1; j <= n;) {
+            if (j < n && @TYPE@_LT(a[j], a[j+1])) {
+                j += 1;
+            }
+            if (@TYPE@_LT(tmp, a[j])) {
+                a[i] = a[j];
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        a[i] = tmp;
+    }
+
+    for (; n > 1;) {
+        tmp = a[n];
+        a[n] = a[1];
+        n -= 1;
+        for (i = 1, j = 2; j <= n;) {
+            if (j < n && @TYPE@_LT(a[j], a[j+1])) {
+                j++;
+            }
+            if (@TYPE@_LT(tmp, a[j])) {
+                a[i] = a[j];
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        a[i] = tmp;
+    }
+
+    return 0;
+}
+
+
+int
+aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, void *NOT_USED)
+{
+    npy_intp *a, i,j,l, tmp;
+    /* The arrays need to be offset by one for heapsort indexing */
+    a = tosort - 1;
+
+    for (l = n>>1; l > 0; --l) {
+        tmp = a[l];
+        for (i = l, j = l<<1; j <= n;) {
+            if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) {
+                j += 1;
+            }
+            if (@TYPE@_LT(v[tmp], v[a[j]])) {
+                a[i] = a[j];
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        a[i] = tmp;
+    }
+
+    for (; n > 1;) {
+        tmp = a[n];
+        a[n] = a[1];
+        n -= 1;
+        for (i = 1, j = 2; j <= n;) {
+            if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) {
+                j++;
+            }
+            if (@TYPE@_LT(v[tmp], v[a[j]])) {
+                a[i] = a[j];
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        a[i] = tmp;
+    }
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ **                             STRING SORTS                                **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = STRING, UNICODE#
+ * #suff = string, unicode#
+ * #type = npy_char, npy_ucs4#
+ */
+
+int
+heapsort_@suff@(@type@ *start, npy_intp n, PyArrayObject *arr)
+{
+    size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+    @type@ *tmp = malloc(PyArray_ITEMSIZE(arr));
+    @type@ *a = start - len;
+    npy_intp i, j, l;
+
+    for (l = n>>1; l > 0; --l) {
+        @TYPE@_COPY(tmp, a + l*len, len);
+        for (i = l, j = l<<1; j <= n;) {
+            if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len))
+                j += 1;
+            if (@TYPE@_LT(tmp, a + j*len, len)) {
+                @TYPE@_COPY(a + i*len, a + j*len, len);
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        @TYPE@_COPY(a + i*len, tmp, len);
+    }
+
+    for (; n > 1;) {
+        @TYPE@_COPY(tmp, a + n*len, len);
+        @TYPE@_COPY(a + n*len, a + len, len);
+        n -= 1;
+        for (i = 1, j = 2; j <= n;) {
+            if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len))
+                j++;
+            if (@TYPE@_LT(tmp, a + j*len, len)) {
+                @TYPE@_COPY(a + i*len, a + j*len, len);
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        @TYPE@_COPY(a + i*len, tmp, len);
+    }
+
+    free(tmp);
+    return 0;
+}
+
+
+int
+aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, PyArrayObject *arr)
+{
+    size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+    npy_intp *a, i,j,l, tmp;
+
+    /* The array needs to be offset by one for heapsort indexing */
+    a = tosort - 1;
+
+    for (l = n>>1; l > 0; --l) {
+        tmp = a[l];
+        for (i = l, j = l<<1; j <= n;) {
+            if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len))
+                j += 1;
+            if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) {
+                a[i] = a[j];
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        a[i] = tmp;
+    }
+
+    for (; n > 1;) {
+        tmp = a[n];
+        a[n] = a[1];
+        n -= 1;
+        for (i = 1, j = 2; j <= n;) {
+            if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len))
+                j++;
+            if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) {
+                a[i] = a[j];
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        a[i] = tmp;
+    }
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ **                             GENERIC SORT                                **
+ *****************************************************************************
+ */
+
+
+/*
+ * This sort has almost the same signature as libc qsort and is intended to
+ * provide a heapsort for array types that don't have type specific sorts.
+ * The difference in the signature is an error return, as it might be the
+ * case that a memory allocation fails.
+ */
+int
+npy_heapsort(void *base, size_t num, size_t size, npy_comparator cmp)
+{
+    char *tmp = (char *) malloc(size);
+    char *a = (char *) base - size;
+    size_t i, j, l;
+
+    if (tmp == NULL) {
+        return -NPY_ENOMEM;
+    }
+
+    for (l = num >> 1; l > 0; --l) {
+        GENERIC_COPY(tmp, a + l*size, size);
+        for (i = l, j = l << 1; j <= num;) {
+            if (j < num && GENERIC_LT(a + j*size, a + (j+1)*size, cmp))
+                j += 1;
+            if (GENERIC_LT(tmp, a + j*size, cmp)) {
+                GENERIC_COPY(a + i*size, a + j*size, size);
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        GENERIC_COPY(a + i*size, tmp, size);
+    }
+
+    for (; num > 1;) {
+        GENERIC_COPY(tmp, a + num*size, size);
+        GENERIC_COPY(a + num*size, a + size, size);
+        num -= 1;
+        for (i = 1, j = 2; j <= num;) {
+            if (j < num && GENERIC_LT(a + j*size, a + (j+1)*size, cmp))
+                j++;
+            if (GENERIC_LT(tmp, a + j*size, cmp)) {
+                GENERIC_COPY(a + i*size, a + j*size, size);
+                i = j;
+                j += j;
+            }
+            else {
+                break;
+            }
+        }
+        GENERIC_COPY(a + i*size, tmp, size);
+    }
+
+    free(tmp);
+    return 0;
+}
diff --git a/numpy/core/src/npysort/mergesort.c.src b/numpy/core/src/npysort/mergesort.c.src
new file mode 100644
index 000000000..1003b95d1
--- /dev/null
+++ b/numpy/core/src/npysort/mergesort.c.src
@@ -0,0 +1,433 @@
+/* -*- c -*- */
+
+/*
+ * The purpose of this module is to add faster sort functions
+ * that are type-specific.  This is done by altering the
+ * function table for the builtin descriptors.
+ *
+ * These sorting functions are copied almost directly from numarray
+ * with a few modifications (complex comparisons compare the imaginary
+ * part if the real parts are equal, for example), and the names
+ * are changed.
+ *
+ * The original sorting code is due to Charles R. Harris who wrote
+ * it for numarray.
+ */
+
+/*
+ * Quick sort is usually the fastest, but the worst case scenario can
+ * be slower than the merge and heap sorts.  The merge sort requires
+ * extra memory and so for large arrays may not be useful.
+ *
+ * The merge sort is *stable*, meaning that equal components
+ * are unmoved from their entry versions, so it can be used to
+ * implement lexigraphic sorting on multiple keys.
+ *
+ * The heap sort is included for completeness.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <stdlib.h>
+#include "npy_sort.h"
+#include "npysort_common.h"
+
+#define NOT_USED NPY_UNUSED(unused)
+#define PYA_QS_STACK 100
+#define SMALL_QUICKSORT 15
+#define SMALL_MERGESORT 20
+#define SMALL_STRING 16
+
+
+/*
+ *****************************************************************************
+ **                            NUMERIC SORTS                                **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ *         LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ *         longlong, ulonglong, half, float, double, longdouble,
+ *         cfloat, cdouble, clongdouble#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ *         npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ *         npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ *         npy_cdouble, npy_clongdouble#
+ */
+
+static void
+mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw)
+{
+    @type@ vp, *pi, *pj, *pk, *pm;
+
+    if (pr - pl > SMALL_MERGESORT) {
+        /* merge sort */
+        pm = pl + ((pr - pl) >> 1);
+        mergesort0_@suff@(pl, pm, pw);
+        mergesort0_@suff@(pm, pr, pw);
+        for (pi = pw, pj = pl; pj < pm;) {
+            *pi++ = *pj++;
+        }
+        pi = pw + (pm - pl);
+        pj = pw;
+        pk = pl;
+        while (pj < pi && pm < pr) {
+            if (@TYPE@_LT(*pm, *pj)) {
+                *pk++ = *pm++;
+            }
+            else {
+                *pk++ = *pj++;
+            }
+        }
+        while(pj < pi) {
+            *pk++ = *pj++;
+        }
+    }
+    else {
+        /* insertion sort */
+        for (pi = pl + 1; pi < pr; ++pi) {
+            vp = *pi;
+            pj = pi;
+            pk = pi - 1;
+            while (pj > pl && @TYPE@_LT(vp, *pk)) {
+                *pj-- = *pk--;
+            }
+            *pj = vp;
+        }
+    }
+}
+
+
+int
+mergesort_@suff@(@type@ *start, npy_intp num, void *NOT_USED)
+{
+    @type@ *pl, *pr, *pw;
+
+    pl = start;
+    pr = pl + num;
+    pw = (@type@ *) malloc((num/2) * sizeof(@type@));
+    if (pw == NULL) {
+        PyErr_NoMemory();
+        return -NPY_ENOMEM;
+    }
+    mergesort0_@suff@(pl, pr, pw);
+
+    free(pw);
+    return 0;
+}
+
+
+static void
+amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw)
+{
+    @type@ vp;
+    npy_intp vi, *pi, *pj, *pk, *pm;
+
+    if (pr - pl > SMALL_MERGESORT) {
+        /* merge sort */
+        pm = pl + ((pr - pl) >> 1);
+        amergesort0_@suff@(pl, pm, v, pw);
+        amergesort0_@suff@(pm, pr, v, pw);
+        for (pi = pw, pj = pl; pj < pm;) {
+            *pi++ = *pj++;
+        }
+        pi = pw + (pm - pl);
+        pj = pw;
+        pk = pl;
+        while (pj < pi && pm < pr) {
+            if (@TYPE@_LT(v[*pm], v[*pj])) {
+                *pk++ = *pm++;
+            }
+            else {
+                *pk++ = *pj++;
+            }
+        }
+        while(pj < pi) {
+            *pk++ = *pj++;
+        }
+    }
+    else {
+        /* insertion sort */
+        for (pi = pl + 1; pi < pr; ++pi) {
+            vi = *pi;
+            vp = v[vi];
+            pj = pi;
+            pk = pi - 1;
+            while (pj > pl && @TYPE@_LT(vp, v[*pk])) {
+                *pj-- = *pk--;
+            }
+            *pj = vi;
+        }
+    }
+}
+
+
+int
+amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, void *NOT_USED)
+{
+    npy_intp *pl, *pr, *pw;
+
+    pl = tosort;
+    pr = pl + num;
+    pw = (npy_intp *) malloc((num/2) * sizeof(npy_intp));
+    if (pw == NULL) {
+        PyErr_NoMemory();
+        return -NPY_ENOMEM;
+    }
+    amergesort0_@suff@(pl, pr, v, pw);
+    free(pw);
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ **                             STRING SORTS                                **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = STRING, UNICODE#
+ * #suff = string, unicode#
+ * #type = npy_char, npy_ucs4#
+ */
+
+static void
+mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len)
+{
+    @type@ *pi, *pj, *pk, *pm;
+
+    if ((size_t)(pr - pl) > SMALL_MERGESORT*len) {
+        /* merge sort */
+        pm = pl + (((pr - pl)/len) >> 1)*len;
+        mergesort0_@suff@(pl, pm, pw, vp, len);
+        mergesort0_@suff@(pm, pr, pw, vp, len);
+        @TYPE@_COPY(pw, pl, pm - pl);
+        pi = pw + (pm - pl);
+        pj = pw;
+        pk = pl;
+        while (pj < pi && pm < pr) {
+            if (@TYPE@_LT(pm, pj, len)) {
+                @TYPE@_COPY(pk, pm, len);
+                pm += len;
+                pk += len;
+            }
+            else {
+                @TYPE@_COPY(pk, pj, len);
+                pj += len;
+                pk += len;
+            }
+        }
+        @TYPE@_COPY(pk, pj, pi - pj);
+    }
+    else {
+        /* insertion sort */
+        for (pi = pl + len; pi < pr; pi += len) {
+            @TYPE@_COPY(vp, pi, len);
+            pj = pi;
+            pk = pi - len;
+            while (pj > pl && @TYPE@_LT(vp, pk, len)) {
+                @TYPE@_COPY(pj, pk, len);
+                pj -= len;
+                pk -= len;
+            }
+            @TYPE@_COPY(pj, vp, len);
+        }
+    }
+}
+
+
+int
+mergesort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr)
+{
+    size_t elsize = PyArray_ITEMSIZE(arr);
+    size_t len = elsize / sizeof(@type@);
+    @type@ *pl, *pr, *pw, *vp;
+    int err = 0;
+
+    pl = start;
+    pr = pl + num*len;
+    pw = (@type@ *) malloc((num/2) * elsize);
+    if (pw == NULL) {
+        PyErr_NoMemory();
+        err = -NPY_ENOMEM;
+        goto fail_0;
+    }
+    vp = (@type@ *) malloc(elsize);
+    if (vp == NULL) {
+        PyErr_NoMemory();
+        err = -NPY_ENOMEM;
+        goto fail_1;
+    }
+    mergesort0_@suff@(pl, pr, pw, vp, len);
+
+    free(vp);
+fail_1:
+    free(pw);
+fail_0:
+    return err;
+}
+
+
+static void
+amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, size_t len)
+{
+    @type@ *vp;
+    npy_intp vi, *pi, *pj, *pk, *pm;
+
+    if (pr - pl > SMALL_MERGESORT) {
+        /* merge sort */
+        pm = pl + ((pr - pl) >> 1);
+        amergesort0_@suff@(pl, pm, v, pw, len);
+        amergesort0_@suff@(pm, pr, v, pw, len);
+        for (pi = pw, pj = pl; pj < pm;) {
+            *pi++ = *pj++;
+        }
+        pi = pw + (pm - pl);
+        pj = pw;
+        pk = pl;
+        while (pj < pi && pm < pr) {
+            if (@TYPE@_LT(v + (*pm)*len, v + (*pj)*len, len)) {
+                *pk++ = *pm++;
+            }
+            else {
+                *pk++ = *pj++;
+            }
+        }
+        while (pj < pi) {
+            *pk++ = *pj++;
+        }
+    }
+    else {
+        /* insertion sort */
+        for (pi = pl + 1; pi < pr; ++pi) {
+            vi = *pi;
+            vp = v + vi*len;
+            pj = pi;
+            pk = pi - 1;
+            while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) {
+                *pj-- = *pk--;
+            }
+            *pj = vi;
+        }
+    }
+}
+
+
+int
+amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, PyArrayObject *arr)
+{
+    size_t elsize = PyArray_ITEMSIZE(arr);
+    size_t len = elsize / sizeof(@type@);
+    npy_intp *pl, *pr, *pw;
+
+    pl = tosort;
+    pr = pl + num;
+    pw = (npy_intp *) malloc((num/2) * sizeof(npy_intp));
+    if (pw == NULL) {
+        PyErr_NoMemory();
+        return -NPY_ENOMEM;
+    }
+    amergesort0_@suff@(pl, pr, v, pw, len);
+    free(pw);
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ **                             GENERIC SORT                                **
+ *****************************************************************************
+ */
+
+
+static void
+npy_mergesort0(char *pl, char *pr, char *pw, char *vp, size_t size, npy_comparator cmp)
+{
+    char *pi, *pj, *pk, *pm;
+
+    if ((size_t)(pr - pl) > SMALL_MERGESORT*size) {
+        /* merge sort */
+        pm = pl + (((pr - pl)/size) >> 1)*size;
+        npy_mergesort0(pl, pm, pw, vp, size, cmp);
+        npy_mergesort0(pm, pr, pw, vp, size, cmp);
+        GENERIC_COPY(pw, pl, pm - pl);
+        pi = pw + (pm - pl);
+        pj = pw;
+        pk = pl;
+        while (pj < pi && pm < pr) {
+            if (GENERIC_LT(pm, pj, cmp)) {
+                GENERIC_COPY(pk, pm, size);
+                pm += size;
+                pk += size;
+            }
+            else {
+                GENERIC_COPY(pk, pj, size);
+                pj += size;
+                pk += size;
+            }
+        }
+        GENERIC_COPY(pk, pj, pi - pj);
+    }
+    else {
+        /* insertion sort */
+        for (pi = pl + size; pi < pr; pi += size) {
+            GENERIC_COPY(vp, pi, size);
+            pj = pi;
+            pk = pi - size;
+            while (pj > pl && GENERIC_LT(vp, pk, cmp)) {
+                GENERIC_COPY(pj, pk, size);
+                pj -= size;
+                pk -= size;
+            }
+            GENERIC_COPY(pj, vp, size);
+        }
+    }
+}
+
+
+/*
+ * This sort has almost the same signature as libc qsort and is intended to
+ * provide a mergesort for array types that don't have type specific sorts.
+ * The difference in the signature is an error return, as it might be the
+ * case that a memory allocation fails.
+ */
+int
+npy_mergesort(void *base, size_t num, size_t size, npy_comparator cmp)
+{
+    char *pl, *pr, *pw, *vp;
+    int err = 0;
+
+    pl = (char *)base;
+    pr = pl + num*size;
+    pw = (char *) malloc((num/2) * size);
+    if (pw == NULL) {
+        err = -NPY_ENOMEM;
+        goto fail_0;
+    }
+    vp = (char *) malloc(size);
+    if (vp == NULL) {
+        err = -NPY_ENOMEM;
+        goto fail_1;
+    }
+    npy_mergesort0(pl, pr, pw, vp, size, cmp);
+
+    free(vp);
+fail_1:
+    free(pw);
+fail_0:
+    return err;
+}
diff --git a/numpy/core/src/npysort/npysort_common.h b/numpy/core/src/npysort/npysort_common.h
index bc1d278b0..318a80222 100644
--- a/numpy/core/src/npysort/npysort_common.h
+++ b/numpy/core/src/npysort/npysort_common.h
@@ -1,15 +1,9 @@
 #ifndef __NPY_SORT_COMMON_H__
 #define __NPY_SORT_COMMON_H__
 
-/*
-#include "Python.h"
-#include "numpy/npy_common.h"
-#include "numpy/npy_math.h"
-#include "npy_config.h"
-*/
+#include <stdlib.h>
 #include <numpy/ndarraytypes.h>
 
-
 /*
  *****************************************************************************
  **                        SWAP MACROS                                      **
@@ -155,14 +149,17 @@ npy_half_lt_nonan(npy_half h1, npy_half h2)
     if (h1&0x8000u) {
         if (h2&0x8000u) {
             return (h1&0x7fffu) > (h2&0x7fffu);
-        } else {
+        }
+        else {
             /* Signed zeros are equal, have to check for it */
             return (h1 != 0x8000u) || (h2 != 0x0000u);
         }
-    } else {
+    }
+    else {
         if (h2&0x8000u) {
             return 0;
-        } else {
+        }
+        else {
             return (h1&0x7fffu) < (h2&0x7fffu);
         }
     }
@@ -176,7 +173,8 @@ HALF_LT(npy_half a, npy_half b)
 
     if (npy_half_isnan(b)) {
         ret = !npy_half_isnan(a);
-    } else {
+    }
+    else {
         ret = !npy_half_isnan(a) && npy_half_lt_nonan(a, b);
     }
 
@@ -254,14 +252,6 @@ CLONGDOUBLE_LT(npy_clongdouble a, npy_clongdouble b)
 }
 
 
-/* The PyObject functions are stubs for later use */
-NPY_INLINE static int
-PyObject_LT(PyObject *pa, PyObject *pb)
-{
-    return 0;
-}
-
-
 NPY_INLINE static void
 STRING_COPY(char *s1, char *s2, size_t len)
 {
@@ -333,4 +323,29 @@ UNICODE_LT(npy_ucs4 *s1, npy_ucs4 *s2, size_t len)
     return ret;
 }
 
+
+NPY_INLINE static void
+GENERIC_COPY(char *a, char *b, size_t len)
+{
+    memcpy(a, b, len);
+}
+
+
+NPY_INLINE static void
+GENERIC_SWAP(char *a, char *b, size_t len)
+{
+    while(len--) {
+        const char t = *a;
+        *a++ = *b;
+        *b++ = t;
+    }
+}
+
+
+NPY_INLINE static int
+GENERIC_LT(char *a, char *b, int (*cmp)(const void *, const void *))
+{
+    return cmp(a, b) < 0;
+}
+
 #endif
diff --git a/numpy/core/src/npysort/quicksort.c.src b/numpy/core/src/npysort/quicksort.c.src
new file mode 100644
index 000000000..2225887b1
--- /dev/null
+++ b/numpy/core/src/npysort/quicksort.c.src
@@ -0,0 +1,363 @@
+/* -*- c -*- */
+
+/*
+ * The purpose of this module is to add faster sort functions
+ * that are type-specific.  This is done by altering the
+ * function table for the builtin descriptors.
+ *
+ * These sorting functions are copied almost directly from numarray
+ * with a few modifications (complex comparisons compare the imaginary
+ * part if the real parts are equal, for example), and the names
+ * are changed.
+ *
+ * The original sorting code is due to Charles R. Harris who wrote
+ * it for numarray.
+ */
+
+/*
+ * Quick sort is usually the fastest, but the worst case scenario can
+ * be slower than the merge and heap sorts.  The merge sort requires
+ * extra memory and so for large arrays may not be useful.
+ *
+ * The merge sort is *stable*, meaning that equal components
+ * are unmoved from their entry versions, so it can be used to
+ * implement lexigraphic sorting on multiple keys.
+ *
+ * The heap sort is included for completeness.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <stdlib.h>
+#include "npy_sort.h"
+#include "npysort_common.h"
+
+#define NOT_USED NPY_UNUSED(unused)
+#define PYA_QS_STACK 100
+#define SMALL_QUICKSORT 15
+#define SMALL_MERGESORT 20
+#define SMALL_STRING 16
+
+
+/*
+ *****************************************************************************
+ **                            NUMERIC SORTS                                **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ *         LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ *         longlong, ulonglong, half, float, double, longdouble,
+ *         cfloat, cdouble, clongdouble#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ *         npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ *         npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ *         npy_cdouble, npy_clongdouble#
+ */
+
+int
+quicksort_@suff@(@type@ *start, npy_intp num, void *NOT_USED)
+{
+    @type@ vp;
+    @type@ *pl = start;
+    @type@ *pr = start + num - 1;
+    @type@ *stack[PYA_QS_STACK];
+    @type@ **sptr = stack;
+    @type@ *pm, *pi, *pj, *pk;
+
+    for (;;) {
+        while ((pr - pl) > SMALL_QUICKSORT) {
+            /* quicksort partition */
+            pm = pl + ((pr - pl) >> 1);
+            if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
+            if (@TYPE@_LT(*pr, *pm)) @TYPE@_SWAP(*pr, *pm);
+            if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
+            vp = *pm;
+            pi = pl;
+            pj = pr - 1;
+            @TYPE@_SWAP(*pm, *pj);
+            for (;;) {
+                do ++pi; while (@TYPE@_LT(*pi, vp));
+                do --pj; while (@TYPE@_LT(vp, *pj));
+                if (pi >= pj) {
+                    break;
+                }
+                @TYPE@_SWAP(*pi,*pj);
+            }
+            pk = pr - 1;
+            @TYPE@_SWAP(*pi, *pk);
+            /* push largest partition on stack */
+            if (pi - pl < pr - pi) {
+                *sptr++ = pi + 1;
+                *sptr++ = pr;
+                pr = pi - 1;
+            }
+            else {
+                *sptr++ = pl;
+                *sptr++ = pi - 1;
+                pl = pi + 1;
+            }
+        }
+
+        /* insertion sort */
+        for (pi = pl + 1; pi <= pr; ++pi) {
+            vp = *pi;
+            pj = pi;
+            pk = pi - 1;
+            while (pj > pl && @TYPE@_LT(vp, *pk)) {
+                *pj-- = *pk--;
+            }
+            *pj = vp;
+        }
+        if (sptr == stack) {
+            break;
+        }
+        pr = *(--sptr);
+        pl = *(--sptr);
+    }
+
+    return 0;
+}
+
+
+int
+aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, void *NOT_USED)
+{
+    @type@ vp;
+    npy_intp *pl = tosort;
+    npy_intp *pr = tosort + num - 1;
+    npy_intp *stack[PYA_QS_STACK];
+    npy_intp **sptr = stack;
+    npy_intp *pm, *pi, *pj, *pk, vi;
+
+    for (;;) {
+        while ((pr - pl) > SMALL_QUICKSORT) {
+            /* quicksort partition */
+            pm = pl + ((pr - pl) >> 1);
+            if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl);
+            if (@TYPE@_LT(v[*pr],v[*pm])) INTP_SWAP(*pr, *pm);
+            if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl);
+            vp = v[*pm];
+            pi = pl;
+            pj = pr - 1;
+            INTP_SWAP(*pm, *pj);
+            for (;;) {
+                do ++pi; while (@TYPE@_LT(v[*pi], vp));
+                do --pj; while (@TYPE@_LT(vp, v[*pj]));
+                if (pi >= pj) {
+                    break;
+                }
+                INTP_SWAP(*pi, *pj);
+            }
+            pk = pr - 1;
+            INTP_SWAP(*pi,*pk);
+            /* push largest partition on stack */
+            if (pi - pl < pr - pi) {
+                *sptr++ = pi + 1;
+                *sptr++ = pr;
+                pr = pi - 1;
+            }
+            else {
+                *sptr++ = pl;
+                *sptr++ = pi - 1;
+                pl = pi + 1;
+            }
+        }
+
+        /* insertion sort */
+        for (pi = pl + 1; pi <= pr; ++pi) {
+            vi = *pi;
+            vp = v[vi];
+            pj = pi;
+            pk = pi - 1;
+            while (pj > pl && @TYPE@_LT(vp, v[*pk])) {
+                *pj-- = *pk--;
+            }
+            *pj = vi;
+        }
+        if (sptr == stack) {
+            break;
+        }
+        pr = *(--sptr);
+        pl = *(--sptr);
+    }
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ **                             STRING SORTS                                **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = STRING, UNICODE#
+ * #suff = string, unicode#
+ * #type = npy_char, npy_ucs4#
+ */
+
+int
+quicksort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr)
+{
+    const size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+    @type@ *vp = malloc(PyArray_ITEMSIZE(arr));
+    @type@ *pl = start;
+    @type@ *pr = start + (num - 1)*len;
+    @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk;
+
+    for (;;) {
+        while ((size_t)(pr - pl) > SMALL_QUICKSORT*len) {
+            /* quicksort partition */
+            pm = pl + (((pr - pl)/len) >> 1)*len;
+            if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
+            if (@TYPE@_LT(pr, pm, len)) @TYPE@_SWAP(pr, pm, len);
+            if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
+            @TYPE@_COPY(vp, pm, len);
+            pi = pl;
+            pj = pr - len;
+            @TYPE@_SWAP(pm, pj, len);
+            for (;;) {
+                do pi += len; while (@TYPE@_LT(pi, vp, len));
+                do pj -= len; while (@TYPE@_LT(vp, pj, len));
+                if (pi >= pj) {
+                    break;
+                }
+                @TYPE@_SWAP(pi, pj, len);
+            }
+            pk = pr - len;
+            @TYPE@_SWAP(pi, pk, len);
+            /* push largest partition on stack */
+            if (pi - pl < pr - pi) {
+                *sptr++ = pi + len;
+                *sptr++ = pr;
+                pr = pi - len;
+            }
+            else {
+                *sptr++ = pl;
+                *sptr++ = pi - len;
+                pl = pi + len;
+            }
+        }
+
+        /* insertion sort */
+        for (pi = pl + len; pi <= pr; pi += len) {
+            @TYPE@_COPY(vp, pi, len);
+            pj = pi;
+            pk = pi - len;
+            while (pj > pl && @TYPE@_LT(vp, pk, len)) {
+                @TYPE@_COPY(pj, pk, len);
+                pj -= len;
+                pk -= len;
+            }
+            @TYPE@_COPY(pj, vp, len);
+        }
+        if (sptr == stack) {
+            break;
+        }
+        pr = *(--sptr);
+        pl = *(--sptr);
+    }
+
+    free(vp);
+    return 0;
+}
+
+
+int
+aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, PyArrayObject *arr)
+{
+    size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+    @type@ *vp;
+    npy_intp *pl = tosort;
+    npy_intp *pr = tosort + num - 1;
+    npy_intp *stack[PYA_QS_STACK];
+    npy_intp **sptr=stack;
+    npy_intp *pm, *pi, *pj, *pk, vi;
+
+    for (;;) {
+        while ((pr - pl) > SMALL_QUICKSORT) {
+            /* quicksort partition */
+            pm = pl + ((pr - pl) >> 1);
+            if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
+            if (@TYPE@_LT(v + (*pr)*len, v + (*pm)*len, len)) INTP_SWAP(*pr, *pm);
+            if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
+            vp = v + (*pm)*len;
+            pi = pl;
+            pj = pr - 1;
+            INTP_SWAP(*pm,*pj);
+            for (;;) {
+                do ++pi; while (@TYPE@_LT(v + (*pi)*len, vp, len));
+                do --pj; while (@TYPE@_LT(vp, v + (*pj)*len, len));
+                if (pi >= pj) {
+                    break;
+                }
+                INTP_SWAP(*pi,*pj);
+            }
+            pk = pr - 1;
+            INTP_SWAP(*pi,*pk);
+            /* push largest partition on stack */
+            if (pi - pl < pr - pi) {
+                *sptr++ = pi + 1;
+                *sptr++ = pr;
+                pr = pi - 1;
+            }
+            else {
+                *sptr++ = pl;
+                *sptr++ = pi - 1;
+                pl = pi + 1;
+            }
+        }
+
+        /* insertion sort */
+        for (pi = pl + 1; pi <= pr; ++pi) {
+            vi = *pi;
+            vp = v + vi*len;
+            pj = pi;
+            pk = pi - 1;
+            while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) {
+                *pj-- = *pk--;
+            }
+            *pj = vi;
+        }
+        if (sptr == stack) {
+            break;
+        }
+        pr = *(--sptr);
+        pl = *(--sptr);
+    }
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ **                             GENERIC SORT                                **
+ *****************************************************************************
+ */
+
+
+/*
+ * This sort has almost the same signature as libc qsort and is intended to
+ * supply an error return for compatibility with the other generic sort
+ * kinds.
+ */
+int
+npy_quicksort(void *base, size_t num, size_t size, npy_comparator cmp)
+{
+    qsort(base, num, size, cmp);
+    return 0;
+}
diff --git a/numpy/core/src/npysort/sort.c.src b/numpy/core/src/npysort/sort.c.src
deleted file mode 100644
index 618259f96..000000000
--- a/numpy/core/src/npysort/sort.c.src
+++ /dev/null
@@ -1,794 +0,0 @@
-/* -*- c -*- */
-
-/*
- * The purpose of this module is to add faster sort functions
- * that are type-specific.  This is done by altering the
- * function table for the builtin descriptors.
- *
- * These sorting functions are copied almost directly from numarray
- * with a few modifications (complex comparisons compare the imaginary
- * part if the real parts are equal, for example), and the names
- * are changed.
- *
- * The original sorting code is due to Charles R. Harris who wrote
- * it for numarray.
- */
-
-/*
- * Quick sort is usually the fastest, but the worst case scenario can
- * be slower than the merge and heap sorts.  The merge sort requires
- * extra memory and so for large arrays may not be useful.
- *
- * The merge sort is *stable*, meaning that equal components
- * are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
- *
- * The heap sort is included for completeness.
- */
-
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include <stdlib.h>
-#include "npy_sort.h"
-#include "npysort_common.h"
-
-#define NOT_USED NPY_UNUSED(unused)
-#define PYA_QS_STACK 100
-#define SMALL_QUICKSORT 15
-#define SMALL_MERGESORT 20
-#define SMALL_STRING 16
-
-
-/*
- *****************************************************************************
- **                            NUMERIC SORTS                                **
- *****************************************************************************
- */
-
-
-/**begin repeat
- *
- * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
- *         LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *         CFLOAT, CDOUBLE, CLONGDOUBLE#
- * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
- *         longlong, ulonglong, half, float, double, longdouble,
- *         cfloat, cdouble, clongdouble#
- * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
- *         npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
- *         npy_cdouble, npy_clongdouble#
- */
-
-int
-quicksort_@suff@(@type@ *start, npy_intp num, void *NOT_USED)
-{
-    @type@ *pl = start;
-    @type@ *pr = start + num - 1;
-    @type@ vp;
-    @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk;
-
-    for (;;) {
-        while ((pr - pl) > SMALL_QUICKSORT) {
-            /* quicksort partition */
-            pm = pl + ((pr - pl) >> 1);
-            if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
-            if (@TYPE@_LT(*pr, *pm)) @TYPE@_SWAP(*pr, *pm);
-            if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
-            vp = *pm;
-            pi = pl;
-            pj = pr - 1;
-            @TYPE@_SWAP(*pm, *pj);
-            for (;;) {
-                do ++pi; while (@TYPE@_LT(*pi, vp));
-                do --pj; while (@TYPE@_LT(vp, *pj));
-                if (pi >= pj) {
-                    break;
-                }
-                @TYPE@_SWAP(*pi,*pj);
-            }
-            pk = pr - 1;
-            @TYPE@_SWAP(*pi, *pk);
-            /* push largest partition on stack */
-            if (pi - pl < pr - pi) {
-                *sptr++ = pi + 1;
-                *sptr++ = pr;
-                pr = pi - 1;
-            }
-            else {
-                *sptr++ = pl;
-                *sptr++ = pi - 1;
-                pl = pi + 1;
-            }
-        }
-
-        /* insertion sort */
-        for (pi = pl + 1; pi <= pr; ++pi) {
-            vp = *pi;
-            pj = pi;
-            pk = pi - 1;
-            while (pj > pl && @TYPE@_LT(vp, *pk)) {
-                *pj-- = *pk--;
-            }
-            *pj = vp;
-        }
-        if (sptr == stack) {
-            break;
-        }
-        pr = *(--sptr);
-        pl = *(--sptr);
-    }
-
-    return 0;
-}
-
-int
-aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, void *NOT_USED)
-{
-    @type@ vp;
-    npy_intp *pl, *pr;
-    npy_intp *stack[PYA_QS_STACK], **sptr=stack, *pm, *pi, *pj, *pk, vi;
-
-    pl = tosort;
-    pr = tosort + num - 1;
-
-    for (;;) {
-        while ((pr - pl) > SMALL_QUICKSORT) {
-            /* quicksort partition */
-            pm = pl + ((pr - pl) >> 1);
-            if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm,*pl);
-            if (@TYPE@_LT(v[*pr],v[*pm])) INTP_SWAP(*pr,*pm);
-            if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm,*pl);
-            vp = v[*pm];
-            pi = pl;
-            pj = pr - 1;
-            INTP_SWAP(*pm,*pj);
-            for (;;) {
-                do ++pi; while (@TYPE@_LT(v[*pi],vp));
-                do --pj; while (@TYPE@_LT(vp,v[*pj]));
-                if (pi >= pj) {
-                    break;
-                }
-                INTP_SWAP(*pi,*pj);
-            }
-            pk = pr - 1;
-            INTP_SWAP(*pi,*pk);
-            /* push largest partition on stack */
-            if (pi - pl < pr - pi) {
-                *sptr++ = pi + 1;
-                *sptr++ = pr;
-                pr = pi - 1;
-            }
-            else {
-                *sptr++ = pl;
-                *sptr++ = pi - 1;
-                pl = pi + 1;
-            }
-        }
-
-        /* insertion sort */
-        for (pi = pl + 1; pi <= pr; ++pi) {
-            vi = *pi;
-            vp = v[vi];
-            pj = pi;
-            pk = pi - 1;
-            while (pj > pl && @TYPE@_LT(vp, v[*pk])) {
-                *pj-- = *pk--;
-            }
-            *pj = vi;
-        }
-        if (sptr == stack) {
-            break;
-        }
-        pr = *(--sptr);
-        pl = *(--sptr);
-    }
-
-    return 0;
-}
-
-
-int
-heapsort_@suff@(@type@ *start, npy_intp n, void *NOT_USED)
-{
-    @type@ tmp, *a;
-    npy_intp i,j,l;
-
-    /* The array needs to be offset by one for heapsort indexing */
-    a = start - 1;
-
-    for (l = n>>1; l > 0; --l) {
-        tmp = a[l];
-        for (i = l, j = l<<1; j <= n;) {
-            if (j < n && @TYPE@_LT(a[j], a[j+1])) {
-                j += 1;
-            }
-            if (@TYPE@_LT(tmp, a[j])) {
-                a[i] = a[j];
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        a[i] = tmp;
-    }
-
-    for (; n > 1;) {
-        tmp = a[n];
-        a[n] = a[1];
-        n -= 1;
-        for (i = 1, j = 2; j <= n;) {
-            if (j < n && @TYPE@_LT(a[j], a[j+1])) {
-                j++;
-            }
-            if (@TYPE@_LT(tmp, a[j])) {
-                a[i] = a[j];
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        a[i] = tmp;
-    }
-
-    return 0;
-}
-
-int
-aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, void *NOT_USED)
-{
-    npy_intp *a, i,j,l, tmp;
-    /* The arrays need to be offset by one for heapsort indexing */
-    a = tosort - 1;
-
-    for (l = n>>1; l > 0; --l) {
-        tmp = a[l];
-        for (i = l, j = l<<1; j <= n;) {
-            if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) {
-                j += 1;
-            }
-            if (@TYPE@_LT(v[tmp], v[a[j]])) {
-                a[i] = a[j];
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        a[i] = tmp;
-    }
-
-    for (; n > 1;) {
-        tmp = a[n];
-        a[n] = a[1];
-        n -= 1;
-        for (i = 1, j = 2; j <= n;) {
-            if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) {
-                j++;
-            }
-            if (@TYPE@_LT(v[tmp], v[a[j]])) {
-                a[i] = a[j];
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        a[i] = tmp;
-    }
-
-    return 0;
-}
-
-static void
-mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw)
-{
-    @type@ vp, *pi, *pj, *pk, *pm;
-
-    if (pr - pl > SMALL_MERGESORT) {
-        /* merge sort */
-        pm = pl + ((pr - pl) >> 1);
-        mergesort0_@suff@(pl, pm, pw);
-        mergesort0_@suff@(pm, pr, pw);
-        for (pi = pw, pj = pl; pj < pm;) {
-            *pi++ = *pj++;
-        }
-        pj = pw;
-        pk = pl;
-        while (pj < pi && pm < pr) {
-            if (@TYPE@_LT(*pm, *pj)) {
-                *pk = *pm++;
-            }
-            else {
-                *pk = *pj++;
-            }
-            pk++;
-        }
-        while(pj < pi) {
-            *pk++ = *pj++;
-        }
-    }
-    else {
-        /* insertion sort */
-        for (pi = pl + 1; pi < pr; ++pi) {
-            vp = *pi;
-            pj = pi;
-            pk = pi -1;
-            while (pj > pl && @TYPE@_LT(vp, *pk)) {
-                *pj-- = *pk--;
-            }
-            *pj = vp;
-        }
-    }
-}
-
-int
-mergesort_@suff@(@type@ *start, npy_intp num, void *NOT_USED)
-{
-    @type@ *pl, *pr, *pw;
-
-    pl = start;
-    pr = pl + num;
-    pw = (@type@ *) PyDataMem_NEW((num/2)*sizeof(@type@));
-    if (!pw) {
-        PyErr_NoMemory();
-        return -1;
-    }
-    mergesort0_@suff@(pl, pr, pw);
-
-    PyDataMem_FREE(pw);
-    return 0;
-}
-
-static void
-amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw)
-{
-    @type@ vp;
-    npy_intp vi, *pi, *pj, *pk, *pm;
-
-    if (pr - pl > SMALL_MERGESORT) {
-        /* merge sort */
-        pm = pl + ((pr - pl + 1)>>1);
-        amergesort0_@suff@(pl, pm-1, v, pw);
-        amergesort0_@suff@(pm, pr, v, pw);
-        for (pi = pw, pj = pl; pj < pm; ++pi, ++pj) {
-            *pi = *pj;
-        }
-        for (pk = pw, pm = pl; pk < pi && pj <= pr; ++pm) {
-            if (@TYPE@_LT(v[*pj], v[*pk])) {
-                *pm = *pj;
-                ++pj;
-            }
-            else {
-                *pm = *pk;
-                ++pk;
-            }
-        }
-        for (; pk < pi; ++pm, ++pk) {
-            *pm = *pk;
-        }
-    }
-    else {
-        /* insertion sort */
-        for (pi = pl + 1; pi <= pr; ++pi) {
-            vi = *pi;
-            vp = v[vi];
-            for (pj = pi, pk = pi - 1; pj > pl && @TYPE@_LT(vp, v[*pk]); --pj, --pk) {
-                *pj = *pk;
-            }
-            *pj = vi;
-        }
-    }
-}
-
-int
-amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, void *NOT_USED)
-{
-    npy_intp *pl, *pr, *pw;
-
-    pl = tosort; pr = pl + num - 1;
-    pw = PyDimMem_NEW((1+num/2));
-
-    if (!pw) {
-        PyErr_NoMemory();
-        return -1;
-    }
-
-    amergesort0_@suff@(pl, pr, v, pw);
-    PyDimMem_FREE(pw);
-
-    return 0;
-}
-
-
-/**end repeat**/
-
-/*
- *****************************************************************************
- **                             STRING SORTS                                **
- *****************************************************************************
- */
-
-
-/**begin repeat
- *
- * #TYPE = STRING, UNICODE#
- * #suff = string, unicode#
- * #type = npy_char, npy_ucs4#
- */
-
-static void
-mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len)
-{
-    @type@ *pi, *pj, *pk, *pm;
-
-    if ((size_t)(pr - pl) > SMALL_MERGESORT*len) {
-        /* merge sort */
-        pm = pl + (((pr - pl)/len) >> 1)*len;
-        mergesort0_@suff@(pl, pm, pw, vp, len);
-        mergesort0_@suff@(pm, pr, pw, vp, len);
-        @TYPE@_COPY(pw, pl, pm - pl);
-        pi = pw + (pm - pl);
-        pj = pw;
-        pk = pl;
-        while (pj < pi && pm < pr) {
-            if (@TYPE@_LT(pm, pj, len)) {
-                @TYPE@_COPY(pk, pm, len);
-                pm += len;
-            }
-            else {
-                @TYPE@_COPY(pk, pj, len);
-                pj += len;
-            }
-            pk += len;
-        }
-        @TYPE@_COPY(pk, pj, pi - pj);
-    }
-    else {
-        /* insertion sort */
-        for (pi = pl + len; pi < pr; pi += len) {
-            @TYPE@_COPY(vp, pi, len);
-            pj = pi;
-            pk = pi - len;
-            while (pj > pl && @TYPE@_LT(vp, pk, len)) {
-                @TYPE@_COPY(pj, pk, len);
-                pj -= len;
-                pk -= len;
-            }
-            @TYPE@_COPY(pj, vp, len);
-        }
-    }
-}
-
-int
-mergesort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr)
-{
-    const size_t elsize = PyArray_ITEMSIZE(arr);
-    const size_t len = elsize / sizeof(@type@);
-    @type@ *pl, *pr, *pw, *vp;
-    int err = 0;
-
-    pl = start;
-    pr = pl + num*len;
-    pw = (@type@ *) PyDataMem_NEW((num/2)*elsize);
-    if (!pw) {
-        PyErr_NoMemory();
-        err = -1;
-        goto fail_0;
-    }
-    vp = (@type@ *) PyDataMem_NEW(elsize);
-    if (!vp) {
-        PyErr_NoMemory();
-        err = -1;
-        goto fail_1;
-    }
-    mergesort0_@suff@(pl, pr, pw, vp, len);
-
-    PyDataMem_FREE(vp);
-fail_1:
-    PyDataMem_FREE(pw);
-fail_0:
-    return err;
-}
-
-int
-quicksort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr)
-{
-    const size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
-    @type@ *vp = malloc(PyArray_ITEMSIZE(arr));
-    @type@ *pl = start;
-    @type@ *pr = start + (num - 1)*len;
-    @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk;
-
-    for (;;) {
-        while ((size_t)(pr - pl) > SMALL_QUICKSORT*len) {
-            /* quicksort partition */
-            pm = pl + (((pr - pl)/len) >> 1)*len;
-            if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
-            if (@TYPE@_LT(pr, pm, len)) @TYPE@_SWAP(pr, pm, len);
-            if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
-            @TYPE@_COPY(vp, pm, len);
-            pi = pl;
-            pj = pr - len;
-            @TYPE@_SWAP(pm, pj, len);
-            for (;;) {
-                do pi += len; while (@TYPE@_LT(pi, vp, len));
-                do pj -= len; while (@TYPE@_LT(vp, pj, len));
-                if (pi >= pj) {
-                    break;
-                }
-                @TYPE@_SWAP(pi, pj, len);
-            }
-            pk = pr - len;
-            @TYPE@_SWAP(pi, pk, len);
-            /* push largest partition on stack */
-            if (pi - pl < pr - pi) {
-                *sptr++ = pi + len;
-                *sptr++ = pr;
-                pr = pi - len;
-            }
-            else {
-                *sptr++ = pl;
-                *sptr++ = pi - len;
-                pl = pi + len;
-            }
-        }
-
-        /* insertion sort */
-        for (pi = pl + len; pi <= pr; pi += len) {
-            @TYPE@_COPY(vp, pi, len);
-            pj = pi;
-            pk = pi - len;
-            while (pj > pl && @TYPE@_LT(vp, pk, len)) {
-                @TYPE@_COPY(pj, pk, len);
-                pj -= len;
-                pk -= len;
-            }
-            @TYPE@_COPY(pj, vp, len);
-        }
-        if (sptr == stack) {
-            break;
-        }
-        pr = *(--sptr);
-        pl = *(--sptr);
-    }
-
-    free(vp);
-    return 0;
-}
-
-
-int
-heapsort_@suff@(@type@ *start, npy_intp n, PyArrayObject *arr)
-{
-    size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
-    @type@ *tmp = malloc(PyArray_ITEMSIZE(arr));
-    @type@ *a = start - len;
-    npy_intp i,j,l;
-
-    for (l = n>>1; l > 0; --l) {
-        @TYPE@_COPY(tmp, a + l*len, len);
-        for (i = l, j = l<<1; j <= n;) {
-            if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len))
-                j += 1;
-            if (@TYPE@_LT(tmp, a + j*len, len)) {
-                @TYPE@_COPY(a + i*len, a + j*len, len);
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        @TYPE@_COPY(a + i*len, tmp, len);
-    }
-
-    for (; n > 1;) {
-        @TYPE@_COPY(tmp, a + n*len, len);
-        @TYPE@_COPY(a + n*len, a + len, len);
-        n -= 1;
-        for (i = 1, j = 2; j <= n;) {
-            if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len))
-                j++;
-            if (@TYPE@_LT(tmp, a + j*len, len)) {
-                @TYPE@_COPY(a + i*len, a + j*len, len);
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        @TYPE@_COPY(a + i*len, tmp, len);
-    }
-
-    free(tmp);
-    return 0;
-}
-
-
-int
-aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, PyArrayObject *arr)
-{
-    size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
-    npy_intp *a, i,j,l, tmp;
-
-    /* The array needs to be offset by one for heapsort indexing */
-    a = tosort - 1;
-
-    for (l = n>>1; l > 0; --l) {
-        tmp = a[l];
-        for (i = l, j = l<<1; j <= n;) {
-            if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len))
-                j += 1;
-            if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) {
-                a[i] = a[j];
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        a[i] = tmp;
-    }
-
-    for (; n > 1;) {
-        tmp = a[n];
-        a[n] = a[1];
-        n -= 1;
-        for (i = 1, j = 2; j <= n;) {
-            if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len))
-                j++;
-            if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) {
-                a[i] = a[j];
-                i = j;
-                j += j;
-            }
-            else {
-                break;
-            }
-        }
-        a[i] = tmp;
-    }
-
-    return 0;
-}
-
-
-int
-aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, PyArrayObject *arr)
-{
-    size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
-    @type@ *vp;
-    npy_intp *pl = tosort;
-    npy_intp *pr = tosort + num - 1;
-    npy_intp *stack[PYA_QS_STACK];
-    npy_intp **sptr=stack;
-    npy_intp *pm, *pi, *pj, *pk, vi;
-
-    for (;;) {
-        while ((pr - pl) > SMALL_QUICKSORT) {
-            /* quicksort partition */
-            pm = pl + ((pr - pl) >> 1);
-            if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
-            if (@TYPE@_LT(v + (*pr)*len, v + (*pm)*len, len)) INTP_SWAP(*pr, *pm);
-            if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
-            vp = v + (*pm)*len;
-            pi = pl;
-            pj = pr - 1;
-            INTP_SWAP(*pm,*pj);
-            for (;;) {
-                do ++pi; while (@TYPE@_LT(v + (*pi)*len, vp, len));
-                do --pj; while (@TYPE@_LT(vp, v + (*pj)*len, len));
-                if (pi >= pj) {
-                    break;
-                }
-                INTP_SWAP(*pi,*pj);
-            }
-            pk = pr - 1;
-            INTP_SWAP(*pi,*pk);
-            /* push largest partition on stack */
-            if (pi - pl < pr - pi) {
-                *sptr++ = pi + 1;
-                *sptr++ = pr;
-                pr = pi - 1;
-            }
-            else {
-                *sptr++ = pl;
-                *sptr++ = pi - 1;
-                pl = pi + 1;
-            }
-        }
-
-        /* insertion sort */
-        for (pi = pl + 1; pi <= pr; ++pi) {
-            vi = *pi;
-            vp = v + vi*len;
-            pj = pi;
-            pk = pi - 1;
-            while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) {
-                *pj-- = *pk--;
-            }
-            *pj = vi;
-        }
-        if (sptr == stack) {
-            break;
-        }
-        pr = *(--sptr);
-        pl = *(--sptr);
-    }
-
-    return 0;
-}
-
-
-static void
-amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, int len)
-{
-    @type@ *vp;
-    npy_intp vi, *pi, *pj, *pk, *pm;
-
-    if (pr - pl > SMALL_MERGESORT) {
-        /* merge sort */
-        pm = pl + ((pr - pl) >> 1);
-        amergesort0_@suff@(pl, pm, v, pw, len);
-        amergesort0_@suff@(pm, pr, v, pw, len);
-        for (pi = pw, pj = pl; pj < pm;) {
-            *pi++ = *pj++;
-        }
-        pj = pw;
-        pk = pl;
-        while (pj < pi && pm < pr) {
-            if (@TYPE@_LT(v + (*pm)*len, v + (*pj)*len, len)) {
-                *pk = *pm++;
-            } else {
-                *pk = *pj++;
-            }
-            pk++;
-        }
-        while (pj < pi) {
-            *pk++ = *pj++;
-        }
-    } else {
-        /* insertion sort */
-        for (pi = pl + 1; pi < pr; ++pi) {
-            vi = *pi;
-            vp = v + vi*len;
-            pj = pi;
-            pk = pi -1;
-            while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) {
-                *pj-- = *pk--;
-            }
-            *pj = vi;
-        }
-    }
-}
-
-
-int
-amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, PyArrayObject *arr)
-{
-    const size_t elsize = PyArray_ITEMSIZE(arr);
-    const size_t len = elsize / sizeof(@type@);
-    npy_intp *pl, *pr, *pw;
-
-    pl = tosort;
-    pr = pl + num;
-    pw = PyDimMem_NEW(num/2);
-    if (!pw) {
-        PyErr_NoMemory();
-        return -1;
-    }
-    amergesort0_@suff@(pl, pr, v, pw, len);
-
-    PyDimMem_FREE(pw);
-    return 0;
-}
-/**end repeat**/
diff --git a/numpy/core/src/private/npy_pycompat.h b/numpy/core/src/private/npy_pycompat.h
new file mode 100644
index 000000000..6b49c7476
--- /dev/null
+++ b/numpy/core/src/private/npy_pycompat.h
@@ -0,0 +1,24 @@
+#ifndef _NPY_PYCOMPAT_H_
+#define _NPY_PYCOMPAT_H_
+
+#include "numpy/npy_3kcompat.h"
+
+/*
+ * Accessing items of ob_base
+ */
+
+#if (PY_VERSION_HEX < 0x02060000)
+#define Py_TYPE(o)    (((PyObject*)(o))->ob_type)
+#define Py_REFCNT(o)  (((PyObject*)(o))->ob_refcnt)
+#define Py_SIZE(o)    (((PyVarObject*)(o))->ob_size)
+#endif
+
+/*
+ * PyIndex_Check
+ */
+#if (PY_VERSION_HEX < 0x02050000)
+#undef PyIndex_Check
+#define PyIndex_Check(o)     0
+#endif
+
+#endif /* _NPY_COMPAT_H_ */
diff --git a/numpy/core/src/private/npy_sort.h b/numpy/core/src/private/npy_sort.h
index 94d831f86..120e7efc2 100644
--- a/numpy/core/src/private/npy_sort.h
+++ b/numpy/core/src/private/npy_sort.h
@@ -6,6 +6,10 @@
 #include <numpy/npy_common.h>
 #include <numpy/ndarraytypes.h>
 
+#define NPY_ENOMEM 1
+#define NPY_ECOMP 2
+
+typedef int (*npy_comparator)(const void *, const void *);
 
 int quicksort_bool(npy_bool *vec, npy_intp cnt, void *null);
 int heapsort_bool(npy_bool *vec, npy_intp cnt, void *null);
@@ -166,4 +170,9 @@ int aquicksort_unicode(npy_ucs4 *vec, npy_intp *ind, npy_intp cnt, PyArrayObject
 int aheapsort_unicode(npy_ucs4 *vec, npy_intp *ind, npy_intp cnt, PyArrayObject *arr);
 int amergesort_unicode(npy_ucs4 *vec, npy_intp *ind, npy_intp cnt, PyArrayObject *arr);
 
+
+int npy_quicksort(void *base, size_t num, size_t size, npy_comparator cmp);
+int npy_heapsort(void *base, size_t num, size_t size, npy_comparator cmp);
+int npy_mergesort(void *base, size_t num, size_t size, npy_comparator cmp);
+
 #endif
diff --git a/numpy/core/src/scalarmathmodule.c.src b/numpy/core/src/scalarmathmodule.c.src
index d9f7abc4e..4b56c1510 100644
--- a/numpy/core/src/scalarmathmodule.c.src
+++ b/numpy/core/src/scalarmathmodule.c.src
@@ -13,7 +13,7 @@
 #include "numpy/ufuncobject.h"
 #include "numpy/arrayscalars.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpy/halffloat.h"
 
diff --git a/numpy/core/src/umath/funcs.inc.src b/numpy/core/src/umath/funcs.inc.src
index f3fefcfc5..6d46efc44 100644
--- a/numpy/core/src/umath/funcs.inc.src
+++ b/numpy/core/src/umath/funcs.inc.src
@@ -7,7 +7,7 @@
  */
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 
 /*
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 7eede45d6..040486a32 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -16,7 +16,7 @@
 #include "numpy/npy_math.h"
 #include "numpy/halffloat.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "ufunc_object.h"
 
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 3e0306bd2..2712f0474 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -34,7 +34,7 @@
 #define NO_IMPORT_ARRAY
 #endif
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpy/arrayobject.h"
 #include "numpy/ufuncobject.h"
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index da1b2e811..372edc07a 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -19,7 +19,7 @@
 #define NO_IMPORT_ARRAY
 #endif
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "numpy/ufuncobject.h"
 #include "ufunc_type_resolution.h"
diff --git a/numpy/core/src/umath/umath_tests.c.src b/numpy/core/src/umath/umath_tests.c.src
index 43583d119..0ba13f15f 100644
--- a/numpy/core/src/umath/umath_tests.c.src
+++ b/numpy/core/src/umath/umath_tests.c.src
@@ -11,7 +11,7 @@
 #include "numpy/arrayobject.h"
 #include "numpy/ufuncobject.h"
 
-#include "numpy/npy_3kcompat.h"
+#include "npy_pycompat.h"
 
 #include "npy_config.h"
 
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 82ead8958..3427800a2 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -10,7 +10,8 @@ from numpy.testing.utils import WarningManager
 from numpy.compat import asbytes, getexception, strchar
 from test_print import in_foreign_locale
 from numpy.core.multiarray_tests import (
-        test_neighborhood_iterator, test_neighborhood_iterator_oob
+        test_neighborhood_iterator, test_neighborhood_iterator_oob,
+        test_pydatamem_seteventhook_start, test_pydatamem_seteventhook_end,
         )
 from numpy.testing import (
         TestCase, run_module_suite, assert_, assert_raises,
@@ -490,7 +491,7 @@ class TestMethods(TestCase):
         # of sorted items must be greater than ~50 to check the actual
         # algorithm because quick and merge sort fall over to insertion
         # sort for small arrays.
-        a = np.arange(100)
+        a = np.arange(101)
         b = a[::-1].copy()
         for kind in ['q','m','h'] :
             msg = "scalar sort, kind=%s" % kind
@@ -526,7 +527,7 @@ class TestMethods(TestCase):
 
         # test string sorts.
         s = 'aaaaaaaa'
-        a = np.array([s + chr(i) for i in range(100)])
+        a = np.array([s + chr(i) for i in range(101)])
         b = a[::-1].copy()
         for kind in ['q', 'm', 'h'] :
             msg = "string sort, kind=%s" % kind
@@ -537,9 +538,9 @@ class TestMethods(TestCase):
             c.sort(kind=kind)
             assert_equal(c, a, msg)
 
-        # test unicode sort.
+        # test unicode sorts.
         s = 'aaaaaaaa'
-        a = np.array([s + chr(i) for i in range(100)], dtype=np.unicode)
+        a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode)
         b = a[::-1].copy()
         for kind in ['q', 'm', 'h'] :
             msg = "unicode sort, kind=%s" % kind
@@ -550,7 +551,55 @@ class TestMethods(TestCase):
             c.sort(kind=kind)
             assert_equal(c, a, msg)
 
-        # todo, check object array sorts.
+        # test object array sorts.
+        a = np.empty((101,), dtype=np.object)
+        a[:] = range(101)
+        b = a[::-1]
+        for kind in ['q', 'h', 'm'] :
+            msg = "object sort, kind=%s" % kind
+            c = a.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
+            c = b.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
+
+        # test record array sorts.
+        dt = np.dtype([('f',float),('i',int)])
+        a = array([(i,i) for i in range(101)], dtype = dt)
+        b = a[::-1]
+        for kind in ['q', 'h', 'm'] :
+            msg = "object sort, kind=%s" % kind
+            c = a.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
+            c = b.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
+
+        # test datetime64 sorts.
+        a = np.arange(0, 101, dtype='datetime64[D]')
+        b = a[::-1]
+        for kind in ['q', 'h', 'm'] :
+            msg = "datetime64 sort, kind=%s" % kind
+            c = a.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
+            c = b.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
+
+        # test timedelta64 sorts.
+        a = np.arange(0, 101, dtype='timedelta64[D]')
+        b = a[::-1]
+        for kind in ['q', 'h', 'm'] :
+            msg = "timedelta64 sort, kind=%s" % kind
+            c = a.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
+            c = b.copy();
+            c.sort(kind=kind)
+            assert_equal(c, a, msg)
 
         # check axis handling. This should be the same for all type
         # specific sorts, so we only check it for one type and one kind
@@ -566,10 +615,6 @@ class TestMethods(TestCase):
         d = a.copy()
         d.sort()
         assert_equal(d, c, "test sort with default axis")
-        # using None is known fail at this point
-        # d = a.copy()
-        # d.sort(axis=None)
-        #assert_equal(d, c, "test sort with axis=None")
 
 
     def test_sort_order(self):
@@ -612,7 +657,7 @@ class TestMethods(TestCase):
         # of sorted items must be greater than ~50 to check the actual
         # algorithm because quick and merge sort fall over to insertion
         # sort for small arrays.
-        a = np.arange(100)
+        a = np.arange(101)
         b = a[::-1].copy()
         for kind in ['q','m','h'] :
             msg = "scalar argsort, kind=%s" % kind
@@ -636,10 +681,10 @@ class TestMethods(TestCase):
 
         # test string argsorts.
         s = 'aaaaaaaa'
-        a = np.array([s + chr(i) for i in range(100)])
+        a = np.array([s + chr(i) for i in range(101)])
         b = a[::-1].copy()
-        r = arange(100)
-        rr = r[::-1].copy()
+        r = np.arange(101)
+        rr = r[::-1]
         for kind in ['q', 'm', 'h'] :
             msg = "string argsort, kind=%s" % kind
             assert_equal(a.copy().argsort(kind=kind), r, msg)
@@ -647,16 +692,57 @@ class TestMethods(TestCase):
 
         # test unicode argsorts.
         s = 'aaaaaaaa'
-        a = np.array([s + chr(i) for i in range(100)], dtype=np.unicode)
-        b = a[::-1].copy()
-        r = arange(100)
-        rr = r[::-1].copy()
+        a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode)
+        b = a[::-1]
+        r = np.arange(101)
+        rr = r[::-1]
         for kind in ['q', 'm', 'h'] :
             msg = "unicode argsort, kind=%s" % kind
             assert_equal(a.copy().argsort(kind=kind), r, msg)
             assert_equal(b.copy().argsort(kind=kind), rr, msg)
 
-        # todo, check object array argsorts.
+        # test object array argsorts.
+        a = np.empty((101,), dtype=np.object)
+        a[:] = range(101)
+        b = a[::-1]
+        r = np.arange(101)
+        rr = r[::-1]
+        for kind in ['q', 'm', 'h'] :
+            msg = "object argsort, kind=%s" % kind
+            assert_equal(a.copy().argsort(kind=kind), r, msg)
+            assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+        # test structured array argsorts.
+        dt = np.dtype([('f',float),('i',int)])
+        a = array([(i,i) for i in range(101)], dtype = dt)
+        b = a[::-1]
+        r = np.arange(101)
+        rr = r[::-1]
+        for kind in ['q', 'm', 'h'] :
+            msg = "structured array argsort, kind=%s" % kind
+            assert_equal(a.copy().argsort(kind=kind), r, msg)
+            assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+        # test datetime64 argsorts.
+        a = np.arange(0, 101, dtype='datetime64[D]')
+        b = a[::-1]
+        r = np.arange(101)
+        rr = r[::-1]
+        for kind in ['q', 'h', 'm'] :
+            msg = "datetime64 argsort, kind=%s" % kind
+            assert_equal(a.copy().argsort(kind=kind), r, msg)
+            assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+        # test timedelta64 argsorts.
+        a = np.arange(0, 101, dtype='timedelta64[D]')
+        b = a[::-1]
+        r = np.arange(101)
+        rr = r[::-1]
+        for kind in ['q', 'h', 'm'] :
+            msg = "timedelta64 argsort, kind=%s" % kind
+            assert_equal(a.copy().argsort(kind=kind), r, msg)
+            assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
 
         # check axis handling. This should be the same for all type
         # specific argsorts, so we only check it for one type and one kind
@@ -827,19 +913,19 @@ class TestMethods(TestCase):
         # All the different functions raise a warning, but not an error, and
         # 'a' is not modified:
         assert_equal(collect_warning_types(a.diagonal().__setitem__, 0, 10),
-                     [DeprecationWarning])
+                     [FutureWarning])
         assert_equal(a, np.arange(9).reshape(3, 3))
         assert_equal(collect_warning_types(np.diagonal(a).__setitem__, 0, 10),
-                     [DeprecationWarning])
+                     [FutureWarning])
         assert_equal(a, np.arange(9).reshape(3, 3))
         assert_equal(collect_warning_types(np.diag(a).__setitem__, 0, 10),
-                     [DeprecationWarning])
+                     [FutureWarning])
         assert_equal(a, np.arange(9).reshape(3, 3))
         # Views also warn
         d = np.diagonal(a)
         d_view = d.view()
         assert_equal(collect_warning_types(d_view.__setitem__, 0, 10),
-                     [DeprecationWarning])
+                     [FutureWarning])
         # But the write goes through:
         assert_equal(d[0], 10)
         # Only one warning per call to diagonal, though (even if there are
@@ -861,21 +947,21 @@ class TestMethods(TestCase):
                 buf_or_memoryview[0] = "x"
         assert_equal(collect_warning_types(get_data_and_write,
                                            lambda d: d.data),
-                     [DeprecationWarning])
+                     [FutureWarning])
         if hasattr(np, "getbuffer"):
             assert_equal(collect_warning_types(get_data_and_write,
                                                np.getbuffer),
-                         [DeprecationWarning])
+                         [FutureWarning])
         # PEP 3118:
         if have_memoryview:
             assert_equal(collect_warning_types(get_data_and_write, memoryview),
-                         [DeprecationWarning])
+                         [FutureWarning])
         # Void dtypes can give us a read-write buffer, but only in Python 2:
         import sys
         if sys.version_info[0] < 3:
             aV = np.empty((3, 3), dtype="V10")
             assert_equal(collect_warning_types(aV.diagonal().item, 0),
-                         [DeprecationWarning])
+                         [FutureWarning])
             # XX it seems that direct indexing of a void object returns a void
             # scalar, which ignores not just WARN_ON_WRITE but even WRITEABLE.
             # i.e. in this:
@@ -887,7 +973,7 @@ class TestMethods(TestCase):
         # __array_interface also lets a data pointer get away from us
         log = collect_warning_types(getattr, a.diagonal(),
                                     "__array_interface__")
-        assert_equal(log, [DeprecationWarning])
+        assert_equal(log, [FutureWarning])
         # ctypeslib goes via __array_interface__:
         try:
             # may not exist in python 2.4:
@@ -896,10 +982,10 @@ class TestMethods(TestCase):
             pass
         else:
             log = collect_warning_types(np.ctypeslib.as_ctypes, a.diagonal())
-            assert_equal(log, [DeprecationWarning])
+            assert_equal(log, [FutureWarning])
         # __array_struct__
         log = collect_warning_types(getattr, a.diagonal(), "__array_struct__")
-        assert_equal(log, [DeprecationWarning])
+        assert_equal(log, [FutureWarning])
 
         # Make sure that our recommendation to silence the warning by copying
         # the array actually works:
@@ -932,7 +1018,7 @@ class TestMethods(TestCase):
         ro_diag.flags.writeable = False
         assert_equal(collect_warning_types(getattr, ro_diag,
                                            "__array_struct__"), [])
-        
+
 
     def test_ravel(self):
         a = np.array([[0,1],[2,3]])
@@ -1092,7 +1178,7 @@ class TestFancyIndexing(TestCase):
         x = xorig.copy()
         x[m3] = 10
         assert_array_equal(x, array([[1,10,3,4],[5,6,7,8]]))
-                           
+
 
 class TestStringCompare(TestCase):
     def test_string(self):
@@ -1700,7 +1786,7 @@ class TestFlat(TestCase):
         self.b = a[::2,::2]
         self.a0 = a0
         self.b0 = a0[::2,::2]
-        
+
     def test_contiguous(self):
         testpassed = False
         try:
@@ -1734,7 +1820,7 @@ class TestFlat(TestCase):
         assert d.flags.updateifcopy is False
         assert e.flags.updateifcopy is False
         assert f.flags.updateifcopy is True
-        assert f.base is self.b0    
+        assert f.base is self.b0
 
 class TestResize(TestCase):
     def test_basic(self):
@@ -2649,6 +2735,16 @@ def test_flat_element_deletion():
     except:
         raise AssertionError
 
+class TestMemEventHook(TestCase):
+    def test_mem_seteventhook(self):
+        # The actual tests are within the C code in
+        # multiarray/multiarray_tests.c.src
+        test_pydatamem_seteventhook_start()
+        # force an allocation and free of a numpy array
+        a = np.zeros(10)
+        del a
+        test_pydatamem_seteventhook_end()
+
 
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/core/tests/test_multiarray_assignment.py b/numpy/core/tests/test_multiarray_assignment.py
new file mode 100644
index 000000000..29ddcf906
--- /dev/null
+++ b/numpy/core/tests/test_multiarray_assignment.py
@@ -0,0 +1,78 @@
+import numpy as np
+from numpy.testing import TestCase
+
+ndims = 2
+size = 10
+shape = tuple([size] * ndims)
+
+
+def _indices_for_nelems(nelems):
+    """Returns slices of length nelems, from start onwards, in direction sign."""
+
+    if nelems == 0:
+        return [size // 2]  # int index
+
+    res = []
+    for step in (1, 2):
+        for sign in (-1, 1):
+            start = size // 2 - nelems * step * sign // 2
+            stop = start + nelems * step * sign
+            res.append(slice(start, stop, step * sign))
+
+    return res
+
+
+def _indices_for_axis():
+    """Returns (src, dst) pairs of indices."""
+
+    res = []
+    for nelems in (0, 2, 3):
+        ind = _indices_for_nelems(nelems)
+
+        # no itertools.product available in Py2.4
+        res.extend([(a, b) for a in ind for b in ind])  # all assignments of size "nelems"
+
+    return res
+
+
+def _indices(ndims):
+    """Returns ((axis0_src, axis0_dst), (axis1_src, axis1_dst), ... ) index pairs."""
+
+    ind = _indices_for_axis()
+
+    # no itertools.product available in Py2.4
+
+    res = [[]]
+    for i in xrange(ndims):
+        newres = []
+        for elem in ind:
+            for others in res:
+                newres.append([elem] + others)
+        res = newres
+
+    return res
+
+
+def _check_assignment(srcidx, dstidx):
+    """Check assignment arr[dstidx] = arr[srcidx] works."""
+
+    arr = np.arange(np.product(shape)).reshape(shape)
+
+    cpy = arr.copy()
+
+    cpy[dstidx] = arr[srcidx]
+    arr[dstidx] = arr[srcidx]
+
+    assert np.all(arr == cpy), 'assigning arr[%s] = arr[%s]' % (dstidx, srcidx)
+
+
+def test_overlapping_assignments():
+    """Test automatically generated assignments which overlap in memory."""
+
+    inds = _indices(ndims)
+
+    for ind in inds:
+        srcidx = tuple([a[0] for a in ind])
+        dstidx = tuple([a[1] for a in ind])
+
+        yield _check_assignment, srcidx, dstidx
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 6aed19ca6..892b1fae6 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1,10 +1,17 @@
 import sys
+import platform
 
 from numpy.testing import *
 import numpy.core.umath as ncu
 import numpy as np
 
 
+def on_powerpc():
+    """ True if we are running on a Power PC platform."""
+    return platform.processor() == 'powerpc' or \
+           platform.machine().startswith('ppc')
+
+
 class _FilterInvalids(object):
     def setUp(self):
         self.olderr = np.seterr(invalid='ignore')
@@ -1118,7 +1125,8 @@ def test_nextafter():
 def test_nextafterf():
     return _test_nextafter(np.float32)
 
-@dec.knownfailureif(sys.platform == 'win32', "Long double support buggy on win32")
+@dec.knownfailureif(sys.platform == 'win32' or on_powerpc(),
+            "Long double support buggy on win32 and PPC, ticket 1664.")
 def test_nextafterl():
     return _test_nextafter(np.longdouble)
 
@@ -1143,7 +1151,8 @@ def test_spacing():
 def test_spacingf():
     return _test_spacing(np.float32)
 
-@dec.knownfailureif(sys.platform == 'win32', "Long double support buggy on win32")
+@dec.knownfailureif(sys.platform == 'win32' or on_powerpc(),
+            "Long double support buggy on win32 and PPC, ticket 1664.")
 def test_spacingl():
     return _test_spacing(np.longdouble)
 
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 7921b4116..2f2a4bc57 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -203,13 +203,17 @@ class LineSplitter(object):
             self._handyman = _handyman
     #
     def _delimited_splitter(self, line):
-        line = line.split(self.comments)[0].strip(asbytes(" \r\n"))
+        if self.comments is not None:
+            line = line.split(self.comments)[0]
+        line = line.strip(asbytes(" \r\n"))
         if not line:
             return []
         return line.split(self.delimiter)
     #
     def _fixedwidth_splitter(self, line):
-        line = line.split(self.comments)[0].strip(asbytes("\r\n"))
+        if self.comments is not None:
+            line = line.split(self.comments)[0]
+        line = line.strip(asbytes("\r\n"))
         if not line:
             return []
         fixed = self.delimiter
@@ -217,7 +221,8 @@ class LineSplitter(object):
         return [line[s] for s in slices]
     #
     def _variablewidth_splitter(self, line):
-        line = line.split(self.comments)[0]
+        if self.comments is not None:
+            line = line.split(self.comments)[0]
         if not line:
             return []
         slices = self.delimiter
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index b1e891f77..b63003f80 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -1291,7 +1291,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
 
     """
     # Py3 data conversions to bytes, for convenience
-    comments = asbytes(comments)
+    if comments is not None:
+        comments = asbytes(comments)
     if isinstance(delimiter, unicode):
         delimiter = asbytes(delimiter)
     if isinstance(missing, unicode):
diff --git a/numpy/lib/src/_compiled_base.c b/numpy/lib/src/_compiled_base.c
index c31ee5cd8..d389b7f8e 100644
--- a/numpy/lib/src/_compiled_base.c
+++ b/numpy/lib/src/_compiled_base.c
@@ -7,6 +7,10 @@
 #include "numpy/ufuncobject.h"
 #include "string.h"
 
+#if (PY_VERSION_HEX < 0x02060000)
+#define Py_TYPE(o)    (((PyObject*)(o))->ob_type)
+#endif
+
 static npy_intp
 incr_slot_(double x, double *bins, npy_intp lbins)
 {
@@ -670,7 +674,10 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
 
     /* only pre-calculate slopes if there are relatively few of them. */
     if (lenxp <= lenx) {
-        slopes = (double *) PyDataMem_NEW((lenxp - 1)*sizeof(double));
+        slopes = (double *) PyArray_malloc((lenxp - 1)*sizeof(double));
+        if (! slopes) {
+            goto fail;
+        }
         NPY_BEGIN_ALLOW_THREADS;
         for (i = 0; i < lenxp - 1; i++) {
             slopes[i] = (dy[i + 1] - dy[i])/(dx[i + 1] - dx[i]);
@@ -692,7 +699,7 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
             }
         }
         NPY_END_ALLOW_THREADS;
-        PyDataMem_FREE(slopes);
+        PyArray_free(slopes);
     }
     else {
         NPY_BEGIN_ALLOW_THREADS;
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index e40c155a4..b0d2ca7c3 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -8,31 +8,62 @@ from numpy.lib.arraysetops import *
 
 import warnings
 
-class TestAso(TestCase):
-    def test_unique( self ):
-        a = np.array( [5, 7, 1, 2, 1, 5, 7] )
-
-        ec = np.array( [1, 2, 5, 7] )
-        c = unique( a )
-        assert_array_equal( c, ec )
-
-        vals, indices = unique( a, return_index=True )
+class TestSetOps(TestCase):
 
 
-        ed = np.array( [2, 3, 0, 1] )
-        assert_array_equal(vals, ec)
-        assert_array_equal(indices, ed)
-
-        vals, ind0, ind1 = unique( a, return_index=True,
-                                     return_inverse=True )
-
+    def test_unique( self ):
 
-        ee = np.array( [2, 3, 0, 1, 0, 2, 3] )
-        assert_array_equal(vals, ec)
-        assert_array_equal(ind0, ed)
-        assert_array_equal(ind1, ee)
+        def check_all(a, b, i1, i2, dt):
+            msg = "check values failed for type '%s'" % dt
+            v = unique(a)
+            assert_array_equal(v, b, msg)
+
+            msg = "check indexes failed for type '%s'" % dt
+            v, j = unique(a, 1, 0)
+            assert_array_equal(v, b, msg)
+            assert_array_equal(j, i1, msg)
+
+            msg = "check reverse indexes failed for type '%s'" % dt
+            v, j = unique(a, 0, 1)
+            assert_array_equal(v, b, msg)
+            assert_array_equal(j, i2, msg)
+
+            msg = "check with all indexes failed for type '%s'" % dt
+            v, j1, j2 = unique(a, 1, 1)
+            assert_array_equal(v, b, msg)
+            assert_array_equal(j1, i1, msg)
+            assert_array_equal(j2, i2, msg)
+
+        a = [5, 7, 1, 2, 1, 5, 7]*10
+        b = [1, 2, 5, 7]
+        i1 = [2, 3, 0, 1]
+        i2 = [2, 3, 0, 1, 0, 2, 3]*10
+
+        # test for numeric arrays
+        types = []
+        types.extend(np.typecodes['AllInteger'])
+        types.extend(np.typecodes['AllFloat'])
+        types.append('datetime64[D]')
+        types.append('timedelta64[D]')
+        for dt in types:
+            aa = np.array(a, dt)
+            bb = np.array(b, dt)
+            check_all(aa, bb, i1, i2, dt)
+
+        # test for object arrays
+        dt = 'O'
+        aa = np.empty(len(a), dt)
+        aa[:] = a
+        bb = np.empty(len(b), dt)
+        bb[:] = b
+        check_all(aa, bb, i1, i2, dt)
+
+        # test for structured arrays
+        dt = [('', 'i'), ('', 'i')]
+        aa = np.array(zip(a,a), dt)
+        bb = np.array(zip(b,b), dt)
+        check_all(aa, bb, i1, i2, dt)
 
-        assert_array_equal([], unique([]))
 
     def test_intersect1d( self ):
         # unique inputs
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index c539c040a..f8caeedb6 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -187,6 +187,7 @@ class TestSavezLoad(RoundtripTest, TestCase):
             assert_(not fp.closed)
 
         finally:
+            fp.close()
             os.remove(tmp)
 
     def test_closing_fid(self):
@@ -1426,8 +1427,6 @@ M   33  21.99
                              usecols=("A", "C", "E"), names=True)
         assert_equal(test.dtype.names, ctrl_names)
 
-
-
     def test_fixed_width_names(self):
         "Test fix-width w/ names"
         data = "    A    B   C\n    0    1 2.3\n   45   67   9."
@@ -1451,6 +1450,14 @@ M   33  21.99
         test = np.ndfromtxt(StringIO(data), **kwargs)
         assert_equal(test, ctrl)
 
+    def test_comments_is_none(self):
+        # Github issue 329 (None was previously being converted to 'None').
+        test = np.genfromtxt(StringIO("test1,testNonetherestofthedata"),
+                             dtype=None, comments=None, delimiter=',')
+        assert_equal(test[1], asbytes('testNonetherestofthedata'))
+        test = np.genfromtxt(StringIO("test1, testNonetherestofthedata"),
+                             dtype=None, comments=None, delimiter=',')
+        assert_equal(test[1], asbytes(' testNonetherestofthedata'))
 
     def test_recfromtxt(self):
         #
@@ -1471,7 +1478,6 @@ M   33  21.99
         assert_equal(test.mask, control.mask)
         assert_equal(test.A, [0, 2])
 
-
     def test_recfromcsv(self):
         #
         data = StringIO('A,B\n0,1\n2,3')
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index f25452064..a7f1c074d 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -1420,8 +1420,8 @@ def matrix_rank(M, tol=None):
     """
     Return matrix rank of array using SVD method
 
-    Rank of the array is the number of SVD singular values of the
-    array that are greater than `tol`.
+    Rank of the array is the number of SVD singular values of the array that are
+    greater than `tol`.
 
     Parameters
     ----------
@@ -1431,27 +1431,53 @@ def matrix_rank(M, tol=None):
        threshold below which SVD values are considered zero. If `tol` is
        None, and ``S`` is an array with singular values for `M`, and
        ``eps`` is the epsilon value for datatype of ``S``, then `tol` is
-       set to ``S.max() * eps``.
+       set to ``S.max() * max(M.shape) * eps``.
 
     Notes
     -----
-    Golub and van Loan [1]_ define "numerical rank deficiency" as using
-    tol=eps*S[0] (where S[0] is the maximum singular value and thus the
-    2-norm of the matrix). This is one definition of rank deficiency,
-    and the one we use here.  When floating point roundoff is the main
-    concern, then "numerical rank deficiency" is a reasonable choice. In
-    some cases you may prefer other definitions. The most useful measure
-    of the tolerance depends on the operations you intend to use on your
-    matrix. For example, if your data come from uncertain measurements
-    with uncertainties greater than floating point epsilon, choosing a
-    tolerance near that uncertainty may be preferable.  The tolerance
-    may be absolute if the uncertainties are absolute rather than
-    relative.
+    The default threshold to detect rank deficiency is a test on the magnitude
+    of the singular values of `M`.  By default, we identify singular values less
+    than ``S.max() * max(M.shape) * eps`` as indicating rank deficiency (with
+    the symbols defined above). This is the algorithm MATLAB uses [1].  It also
+    appears in *Numerical recipes* in the discussion of SVD solutions for linear
+    least squares [2].
+
+    This default threshold is designed to detect rank deficiency accounting for
+    the numerical errors of the SVD computation.  Imagine that there is a column
+    in `M` that is an exact (in floating point) linear combination of other
+    columns in `M`. Computing the SVD on `M` will not produce a singular value
+    exactly equal to 0 in general: any difference of the smallest SVD value from
+    0 will be caused by numerical imprecision in the calculation of the SVD.
+    Our threshold for small SVD values takes this numerical imprecision into
+    account, and the default threshold will detect such numerical rank
+    deficiency.  The threshold may declare a matrix `M` rank deficient even if
+    the linear combination of some columns of `M` is not exactly equal to
+    another column of `M` but only numerically very close to another column of
+    `M`.
+
+    We chose our default threshold because it is in wide use.  Other thresholds
+    are possible.  For example, elsewhere in the 2007 edition of *Numerical
+    recipes* there is an alternative threshold of ``S.max() *
+    np.finfo(M.dtype).eps / 2. * np.sqrt(m + n + 1.)``. The authors describe
+    this threshold as being based on "expected roundoff error" (p 71).
+
+    The thresholds above deal with floating point roundoff error in the
+    calculation of the SVD.  However, you may have more information about the
+    sources of error in `M` that would make you consider other tolerance values
+    to detect *effective* rank deficiency.  The most useful measure of the
+    tolerance depends on the operations you intend to use on your matrix.  For
+    example, if your data come from uncertain measurements with uncertainties
+    greater than floating point epsilon, choosing a tolerance near that
+    uncertainty may be preferable.  The tolerance may be absolute if the
+    uncertainties are absolute rather than relative.
 
     References
     ----------
-    .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*.
-       Baltimore: Johns Hopkins University Press, 1996.
+    .. [1] MATLAB reference documention, "Rank"
+           http://www.mathworks.com/help/techdoc/ref/rank.html
+    .. [2] W. H. Press, S. A. Teukolsky, W. T. Vetterling and B. P. Flannery,
+           "Numerical Recipes (3rd edition)", Cambridge University Press, 2007,
+           page 795.
 
     Examples
     --------
@@ -1465,7 +1491,6 @@ def matrix_rank(M, tol=None):
     1
     >>> matrix_rank(np.zeros((4,)))
     0
-
     """
     M = asarray(M)
     if M.ndim > 2:
@@ -1474,7 +1499,7 @@ def matrix_rank(M, tol=None):
         return int(not all(M==0))
     S = svd(M, compute_uv=False)
     if tol is None:
-        tol = S.max() * finfo(S.dtype).eps
+        tol = S.max() * max(M.shape) * finfo(S.dtype).eps
     return sum(S > tol)
 
 
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 2a2ec3da5..623939863 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -3,7 +3,9 @@
 import sys
 
 import numpy as np
-from numpy.testing import *
+from numpy.testing import (TestCase, assert_, assert_equal, assert_raises,
+                           assert_array_equal, assert_almost_equal,
+                           run_module_suite)
 from numpy import array, single, double, csingle, cdouble, dot, identity
 from numpy import multiply, atleast_2d, inf, asarray, matrix
 from numpy import linalg
@@ -451,6 +453,19 @@ class TestMatrixRank(object):
         yield assert_equal, matrix_rank(1), 1
 
 
+def test_reduced_rank():
+    # Test matrices with reduced rank
+    rng = np.random.RandomState(20120714)
+    for i in range(100):
+        # Make a rank deficient matrix
+        X = rng.normal(size=(40, 10))
+        X[:, 0] = X[:, 1] + X[:, 2]
+        # Assert that matrix_rank detected deficiency
+        assert_equal(matrix_rank(X), 9)
+        X[:, 3] = X[:, 4] + X[:, 5]
+        assert_equal(matrix_rank(X), 8)
+
+
 class TestQR(TestCase):
     def test_qr_empty(self):
         a = np.zeros((0,2))
diff --git a/numpy/testing/decorators.py b/numpy/testing/decorators.py
index a7cc9a847..053b99211 100644
--- a/numpy/testing/decorators.py
+++ b/numpy/testing/decorators.py
@@ -210,7 +210,7 @@ def knownfailureif(fail_condition, msg=None):
         from noseclasses import KnownFailureTest
         def knownfailer(*args, **kwargs):
             if fail_val():
-                raise KnownFailureTest, msg
+                raise KnownFailureTest(msg)
             else:
                 return f(*args, **kwargs)
         return nose.tools.make_decorator(f)(knownfailer)