84 files changed, 2893 insertions, 1019 deletions
diff --git a/.gitignore b/.gitignore
index f54e63b89..416299a1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@
 .*.sw[nop]
 .sw[nop]
 *.tmp
+*.vim
+tags
 
 # Compiled source #
 ###################
@@ -90,3 +92,29 @@ numpy/core/include/numpy/_numpyconfig.h
 numpy/version.py
 site.cfg
 .tox
+numpy/core/include/numpy/__multiarray_api.c
+numpy/core/include/numpy/__ufunc_api.c
+numpy/core/include/numpy/__umath_generated.c
+numpy/core/include/numpy/config.h
+numpy/core/include/numpy/multiarray_api.txt
+numpy/core/include/numpy/ufunc_api.txt
+numpy/core/lib/
+numpy/core/src/multiarray/arraytypes.c
+numpy/core/src/multiarray/einsum.c
+numpy/core/src/multiarray/lowlevel_strided_loops.c
+numpy/core/src/multiarray/multiarray_tests.c
+numpy/core/src/multiarray/nditer_templ.c
+numpy/core/src/multiarray/scalartypes.c
+numpy/core/src/npymath/ieee754.c
+numpy/core/src/npymath/npy_math.c
+numpy/core/src/npymath/npy_math_complex.c
+numpy/core/src/npysort/heapsort.c
+numpy/core/src/npysort/mergesort.c
+numpy/core/src/npysort/quicksort.c
+numpy/core/src/npysort/sort.c
+numpy/core/src/scalarmathmodule.c
+numpy/core/src/umath/funcs.inc
+numpy/core/src/umath/loops.c
+numpy/core/src/umath/umath_tests.c
+numpy/distutils/__config__.py
+doc/source/reference/generated
\ No newline at end of file
diff --git a/doc/TESTS.rst.txt b/doc/TESTS.rst.txt
index 9bde1849f..92f236f5a 100644
--- a/doc/TESTS.rst.txt
+++ b/doc/TESTS.rst.txt
@@ -107,6 +107,11 @@
 whether a certain assumption is valid. If the assertion fails, the test
 fails. Note that the Python builtin ``assert`` should not be used, because
 it is stripped during compilation with ``-O``.
 
+Note that ``test_`` functions or methods should not have a docstring, because
+that makes it hard to identify the test from the output of running the test
+suite with ``verbose=2`` (or similar verbosity setting). Use plain comments
+(``#``) if necessary.
+
 Sometimes it is convenient to run ``test_yyy.py`` by itself, so we
 add ::
@@ -174,7 +179,7 @@ nose decorators::
 
   @nose.with_setup(setup_func, teardown_func)
   def test_with_extras():
-      """This test uses the setup/teardown functions."""
+      # This test uses the setup/teardown functions.
       global helpful_variable
       print " In test_with_extras"
      print " Helpful is %s" % helpful_variable
@@ -198,6 +203,15 @@
 Note that 'check_even' is not itself a test (no 'test' in the name), but
 'test_evens' is a generator that returns a series of tests, using
 'check_even', across a range of inputs.
 
+A problem with generator tests is that when a test fails, it can be hard
+to see which parameters it failed for. To avoid this problem, ensure that:
+
+ - No computation related to the features tested is done in the
+   ``test_*`` generator function, but delegated to a corresponding
+   ``check_*`` function (which can be inside the generator, to share namespace).
+ - The generators are used *solely* for loops over parameters.
+ - Those parameters are *not* arrays.
+
 .. warning::
 
    Parametric tests cannot be implemented on classes derived from
diff --git a/doc/release/1.7.0-notes.rst b/doc/release/1.7.0-notes.rst
new file mode 100644
index 000000000..e8b1de72d
--- /dev/null
+++ b/doc/release/1.7.0-notes.rst
@@ -0,0 +1,279 @@
+=========================
+NumPy 1.7.0 Release Notes
+=========================
+
+This release includes several new features as well as numerous bug fixes and
+refactorings. It supports Python 2.4 - 2.7 and 3.1 - 3.2.
+
+Highlights
+==========
+
+* ``where=`` parameter to ufuncs (allows the use of boolean arrays to choose
+  where a computation should be done)
+* ``vectorize`` improvements (added the 'excluded' and 'cache' keywords,
+  general cleanup and bug fixes)
+* ``numpy.random.choice`` (random sample generating function)
+
+
+Compatibility notes
+===================
+
+In a future version of numpy, the functions np.diag, np.diagonal, and
+the diagonal method of ndarrays will return a view onto the original
+array, instead of producing a copy as they do now. This makes a
+difference if you write to the array returned by any of these
+functions. To facilitate this transition, numpy 1.7 produces a
+FutureWarning if it detects that you may be attempting to write to
+such an array. See the documentation for np.diagonal for details.
+
+The default casting rule for UFunc out= parameters has been changed from
+'unsafe' to 'same_kind'. Most usages which violate the 'same_kind'
+rule are likely bugs, so this change may expose previously undetected
+errors in projects that depend on NumPy.
+
+Full-array boolean indexing has been optimized to use a new, specialized
+code path. This code path should produce the same results, but any
+feedback about changes to your code would be appreciated.
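As a rough illustration of the ``out=`` casting change described above (a
sketch with made-up values, not part of the patch; exactly how a 'same_kind'
violation surfaces may vary between NumPy versions)::

    import numpy as np

    a = np.zeros(3, dtype=np.int32)
    b = np.array([0.5, 1.5, 2.5])            # float64 operand

    # Under the old 'unsafe' default the float results were silently
    # truncated into the integer output array; under the 'same_kind'
    # default this out= request is rejected instead.
    try:
        np.add(a, b, out=a)
    except TypeError as exc:
        print("rejected by 'same_kind' casting: %s" % exc)

    # The old behavior can still be requested explicitly.
    np.add(a, b, out=a, casting='unsafe')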
+
+Attempting to write to a read-only array (one with
+``arr.flags.writeable`` set to ``False``) used to raise either a
+RuntimeError, ValueError, or TypeError inconsistently, depending on
+which code path was taken. It now consistently raises a ValueError.
+
+The <ufunc>.reduce functions evaluate some reductions in a different
+order than in previous versions of NumPy, generally providing higher
+performance. Because of the nature of floating-point arithmetic, this
+may subtly change some results, just as linking NumPy to a different
+BLAS implementation such as MKL can.
+
+If upgrading from 1.5, note that substantial code has been added and
+some code paths have been altered in 1.6 and 1.7, particularly in the
+areas of type resolution and buffered iteration over universal
+functions. This might have an impact on your code, particularly if
+you relied on accidental behavior in the past.
+
+New features
+============
+
+Mask-based NA missing values
+----------------------------
+
+Preliminary support for NA missing values similar to those in R has
+been implemented. This was done by adding optional NA masks to the core
+array object.
+
+.. note:: The NA API is *experimental*, and may undergo changes in future
+   versions of NumPy. The current implementation based on masks will likely be
+   supplemented by a second one based on bit-patterns, and it is possible that
+   a difference will be made between missing and ignored data.
+
+While a significant amount of the NumPy functionality has been extended to
+support NA masks, not everything is yet supported. Here is an (incomplete)
+list of things that do and do not work with NA values:
+
+What works with NA:
+  * Basic indexing and slicing, as well as full boolean mask indexing.
+  * All element-wise ufuncs.
+  * All UFunc.reduce methods, with a new skipna parameter.
+  * np.all and np.any satisfy the rules NA | True == True and
+    NA & False == False
+  * The nditer object.
+  * Array methods:
+    + ndarray.clip, ndarray.min, ndarray.max, ndarray.sum, ndarray.prod,
+      ndarray.conjugate, ndarray.diagonal, ndarray.flatten
+    + numpy.concatenate, numpy.column_stack, numpy.hstack,
+      numpy.vstack, numpy.dstack, numpy.squeeze, numpy.mean, numpy.std,
+      numpy.var
+
+What doesn't work with NA:
+  * Fancy indexing, such as with lists and partial boolean masks.
+  * ndarray.flat and any other methods that use the old iterator
+    mechanism instead of the newer nditer.
+  * Struct dtypes, which will have corresponding struct masks with
+    one mask value per primitive field of the struct dtype.
+  * UFunc.accumulate, UFunc.reduceat.
+  * Ufunc calls with both NA masks and a where= mask at the same time.
+  * File I/O has not been extended to support NA-masks.
+  * np.logical_and and np.logical_or don't satisfy the
+    rules NA | True == True and NA & False == False yet.
+  * Array methods:
+    + ndarray.argmax, ndarray.argmin
+    + numpy.repeat, numpy.delete (relies on fancy indexing),
+      numpy.append, numpy.insert (relies on fancy indexing),
+      numpy.where
+
+Differences with R:
+  * R's parameter na.rm=T is spelled skipna=True in NumPy.
+  * np.isna(nan) is False, but R's is.na(nan) is TRUE. This is because
+    NumPy's NA is treated independently of the underlying data type.
+  * Boolean indexing, where the result is compressed to just
+    the elements with true in the mask, raises if the boolean mask
+    has an NA value in it. This is because that value could be either
+    True or False, meaning the count of the output array is actually
+    NA. R treats this case in a manner inconsistent with the NA model,
+    returning NA values in the spots where the boolean index has NA.
+    This may have a practical advantage in spite of violating the
+    NA theoretical model, so NumPy could adopt the behavior if necessary.
+
+For more information, see `doc/neps/missing-data.rst
+<https://github.com/numpy/numpy/blob/maintenance/1.7.x/doc/neps/missing-data.rst>`_.
+
+Reduction UFuncs Generalize axis= Parameter
+-------------------------------------------
+
+Any ufunc.reduce function call, as well as other reductions like
+sum, prod, any, all, max and min, supports choosing a subset of the
+axes to reduce over. Previously, one could say axis=None to mean all
+the axes or axis=# to pick a single axis. Now, one can also say
+axis=(#,#) to pick a list of axes for reduction.
+
+Reduction UFuncs New keepdims= Parameter
+----------------------------------------
+
+There is a new keepdims= parameter which, if set to True, doesn't
+throw away the reduction axes but instead sets them to have size one.
+When this option is set, the reduction result will broadcast correctly
+to the original operand which was reduced.
+
+Datetime support
+----------------
+
+.. note:: The datetime API is *experimental* in 1.7.0, and may undergo changes
+   in future versions of NumPy.
+
+There have been a lot of fixes and enhancements to datetime64 compared
+to NumPy 1.6:
+
+* the parser is quite strict about only accepting ISO 8601 dates, with a few
+  convenience extensions
+* converts between units correctly
+* datetime arithmetic works correctly
+* business day functionality (allows the datetime to be used in contexts where
+  only certain days of the week are valid)
+
+The notes in `doc/source/reference/arrays.datetime.rst
+<https://github.com/numpy/numpy/blob/maintenance/1.7.x/doc/source/reference/arrays.datetime.rst>`_
+(also available in the online docs at `arrays.datetime.html
+<http://docs.scipy.org/doc/numpy/reference/arrays.datetime.html>`_) should be
+consulted for more details.
+
+Custom formatter for printing arrays
+------------------------------------
+
+See the new ``formatter`` parameter of the ``numpy.set_printoptions``
+function.
+
+New function numpy.random.choice
+--------------------------------
+
+A generic sampling function has been added which will generate samples from
+a given array-like. The samples can be with or without replacement, and
+with uniform or given non-uniform probabilities.
+
+New function isclose
+--------------------
+
+Returns a boolean array where two arrays are element-wise equal within a
+tolerance. Both relative and absolute tolerance can be specified. The
+function is NA aware.
+
+Preliminary multi-dimensional support in the polynomial package
+----------------------------------------------------------------
+
+Axis keywords have been added to the integration and differentiation
+functions and a tensor keyword was added to the evaluation functions.
+These additions allow multi-dimensional coefficient arrays to be used in
+those functions. New functions for evaluating 2-D and 3-D coefficient
+arrays on grids or sets of points were added together with 2-D and 3-D
+pseudo-Vandermonde matrices that can be used for fitting.
+
+Support for mask-based NA values in the polynomial package fits
+----------------------------------------------------------------
+
+The fitting functions recognize and remove masked data from the fit.
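A minimal sketch (not part of the patch, with illustrative values) pulling
together several of the features described above: tuple-of-axes reductions,
the new ``keepdims=`` flag, ``numpy.random.choice`` and ``isclose``::

    import numpy as np

    a = np.arange(24.).reshape(2, 3, 4)

    # Reduce over a subset of the axes instead of one axis or all of them.
    s = np.add.reduce(a, axis=(0, 2))                     # shape (3,)

    # keepdims=True keeps the reduced axes with length one, so the result
    # broadcasts back against the original operand.
    total = np.add.reduce(a, axis=(0, 2), keepdims=True)  # shape (1, 3, 1)
    centred = a - total / 8.0                             # 8 elements per slot

    # Sampling with or without replacement, uniform or weighted.
    pick = np.random.choice(5, size=3, replace=False,
                            p=[0.1, 0.2, 0.3, 0.2, 0.2])

    # Element-wise approximate equality with relative/absolute tolerances.
    ok = np.isclose([1.0, 2.0], [1.0 + 1e-9, 2.1], rtol=1e-5, atol=1e-8)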
+ +Ability to pad rank-n arrays +---------------------------- + +A pad module containing functions for padding n-dimensional arrays has +been added. The various private padding functions are exposed as options to +a public 'pad' function. Example:: + + pad(a, 5, mode='mean') + +Current modes are ``constant``, ``edge``, ``linear_ramp``, ``maximum``, +``mean``, ``median``, ``minimum``, ``reflect``, ``symmetric``, ``wrap``, and +``<function>``. + + +New argument to searchsorted +---------------------------- + +The function searchsorted now accepts a 'sorter' argument that is a +permuation array that sorts the array to search. + +C API +----- + +New function ``PyArray_RequireWriteable`` provides a consistent +interface for checking array writeability -- any C code which works +with arrays whose WRITEABLE flag is not known to be True a priori, +should make sure to call this function before writing. + +NumPy C Style Guide added (``doc/C_STYLE_GUIDE.rst.txt``). + +Changes +======= + +General +------- + +The function np.concatenate tries to match the layout of its input +arrays. Previously, the layout did not follow any particular reason, +and depended in an undesirable way on the particular axis chosen for +concatenation. A bug was also fixed which silently allowed out of bounds +axis arguments. + +The ufuncs logical_or, logical_and, and logical_not now follow Python's +behavior with object arrays, instead of trying to call methods on the +objects. For example the expression (3 and 'test') produces the string +'test', and now np.logical_and(np.array(3, 'O'), np.array('test', 'O')) +produces 'test' as well. + +C-API +----- + +The following macros now require trailing semicolons:: + + NPY_BEGIN_THREADS_DEF + NPY_BEGIN_THREADS + NPY_ALLOW_C_API + NPY_ALLOW_C_API_DEF + NPY_DISABLE_C_API + + +Deprecations +============ + +General +------- + +Specifying a custom string formatter with a `_format` array attribute is +deprecated. The new `formatter` keyword in ``numpy.set_printoptions`` or +``numpy.array2string`` can be used instead. + +The deprecated imports in the polynomial package have been removed. + +C-API +----- + +Direct access to the fields of PyArrayObject* has been deprecated. Direct +access has been recommended against for many releases. Expect similar +deprecations for PyArray_Descr* and other core objects in the future as +preparation for NumPy 2.0. + +The macros in old_defines.h are deprecated and will be removed in the next +minor release (>= 1.8). The sed script tools/replace_old_macros.sed can +be used to replace these macros with the newer versions. + +You can test your code against the deprecated C API by #defining +NPY_NO_DEPRECATED_API to the target version number, for example +NPY_1_7_API_VERSION, before including any NumPy headers. diff --git a/doc/source/reference/c-api.array.rst b/doc/source/reference/c-api.array.rst index 8736cbc3f..bf8077a69 100644 --- a/doc/source/reference/c-api.array.rst +++ b/doc/source/reference/c-api.array.rst @@ -289,6 +289,7 @@ From scratch Fill the array pointed to by *obj* ---which must be a (subclass of) bigndarray---with the contents of *val* (evaluated as a byte). + This macro calls memset, so obj must be contiguous. .. 
cfunction:: PyObject* PyArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) diff --git a/numpy/core/SConscript b/numpy/core/SConscript index 46087610c..952544a25 100644 --- a/numpy/core/SConscript +++ b/numpy/core/SConscript @@ -429,7 +429,9 @@ env.Install('$distutils_installdir/lib/npy-pkg-config', mlib_ini) env.Install('$distutils_installdir/lib/npy-pkg-config', npymath_ini) # npysort core lib -npysort_src = [env.GenerateFromTemplate(pjoin('src', 'npysort', 'sort.c.src'))] +npysort_src = [env.GenerateFromTemplate(pjoin('src', 'npysort', 'quicksort.c.src')), + env.GenerateFromTemplate(pjoin('src', 'npysort', 'mergesort.c.src')), + env.GenerateFromTemplate(pjoin('src', 'npysort', 'heapsort.c.src'))] env.StaticExtLibrary("npysort", npysort_src) env.Prepend(LIBS=["npysort"]) env.Prepend(LIBPATH=["."]) diff --git a/numpy/core/bento.info b/numpy/core/bento.info index 04401f991..bf8ae3a81 100644 --- a/numpy/core/bento.info +++ b/numpy/core/bento.info @@ -10,7 +10,9 @@ Library: src/npymath/halffloat.c CompiledLibrary: npysort Sources: - src/npysort/sort.c.src + src/npysort/quicksort.c.src, + src/npysort/mergesort.c.src, + src/npysort/heapsort.c.src Extension: multiarray Sources: src/multiarray/multiarraymodule_onefile.c diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt index 99ea072ca..5bdb33c94 100644 --- a/numpy/core/code_generators/cversions.txt +++ b/numpy/core/code_generators/cversions.txt @@ -10,4 +10,4 @@ # PyArray_CountNonzero, PyArray_NewLikeArray and PyArray_MatrixProduct2. 0x00000006 = e61d5dc51fa1c6459328266e215d6987 # Version 7 (NumPy 1.7) improved datetime64, misc utilities. -0x00000007 = 1768b6c404a3d5a2a6bfe7c68f89e3aa +0x00000007 = e396ba3912dcf052eaee1b0b203a7724 diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py index fd2b9628e..c49c3c346 100644 --- a/numpy/core/code_generators/numpy_api.py +++ b/numpy/core/code_generators/numpy_api.py @@ -324,6 +324,10 @@ multiarray_funcs_api = { 'PyArray_DebugPrint': 285, 'PyArray_FailUnlessWriteable': 286, 'PyArray_SetUpdateIfCopyBase': 287, + 'PyDataMem_NEW': 288, + 'PyDataMem_FREE': 289, + 'PyDataMem_RENEW': 290, + 'PyDataMem_SetEventHook': 291, } ufunc_types_api = { diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index 3e89079dd..d73f1313c 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -937,7 +937,7 @@ def diagonal(a, offset=0, axis1=0, axis2=1): In NumPy 1.7, it continues to return a copy of the diagonal, but depending on this fact is deprecated. Writing to the resulting array continues to - work as it used to, but a DeprecationWarning will be issued. + work as it used to, but a FutureWarning will be issued. In NumPy 1.8, it will switch to returning a read-only view on the original array. Attempting to write to the resulting array will produce an error. 
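A small sketch (not part of the patch) of the transition the diagonal
docstring above describes; the explicit copy is the quick fix suggested by
the FutureWarning message added elsewhere in this change::

    import numpy as np

    a = np.arange(9).reshape(3, 3)

    d = a.diagonal()          # in 1.7 this is still a copy...
    d[0] = 100                # ...but writing to it triggers a FutureWarning

    # Make the copy explicit so the code keeps working once diagonal()
    # starts returning a read-only view (planned for 1.8).
    d = a.diagonal().copy()
    d[0] = 100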
diff --git a/numpy/core/include/numpy/ndarrayobject.h b/numpy/core/include/numpy/ndarrayobject.h index 74943d535..f00dd7744 100644 --- a/numpy/core/include/numpy/ndarrayobject.h +++ b/numpy/core/include/numpy/ndarrayobject.h @@ -229,8 +229,10 @@ PyArray_XDECREF_ERR(PyArrayObject *arr) #if PY_VERSION_HEX >= 0x02050000 #define DEPRECATE(msg) PyErr_WarnEx(PyExc_DeprecationWarning,msg,1) +#define DEPRECATE_FUTUREWARNING(msg) PyErr_WarnEx(PyExc_FutureWarning,msg,1) #else #define DEPRECATE(msg) PyErr_Warn(PyExc_DeprecationWarning,msg) +#define DEPRECATE_FUTUREWARNING(msg) PyErr_Warn(PyExc_FutureWarning,msg) #endif diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 9605c20f8..13483396b 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -327,10 +327,7 @@ struct NpyAuxData_tag { * allocated. */ - /* Data buffer */ -#define PyDataMem_NEW(size) ((char *)malloc(size)) -#define PyDataMem_FREE(ptr) free(ptr) -#define PyDataMem_RENEW(ptr,size) ((char *)realloc(ptr,size)) + /* Data buffer - PyDataMem_NEW/FREE/RENEW are in multiarraymodule.c */ #define NPY_USE_PYMEM 1 @@ -1714,6 +1711,13 @@ typedef struct { */ } PyArrayInterface; +/* + * This is a function for hooking into the PyDataMem_NEW/FREE/RENEW functions. + * See the documentation for PyDataMem_SetEventHook. + */ +typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size, + void *user_data); + #if !(defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API)) #include "npy_deprecated_api.h" #endif diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h index d3d5404f5..4b9d0ea32 100644 --- a/numpy/core/include/numpy/npy_3kcompat.h +++ b/numpy/core/include/numpy/npy_3kcompat.h @@ -137,17 +137,6 @@ PyUnicode_Concat2(PyObject **left, PyObject *right) *left = newobj; } - -/* - * Accessing items of ob_base - */ - -#if (PY_VERSION_HEX < 0x02060000) -#define Py_TYPE(o) (((PyObject*)(o))->ob_type) -#define Py_REFCNT(o) (((PyObject*)(o))->ob_refcnt) -#define Py_SIZE(o) (((PyVarObject*)(o))->ob_size) -#endif - /* * PyFile_* compatibility */ @@ -204,7 +193,7 @@ npy_PyFile_Dup(PyObject *file, char *mode) fclose(handle); return NULL; } - fseek(handle, pos, SEEK_SET); + npy_fseek(handle, pos, SEEK_SET); return handle; } @@ -215,11 +204,11 @@ static NPY_INLINE int npy_PyFile_DupClose(PyObject *file, FILE* handle) { PyObject *ret; - long position; - position = ftell(handle); + Py_ssize_t position; + position = npy_ftell(handle); fclose(handle); - ret = PyObject_CallMethod(file, "seek", "li", position, 0); + ret = PyObject_CallMethod(file, "seek", NPY_SSIZE_T_PYFMT "i", position, 0); if (ret == NULL) { return -1; } diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index 4f895e0a7..7fca7e220 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -16,6 +16,15 @@ #define NPY_INLINE #endif +/* Enable 64 bit file position support on win-amd64. 
Ticket #1660 */ +#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400) + #define npy_fseek _fseeki64 + #define npy_ftell _ftelli64 +#else + #define npy_fseek fseek + #define npy_ftell ftell +#endif + /* enums for detected endianness */ enum { NPY_CPU_UNKNOWN_ENDIAN, @@ -48,9 +57,7 @@ typedef Py_uintptr_t npy_uintp; #define PY_SSIZE_T_MIN INT_MIN #endif #define NPY_SSIZE_T_PYFMT "i" -#undef PyIndex_Check #define constchar const char -#define PyIndex_Check(op) 0 #else #define NPY_SSIZE_T_PYFMT "n" #define constchar char diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 2d36f5ee1..a1000ae0b 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -671,7 +671,10 @@ def configuration(parent_package='',top_path=None): # This library is created for the build but it is not installed config.add_library('npysort', - sources = [join('src', 'npysort', 'sort.c.src')]) + sources = [join('src', 'npysort', 'quicksort.c.src'), + join('src', 'npysort', 'mergesort.c.src'), + join('src', 'npysort', 'heapsort.c.src')]) + ####################################################################### # multiarray module # diff --git a/numpy/core/src/dummymodule.c b/numpy/core/src/dummymodule.c index eca6edb54..1d48709e9 100644 --- a/numpy/core/src/dummymodule.c +++ b/numpy/core/src/dummymodule.c @@ -9,7 +9,7 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #include <Python.h> -#include <numpy/npy_3kcompat.h> +#include <npy_pycompat.h> static struct PyMethodDef methods[] = { {NULL, NULL, 0, NULL} diff --git a/numpy/core/src/multiarray/array_assign.c b/numpy/core/src/multiarray/array_assign.c index 03f81f21c..9c1685c16 100644 --- a/numpy/core/src/multiarray/array_assign.c +++ b/numpy/core/src/multiarray/array_assign.c @@ -16,7 +16,7 @@ #include <numpy/ndarraytypes.h> #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "shape.h" diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c index 98c2b19ff..4f462cb05 100644 --- a/numpy/core/src/multiarray/array_assign_array.c +++ b/numpy/core/src/multiarray/array_assign_array.c @@ -15,7 +15,7 @@ #include <numpy/ndarraytypes.h> #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "convert_datatype.h" #include "methods.h" diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c index c59e6752d..57f8b9074 100644 --- a/numpy/core/src/multiarray/array_assign_scalar.c +++ b/numpy/core/src/multiarray/array_assign_scalar.c @@ -15,7 +15,7 @@ #include <numpy/ndarraytypes.h> #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "convert_datatype.h" #include "methods.h" diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index 3cba466dd..e8bc6b7b6 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -32,7 +32,7 @@ maintainer email: oliphant.travis@ieee.org #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" @@ -698,7 +698,7 @@ array_might_be_written(PyArrayObject *obj) "release -- see numpy.diagonal docs for details. 
The quick fix is\n" "to make an explicit copy (e.g., do arr.diagonal().copy())."; if (PyArray_FLAGS(obj) & NPY_ARRAY_WARN_ON_WRITE) { - if (DEPRECATE(msg) < 0) { + if (DEPRECATE_FUTUREWARNING(msg) < 0) { return -1; } /* Only warn once per array */ diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index c446619b6..ad45cdb9e 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -7,7 +7,7 @@ #define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/npy_math.h" #include "numpy/halffloat.h" diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index f9a7dddc0..530edbb1a 100644 --- a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -9,7 +9,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "buffer.h" #include "numpyos.h" diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c index d8d02ef0c..618a8714c 100644 --- a/numpy/core/src/multiarray/calculation.c +++ b/numpy/core/src/multiarray/calculation.c @@ -8,7 +8,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "number.h" diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c index 9281a6d84..f62b37b4d 100644 --- a/numpy/core/src/multiarray/common.c +++ b/numpy/core/src/multiarray/common.c @@ -6,7 +6,7 @@ #include "numpy/arrayobject.h" #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "usertypes.h" diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index ec759b85b..914bc274d 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -9,7 +9,7 @@ #include "numpy/arrayobject.h" #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "arraytypes.h" diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 042aea07b..8b646e59b 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -9,7 +9,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "arrayobject.h" #include "mapping.h" diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c index 6fdb10bb2..de7468c51 100644 --- a/numpy/core/src/multiarray/convert_datatype.c +++ b/numpy/core/src/multiarray/convert_datatype.c @@ -9,7 +9,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "scalartypes.h" diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index c9cdffb23..ae29add6e 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -11,7 +11,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "ctors.h" @@ -3000,41 +3000,21 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea if (num < 0) { int fail = 0; - -#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400) - /* Workaround Win64 fwrite() bug. 
Ticket #1660 */ - start = (npy_intp )_ftelli64(fp); - if (start < 0) { - fail = 1; - } - if (_fseeki64(fp, 0, SEEK_END) < 0) { - fail = 1; - } - numbytes = (npy_intp) _ftelli64(fp); - if (numbytes < 0) { - fail = 1; - } - numbytes -= start; - if (_fseeki64(fp, start, SEEK_SET) < 0) { - fail = 1; - } -#else - start = (npy_intp)ftell(fp); + start = (npy_intp) npy_ftell(fp); if (start < 0) { fail = 1; } - if (fseek(fp, 0, SEEK_END) < 0) { + if (npy_fseek(fp, 0, SEEK_END) < 0) { fail = 1; } - numbytes = (npy_intp) ftell(fp); + numbytes = (npy_intp) npy_ftell(fp); if (numbytes < 0) { fail = 1; } numbytes -= start; - if (fseek(fp, start, SEEK_SET) < 0) { + if (npy_fseek(fp, start, SEEK_SET) < 0) { fail = 1; } -#endif if (fail) { PyErr_SetString(PyExc_IOError, "could not seek in file"); diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c index 55fb3e7ed..6f70fc2ff 100644 --- a/numpy/core/src/multiarray/datetime.c +++ b/numpy/core/src/multiarray/datetime.c @@ -18,7 +18,7 @@ #include <numpy/arrayobject.h> #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/arrayscalars.h" #include "methods.h" diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c index aa38594cb..331e10496 100644 --- a/numpy/core/src/multiarray/datetime_busday.c +++ b/numpy/core/src/multiarray/datetime_busday.c @@ -15,7 +15,7 @@ #include <numpy/arrayobject.h> #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/arrayscalars.h" #include "lowlevel_strided_loops.h" diff --git a/numpy/core/src/multiarray/datetime_busdaycal.c b/numpy/core/src/multiarray/datetime_busdaycal.c index b5b74dd14..82eea9f20 100644 --- a/numpy/core/src/multiarray/datetime_busdaycal.c +++ b/numpy/core/src/multiarray/datetime_busdaycal.c @@ -16,7 +16,7 @@ #include <numpy/arrayobject.h> #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/arrayscalars.h" #include "lowlevel_strided_loops.h" diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c index cfeabc783..32009bb79 100644 --- a/numpy/core/src/multiarray/datetime_strings.c +++ b/numpy/core/src/multiarray/datetime_strings.c @@ -17,7 +17,7 @@ #include <numpy/arrayobject.h> #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/arrayscalars.h" #include "methods.h" diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index 5090dc39c..596e9b837 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -11,7 +11,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "_datetime.h" #include "common.h" diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index b99dd5ff4..138275c4c 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -19,7 +19,7 @@ #include <numpy/arrayobject.h> #include <numpy/npy_cpu.h> -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "convert_datatype.h" #include "_datetime.h" diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 03def17b4..2c59533f8 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -16,7 +16,7 @@ #define _MULTIARRAYMODULE #include 
<numpy/arrayobject.h> #include <numpy/halffloat.h> -#include <numpy/npy_3kcompat.h> +#include <npy_pycompat.h> #include <ctype.h> diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c index 6e5455c01..faaf264e0 100644 --- a/numpy/core/src/multiarray/flagsobject.c +++ b/numpy/core/src/multiarray/flagsobject.c @@ -11,7 +11,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c index 51caba240..32212ebc4 100644 --- a/numpy/core/src/multiarray/getset.c +++ b/numpy/core/src/multiarray/getset.c @@ -10,7 +10,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "scalartypes.h" diff --git a/numpy/core/src/multiarray/hashdescr.c b/numpy/core/src/multiarray/hashdescr.c index 71bc2adac..bff266415 100644 --- a/numpy/core/src/multiarray/hashdescr.c +++ b/numpy/core/src/multiarray/hashdescr.c @@ -6,7 +6,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "hashdescr.h" diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 977580d70..8c80c26f1 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -11,7 +11,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "arrayobject.h" @@ -19,6 +19,7 @@ #include "lowlevel_strided_loops.h" #include "item_selection.h" +#include "npy_sort.h" /*NUMPY_API * Take @@ -890,7 +891,7 @@ _new_argsort(PyArrayObject *op, int axis, NPY_SORTKIND which) static PyArrayObject *global_obj; static int -qsortCompare (const void *a, const void *b) +sortCompare (const void *a, const void *b) { return PyArray_DESCR(global_obj)->f->compare(a,b,global_obj); } @@ -954,7 +955,9 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which) PyArrayObject *ap = NULL, *store_arr = NULL; char *ip; int i, n, m, elsize, orign; - int axis_orig=axis; + int res = 0; + int axis_orig = axis; + int (*sort)(void *, size_t, size_t, npy_comparator); n = PyArray_NDIM(op); if ((n == 0) || (PyArray_SIZE(op) == 1)) { @@ -975,13 +978,30 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which) if (PyArray_DESCR(op)->f->sort[which] != NULL) { return _new_sort(op, axis, which); } - if ((which != NPY_QUICKSORT) - || PyArray_DESCR(op)->f->compare == NULL) { + + if (PyArray_DESCR(op)->f->compare == NULL) { PyErr_SetString(PyExc_TypeError, - "desired sort not supported for this type"); + "type does not have compare function"); return -1; } + switch (which) { + case NPY_QUICKSORT : + sort = npy_quicksort; + break; + case NPY_HEAPSORT : + sort = npy_heapsort; + break; + case NPY_MERGESORT : + sort = npy_mergesort; + break; + default: + PyErr_SetString(PyExc_TypeError, + "requested sort kind is not supported"); + goto fail; + } + + SWAPAXES2(op); ap = (PyArrayObject *)PyArray_FromAny((PyObject *)op, @@ -1001,13 +1021,26 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which) store_arr = global_obj; global_obj = ap; for (ip = PyArray_DATA(ap), i = 0; i < n; i++, ip += elsize*m) { - qsort(ip, m, elsize, qsortCompare); + res = sort(ip, m, elsize, sortCompare); + if (res < 0) { + break; + } } global_obj = store_arr; if (PyErr_Occurred()) { goto fail; } + else if (res == -NPY_ENOMEM) { + PyErr_NoMemory(); + goto fail; + } + else if (res == -NPY_ECOMP) { + 
PyErr_SetString(PyExc_TypeError, + "sort comparison failed"); + goto fail; + } + finish: Py_DECREF(ap); /* Should update op if needed */ @@ -1045,6 +1078,8 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) npy_intp i, j, n, m, orign; int argsort_elsize; char *store_ptr; + int res = 0; + int (*sort)(void *, size_t, size_t, npy_comparator); n = PyArray_NDIM(op); if ((n == 0) || (PyArray_SIZE(op) == 1)) { @@ -1071,14 +1106,32 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) return (PyObject *)ret; } - if ((which != NPY_QUICKSORT) || PyArray_DESCR(op2)->f->compare == NULL) { + if (PyArray_DESCR(op2)->f->compare == NULL) { PyErr_SetString(PyExc_TypeError, - "requested sort not available for type"); + "type does not have compare function"); Py_DECREF(op2); op = NULL; goto fail; } + switch (which) { + case NPY_QUICKSORT : + sort = npy_quicksort; + break; + case NPY_HEAPSORT : + sort = npy_heapsort; + break; + case NPY_MERGESORT : + sort = npy_mergesort; + break; + default: + PyErr_SetString(PyExc_TypeError, + "requested sort kind is not supported"); + Py_DECREF(op2); + op = NULL; + goto fail; + } + /* ap will contain the reference to op2 */ SWAPAXES(ap, op2); op = (PyArrayObject *)PyArray_ContiguousFromAny((PyObject *)ap, @@ -1109,11 +1162,27 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) for (j = 0; j < m; j++) { ip[j] = j; } - qsort((char *)ip, m, sizeof(npy_intp), argsort_static_compare); + res = sort((char *)ip, m, sizeof(npy_intp), argsort_static_compare); + if (res < 0) { + break; + } } global_data = store_ptr; global_obj = store; + if (PyErr_Occurred()) { + goto fail; + } + else if (res == -NPY_ENOMEM) { + PyErr_NoMemory(); + goto fail; + } + else if (res == -NPY_ECOMP) { + PyErr_SetString(PyExc_TypeError, + "sort comparison failed"); + goto fail; + } + finish: Py_DECREF(op); SWAPBACK(op, ret); @@ -1123,7 +1192,6 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) Py_XDECREF(op); Py_XDECREF(ret); return NULL; - } diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c index af0d0d3c2..2e464a3db 100644 --- a/numpy/core/src/multiarray/iterators.c +++ b/numpy/core/src/multiarray/iterators.c @@ -9,7 +9,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "arrayobject.h" #include "iterators.h" diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index c57e1ccc1..78818b31d 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -151,7 +151,7 @@ static void #else /* unaligned copy and swap */ - memcpy(dst, src, @elsize@); + memmove(dst, src, @elsize@); # if @is_swap@ == 1 @swap@@elsize@(dst); # elif @is_swap@ == 2 @@ -239,7 +239,7 @@ _strided_to_strided(char *dst, npy_intp dst_stride, NpyAuxData *NPY_UNUSED(data)) { while (N > 0) { - memcpy(dst, src, src_itemsize); + memmove(dst, src, src_itemsize); dst += dst_stride; src += src_stride; --N; @@ -255,7 +255,7 @@ _swap_strided_to_strided(char *dst, npy_intp dst_stride, char *a, *b, c; while (N > 0) { - memcpy(dst, src, src_itemsize); + memmove(dst, src, src_itemsize); /* general in-place swap */ a = dst; b = dst + src_itemsize - 1; @@ -281,7 +281,7 @@ _swap_pair_strided_to_strided(char *dst, npy_intp dst_stride, npy_intp itemsize_half = src_itemsize / 2; while (N > 0) { - memcpy(dst, src, src_itemsize); + memmove(dst, src, src_itemsize); /* general 
in-place swap */ a = dst; b = dst + itemsize_half - 1; @@ -312,7 +312,7 @@ _contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride), npy_intp N, npy_intp src_itemsize, NpyAuxData *NPY_UNUSED(data)) { - memcpy(dst, src, src_itemsize*N); + memmove(dst, src, src_itemsize*N); } @@ -802,7 +802,7 @@ static void src_value = *((_TYPE1 *)src); # endif #else - memcpy(&src_value, src, sizeof(src_value)); + memmove(&src_value, src, sizeof(src_value)); #endif /* Do the cast */ @@ -838,7 +838,7 @@ static void *((_TYPE2 *)dst) = dst_value; # endif #else - memcpy(dst, &dst_value, sizeof(dst_value)); + memmove(dst, &dst_value, sizeof(dst_value)); #endif #if @contig@ diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 7abb3ff7f..cdefb9982 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -9,7 +9,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "iterators.h" diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index 93d246671..c4147eff2 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -10,7 +10,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "ctors.h" diff --git a/numpy/core/src/multiarray/multiarray_tests.c.src b/numpy/core/src/multiarray/multiarray_tests.c.src index 15d4c871e..0768ec0ca 100644 --- a/numpy/core/src/multiarray/multiarray_tests.c.src +++ b/numpy/core/src/multiarray/multiarray_tests.c.src @@ -2,7 +2,7 @@ #include <Python.h> #include "numpy/arrayobject.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" /* * TODO: @@ -375,6 +375,59 @@ clean_ax: return NULL; } +/* PyDataMem_SetHook tests */ +static int malloc_free_counts[2]; +static PyDataMem_EventHookFunc *old_hook = NULL; +static void *old_data; + +static void test_hook(void *old, void *new, size_t size, void *user_data) +{ + int* counters = (int *) user_data; + if (old == NULL) { + counters[0]++; /* malloc counter */ + } + if (size == 0) { + counters[1]++; /* free counter */ + } +} + +static PyObject* +test_pydatamem_seteventhook_start(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args)) +{ + malloc_free_counts[0] = malloc_free_counts[1] = 0; + old_hook = PyDataMem_SetEventHook(test_hook, (void *) malloc_free_counts, &old_data); + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject* +test_pydatamem_seteventhook_end(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args)) +{ + PyDataMem_EventHookFunc *my_hook; + void *my_data; + + my_hook = PyDataMem_SetEventHook(old_hook, old_data, &my_data); + if ((my_hook != test_hook) || (my_data != (void *) malloc_free_counts)) { + PyErr_SetString(PyExc_ValueError, + "hook/data was not the expected test hook"); + return NULL; + } + + if (malloc_free_counts[0] == 0) { + PyErr_SetString(PyExc_ValueError, + "malloc count is zero after test"); + return NULL; + } + if (malloc_free_counts[1] == 0) { + PyErr_SetString(PyExc_ValueError, + "free count is zero after test"); + return NULL; + } + + Py_INCREF(Py_None); + return Py_None; +} + static PyMethodDef Multiarray_TestsMethods[] = { {"test_neighborhood_iterator", test_neighborhood_iterator, @@ -382,6 +435,12 @@ static PyMethodDef Multiarray_TestsMethods[] = { {"test_neighborhood_iterator_oob", test_neighborhood_iterator_oob, METH_VARARGS, NULL}, + {"test_pydatamem_seteventhook_start", + test_pydatamem_seteventhook_start, + METH_NOARGS, 
NULL}, + {"test_pydatamem_seteventhook_end", + test_pydatamem_seteventhook_end, + METH_NOARGS, NULL}, {NULL, NULL, 0, NULL} /* Sentinel */ }; diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 1ab7823ad..1b9be3cd9 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -27,7 +27,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; @@ -3430,6 +3430,102 @@ test_interrupt(PyObject *NPY_UNUSED(self), PyObject *args) return PyInt_FromLong(a); } +/* malloc/free/realloc hook */ +NPY_NO_EXPORT PyDataMem_EventHookFunc *_PyDataMem_eventhook; +NPY_NO_EXPORT void *_PyDataMem_eventhook_user_data; + +/*NUMPY_API + * Sets the allocation event hook for numpy array data. + * Takes a PyDataMem_EventHookFunc *, which has the signature: + * void hook(void *old, void *new, size_t size, void *user_data). + * Also takes a void *user_data, and void **old_data. + * + * Returns a pointer to the previous hook or NULL. If old_data is + * non-NULL, the previous user_data pointer will be copied to it. + * + * If not NULL, hook will be called at the end of each PyDataMem_NEW/FREE/RENEW: + * result = PyDataMem_NEW(size) -> (*hook)(NULL, result, size, user_data) + * PyDataMem_FREE(ptr) -> (*hook)(ptr, NULL, 0, user_data) + * result = PyDataMem_RENEW(ptr, size) -> (*hook)(ptr, result, size, user_data) + * + * When the hook is called, the GIL will be held by the calling + * thread. The hook should be written to be reentrant, if it performs + * operations that might cause new allocation events (such as the + * creation/descruction numpy objects, or creating/destroying Python + * objects which might cause a gc) + */ +NPY_NO_EXPORT PyDataMem_EventHookFunc * +PyDataMem_SetEventHook(PyDataMem_EventHookFunc *newhook, + void *user_data, void **old_data) +{ + PyGILState_STATE gilstate = PyGILState_Ensure(); + PyDataMem_EventHookFunc *temp = _PyDataMem_eventhook; + _PyDataMem_eventhook = newhook; + if (old_data != NULL) { + *old_data = _PyDataMem_eventhook_user_data; + } + _PyDataMem_eventhook_user_data = user_data; + PyGILState_Release(gilstate); + return temp; +} + +/*NUMPY_API + * Allocates memory for array data. + */ +NPY_NO_EXPORT void * +PyDataMem_NEW(size_t size) +{ + void *result; + + result = malloc(size); + if (_PyDataMem_eventhook != NULL) { + PyGILState_STATE gilstate = PyGILState_Ensure(); + if (_PyDataMem_eventhook != NULL) { + (*_PyDataMem_eventhook)(NULL, result, size, + _PyDataMem_eventhook_user_data); + } + PyGILState_Release(gilstate); + } + return (char *)result; +} + +/*NUMPY_API + * Free memory for array data. + */ +NPY_NO_EXPORT void +PyDataMem_FREE(void *ptr) +{ + free(ptr); + if (_PyDataMem_eventhook != NULL) { + PyGILState_STATE gilstate = PyGILState_Ensure(); + if (_PyDataMem_eventhook != NULL) { + (*_PyDataMem_eventhook)(ptr, NULL, 0, + _PyDataMem_eventhook_user_data); + } + PyGILState_Release(gilstate); + } +} + +/*NUMPY_API + * Reallocate/resize memory for array data. 
+ */ +NPY_NO_EXPORT void * +PyDataMem_RENEW(void *ptr, size_t size) +{ + void *result; + + result = realloc(ptr, size); + if (_PyDataMem_eventhook != NULL) { + PyGILState_STATE gilstate = PyGILState_Ensure(); + if (_PyDataMem_eventhook != NULL) { + (*_PyDataMem_eventhook)(ptr, result, size, + _PyDataMem_eventhook_user_data); + } + PyGILState_Release(gilstate); + } + return (char *)result; +} + static struct PyMethodDef array_module_methods[] = { {"_get_ndarray_c_version", (PyCFunction)array__get_ndarray_c_version, diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h index bc4d01ff9..1251baa6e 100644 --- a/numpy/core/src/multiarray/nditer_impl.h +++ b/numpy/core/src/multiarray/nditer_impl.h @@ -17,7 +17,7 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include <numpy/arrayobject.h> -#include <numpy/npy_3kcompat.h> +#include <npy_pycompat.h> #include "convert_datatype.h" #include "lowlevel_strided_loops.h" diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c index d6029e814..fd88cdbc7 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.c +++ b/numpy/core/src/multiarray/nditer_pywrap.c @@ -13,11 +13,8 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include <numpy/arrayobject.h> -#include <numpy/npy_3kcompat.h> - #include "npy_config.h" - -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" typedef struct NewNpyArrayIterObject_tag NewNpyArrayIterObject; diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c index 4cbf80df9..c37a232f4 100644 --- a/numpy/core/src/multiarray/number.c +++ b/numpy/core/src/multiarray/number.c @@ -9,7 +9,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "number.h" diff --git a/numpy/core/src/multiarray/numpymemoryview.c b/numpy/core/src/multiarray/numpymemoryview.c index eda28e1af..991321a8c 100644 --- a/numpy/core/src/multiarray/numpymemoryview.c +++ b/numpy/core/src/multiarray/numpymemoryview.c @@ -16,7 +16,7 @@ #include "numpy/arrayscalars.h" #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpymemoryview.h" diff --git a/numpy/core/src/multiarray/numpyos.c b/numpy/core/src/multiarray/numpyos.c index 9307a2473..cef170108 100644 --- a/numpy/core/src/multiarray/numpyos.c +++ b/numpy/core/src/multiarray/numpyos.c @@ -11,7 +11,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" /* * From the C99 standard, section 7.19.6: The exponent always contains at least diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c index 2ba907f59..b067cd948 100644 --- a/numpy/core/src/multiarray/refcount.c +++ b/numpy/core/src/multiarray/refcount.c @@ -14,7 +14,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" static void _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype); diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c index dd0e0158b..00c71f913 100644 --- a/numpy/core/src/multiarray/scalarapi.c +++ b/numpy/core/src/multiarray/scalarapi.c @@ -11,7 +11,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "ctors.h" #include "descriptor.h" diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index 276937be1..e547071cc 100644 --- 
a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -13,7 +13,7 @@ #include "numpy/halffloat.h" #include "numpy/arrayscalars.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "npy_config.h" #include "mapping.h" diff --git a/numpy/core/src/multiarray/sequence.c b/numpy/core/src/multiarray/sequence.c index 5a43f0565..e5f74251e 100644 --- a/numpy/core/src/multiarray/sequence.c +++ b/numpy/core/src/multiarray/sequence.c @@ -9,7 +9,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "common.h" #include "mapping.h" diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c index 3ed46eac4..067232632 100644 --- a/numpy/core/src/multiarray/shape.c +++ b/numpy/core/src/multiarray/shape.c @@ -11,7 +11,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "ctors.h" diff --git a/numpy/core/src/multiarray/ucsnarrow.c b/numpy/core/src/multiarray/ucsnarrow.c index 406b776bc..b0afdc6d9 100644 --- a/numpy/core/src/multiarray/ucsnarrow.c +++ b/numpy/core/src/multiarray/ucsnarrow.c @@ -12,7 +12,7 @@ #include "npy_config.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" /* * Functions only needed on narrow builds of Python for converting back and diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c index 92c8a1243..f69abcc6b 100644 --- a/numpy/core/src/multiarray/usertypes.c +++ b/numpy/core/src/multiarray/usertypes.c @@ -34,7 +34,7 @@ maintainer email: oliphant.travis@ieee.org #include "common.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "usertypes.h" diff --git a/numpy/core/src/npysort/heapsort.c.src b/numpy/core/src/npysort/heapsort.c.src new file mode 100644 index 000000000..6a8c79f7a --- /dev/null +++ b/numpy/core/src/npysort/heapsort.c.src @@ -0,0 +1,341 @@ +/* -*- c -*- */ + +/* + * The purpose of this module is to add faster sort functions + * that are type-specific. This is done by altering the + * function table for the builtin descriptors. + * + * These sorting functions are copied almost directly from numarray + * with a few modifications (complex comparisons compare the imaginary + * part if the real parts are equal, for example), and the names + * are changed. + * + * The original sorting code is due to Charles R. Harris who wrote + * it for numarray. + */ + +/* + * Quick sort is usually the fastest, but the worst case scenario can + * be slower than the merge and heap sorts. The merge sort requires + * extra memory and so for large arrays may not be useful. + * + * The merge sort is *stable*, meaning that equal components + * are unmoved from their entry versions, so it can be used to + * implement lexigraphic sorting on multiple keys. + * + * The heap sort is included for completeness. 
+ */ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include <stdlib.h> +#include "npy_sort.h" +#include "npysort_common.h" + +#define NOT_USED NPY_UNUSED(unused) +#define PYA_QS_STACK 100 +#define SMALL_QUICKSORT 15 +#define SMALL_MERGESORT 20 +#define SMALL_STRING 16 + + +/* + ***************************************************************************** + ** NUMERIC SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG, + * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE# + * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong, + * longlong, ulonglong, half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, + * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong, + * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat, + * npy_cdouble, npy_clongdouble# + */ + +int +heapsort_@suff@(@type@ *start, npy_intp n, void *NOT_USED) +{ + @type@ tmp, *a; + npy_intp i,j,l; + + /* The array needs to be offset by one for heapsort indexing */ + a = start - 1; + + for (l = n>>1; l > 0; --l) { + tmp = a[l]; + for (i = l, j = l<<1; j <= n;) { + if (j < n && @TYPE@_LT(a[j], a[j+1])) { + j += 1; + } + if (@TYPE@_LT(tmp, a[j])) { + a[i] = a[j]; + i = j; + j += j; + } + else { + break; + } + } + a[i] = tmp; + } + + for (; n > 1;) { + tmp = a[n]; + a[n] = a[1]; + n -= 1; + for (i = 1, j = 2; j <= n;) { + if (j < n && @TYPE@_LT(a[j], a[j+1])) { + j++; + } + if (@TYPE@_LT(tmp, a[j])) { + a[i] = a[j]; + i = j; + j += j; + } + else { + break; + } + } + a[i] = tmp; + } + + return 0; +} + + +int +aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, void *NOT_USED) +{ + npy_intp *a, i,j,l, tmp; + /* The arrays need to be offset by one for heapsort indexing */ + a = tosort - 1; + + for (l = n>>1; l > 0; --l) { + tmp = a[l]; + for (i = l, j = l<<1; j <= n;) { + if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) { + j += 1; + } + if (@TYPE@_LT(v[tmp], v[a[j]])) { + a[i] = a[j]; + i = j; + j += j; + } + else { + break; + } + } + a[i] = tmp; + } + + for (; n > 1;) { + tmp = a[n]; + a[n] = a[1]; + n -= 1; + for (i = 1, j = 2; j <= n;) { + if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) { + j++; + } + if (@TYPE@_LT(v[tmp], v[a[j]])) { + a[i] = a[j]; + i = j; + j += j; + } + else { + break; + } + } + a[i] = tmp; + } + + return 0; +} + +/**end repeat**/ + + +/* + ***************************************************************************** + ** STRING SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #TYPE = STRING, UNICODE# + * #suff = string, unicode# + * #type = npy_char, npy_ucs4# + */ + +int +heapsort_@suff@(@type@ *start, npy_intp n, PyArrayObject *arr) +{ + size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); + @type@ *tmp = malloc(PyArray_ITEMSIZE(arr)); + @type@ *a = start - len; + npy_intp i, j, l; + + for (l = n>>1; l > 0; --l) { + @TYPE@_COPY(tmp, a + l*len, len); + for (i = l, j = l<<1; j <= n;) { + if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len)) + j += 1; + if (@TYPE@_LT(tmp, a + j*len, len)) { + @TYPE@_COPY(a + i*len, a + j*len, len); + i = j; + j += j; + } + else { + break; + } + } + @TYPE@_COPY(a + i*len, tmp, len); + } + + for (; n > 1;) { + @TYPE@_COPY(tmp, a + n*len, len); + @TYPE@_COPY(a + n*len, a + len, len); + n -= 1; + for (i = 1, j = 2; j <= 
n;) { + if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len)) + j++; + if (@TYPE@_LT(tmp, a + j*len, len)) { + @TYPE@_COPY(a + i*len, a + j*len, len); + i = j; + j += j; + } + else { + break; + } + } + @TYPE@_COPY(a + i*len, tmp, len); + } + + free(tmp); + return 0; +} + + +int +aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, PyArrayObject *arr) +{ + size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); + npy_intp *a, i,j,l, tmp; + + /* The array needs to be offset by one for heapsort indexing */ + a = tosort - 1; + + for (l = n>>1; l > 0; --l) { + tmp = a[l]; + for (i = l, j = l<<1; j <= n;) { + if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len)) + j += 1; + if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) { + a[i] = a[j]; + i = j; + j += j; + } + else { + break; + } + } + a[i] = tmp; + } + + for (; n > 1;) { + tmp = a[n]; + a[n] = a[1]; + n -= 1; + for (i = 1, j = 2; j <= n;) { + if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len)) + j++; + if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) { + a[i] = a[j]; + i = j; + j += j; + } + else { + break; + } + } + a[i] = tmp; + } + + return 0; +} + +/**end repeat**/ + + +/* + ***************************************************************************** + ** GENERIC SORT ** + ***************************************************************************** + */ + + +/* + * This sort has almost the same signature as libc qsort and is intended to + * provide a heapsort for array types that don't have type specific sorts. + * The difference in the signature is an error return, as it might be the + * case that a memory allocation fails. + */ +int +npy_heapsort(void *base, size_t num, size_t size, npy_comparator cmp) +{ + char *tmp = (char *) malloc(size); + char *a = (char *) base - size; + size_t i, j, l; + + if (tmp == NULL) { + return -NPY_ENOMEM; + } + + for (l = num >> 1; l > 0; --l) { + GENERIC_COPY(tmp, a + l*size, size); + for (i = l, j = l << 1; j <= num;) { + if (j < num && GENERIC_LT(a + j*size, a + (j+1)*size, cmp)) + j += 1; + if (GENERIC_LT(tmp, a + j*size, cmp)) { + GENERIC_COPY(a + i*size, a + j*size, size); + i = j; + j += j; + } + else { + break; + } + } + GENERIC_COPY(a + i*size, tmp, size); + } + + for (; num > 1;) { + GENERIC_COPY(tmp, a + num*size, size); + GENERIC_COPY(a + num*size, a + size, size); + num -= 1; + for (i = 1, j = 2; j <= num;) { + if (j < num && GENERIC_LT(a + j*size, a + (j+1)*size, cmp)) + j++; + if (GENERIC_LT(tmp, a + j*size, cmp)) { + GENERIC_COPY(a + i*size, a + j*size, size); + i = j; + j += j; + } + else { + break; + } + } + GENERIC_COPY(a + i*size, tmp, size); + } + + free(tmp); + return 0; +} diff --git a/numpy/core/src/npysort/mergesort.c.src b/numpy/core/src/npysort/mergesort.c.src new file mode 100644 index 000000000..1003b95d1 --- /dev/null +++ b/numpy/core/src/npysort/mergesort.c.src @@ -0,0 +1,433 @@ +/* -*- c -*- */ + +/* + * The purpose of this module is to add faster sort functions + * that are type-specific. This is done by altering the + * function table for the builtin descriptors. + * + * These sorting functions are copied almost directly from numarray + * with a few modifications (complex comparisons compare the imaginary + * part if the real parts are equal, for example), and the names + * are changed. + * + * The original sorting code is due to Charles R. Harris who wrote + * it for numarray. + */ + +/* + * Quick sort is usually the fastest, but the worst case scenario can + * be slower than the merge and heap sorts. 
The merge sort requires + * extra memory and so for large arrays may not be useful. + * + * The merge sort is *stable*, meaning that equal components + * are unmoved from their entry versions, so it can be used to + * implement lexigraphic sorting on multiple keys. + * + * The heap sort is included for completeness. + */ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include <stdlib.h> +#include "npy_sort.h" +#include "npysort_common.h" + +#define NOT_USED NPY_UNUSED(unused) +#define PYA_QS_STACK 100 +#define SMALL_QUICKSORT 15 +#define SMALL_MERGESORT 20 +#define SMALL_STRING 16 + + +/* + ***************************************************************************** + ** NUMERIC SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG, + * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE# + * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong, + * longlong, ulonglong, half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, + * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong, + * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat, + * npy_cdouble, npy_clongdouble# + */ + +static void +mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw) +{ + @type@ vp, *pi, *pj, *pk, *pm; + + if (pr - pl > SMALL_MERGESORT) { + /* merge sort */ + pm = pl + ((pr - pl) >> 1); + mergesort0_@suff@(pl, pm, pw); + mergesort0_@suff@(pm, pr, pw); + for (pi = pw, pj = pl; pj < pm;) { + *pi++ = *pj++; + } + pi = pw + (pm - pl); + pj = pw; + pk = pl; + while (pj < pi && pm < pr) { + if (@TYPE@_LT(*pm, *pj)) { + *pk++ = *pm++; + } + else { + *pk++ = *pj++; + } + } + while(pj < pi) { + *pk++ = *pj++; + } + } + else { + /* insertion sort */ + for (pi = pl + 1; pi < pr; ++pi) { + vp = *pi; + pj = pi; + pk = pi - 1; + while (pj > pl && @TYPE@_LT(vp, *pk)) { + *pj-- = *pk--; + } + *pj = vp; + } + } +} + + +int +mergesort_@suff@(@type@ *start, npy_intp num, void *NOT_USED) +{ + @type@ *pl, *pr, *pw; + + pl = start; + pr = pl + num; + pw = (@type@ *) malloc((num/2) * sizeof(@type@)); + if (pw == NULL) { + PyErr_NoMemory(); + return -NPY_ENOMEM; + } + mergesort0_@suff@(pl, pr, pw); + + free(pw); + return 0; +} + + +static void +amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw) +{ + @type@ vp; + npy_intp vi, *pi, *pj, *pk, *pm; + + if (pr - pl > SMALL_MERGESORT) { + /* merge sort */ + pm = pl + ((pr - pl) >> 1); + amergesort0_@suff@(pl, pm, v, pw); + amergesort0_@suff@(pm, pr, v, pw); + for (pi = pw, pj = pl; pj < pm;) { + *pi++ = *pj++; + } + pi = pw + (pm - pl); + pj = pw; + pk = pl; + while (pj < pi && pm < pr) { + if (@TYPE@_LT(v[*pm], v[*pj])) { + *pk++ = *pm++; + } + else { + *pk++ = *pj++; + } + } + while(pj < pi) { + *pk++ = *pj++; + } + } + else { + /* insertion sort */ + for (pi = pl + 1; pi < pr; ++pi) { + vi = *pi; + vp = v[vi]; + pj = pi; + pk = pi - 1; + while (pj > pl && @TYPE@_LT(vp, v[*pk])) { + *pj-- = *pk--; + } + *pj = vi; + } + } +} + + +int +amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, void *NOT_USED) +{ + npy_intp *pl, *pr, *pw; + + pl = tosort; + pr = pl + num; + pw = (npy_intp *) malloc((num/2) * sizeof(npy_intp)); + if (pw == NULL) { + PyErr_NoMemory(); + return -NPY_ENOMEM; + } + amergesort0_@suff@(pl, pr, v, pw); + free(pw); + + return 0; +} + +/**end repeat**/ + + +/* + 
***************************************************************************** + ** STRING SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #TYPE = STRING, UNICODE# + * #suff = string, unicode# + * #type = npy_char, npy_ucs4# + */ + +static void +mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len) +{ + @type@ *pi, *pj, *pk, *pm; + + if ((size_t)(pr - pl) > SMALL_MERGESORT*len) { + /* merge sort */ + pm = pl + (((pr - pl)/len) >> 1)*len; + mergesort0_@suff@(pl, pm, pw, vp, len); + mergesort0_@suff@(pm, pr, pw, vp, len); + @TYPE@_COPY(pw, pl, pm - pl); + pi = pw + (pm - pl); + pj = pw; + pk = pl; + while (pj < pi && pm < pr) { + if (@TYPE@_LT(pm, pj, len)) { + @TYPE@_COPY(pk, pm, len); + pm += len; + pk += len; + } + else { + @TYPE@_COPY(pk, pj, len); + pj += len; + pk += len; + } + } + @TYPE@_COPY(pk, pj, pi - pj); + } + else { + /* insertion sort */ + for (pi = pl + len; pi < pr; pi += len) { + @TYPE@_COPY(vp, pi, len); + pj = pi; + pk = pi - len; + while (pj > pl && @TYPE@_LT(vp, pk, len)) { + @TYPE@_COPY(pj, pk, len); + pj -= len; + pk -= len; + } + @TYPE@_COPY(pj, vp, len); + } + } +} + + +int +mergesort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr) +{ + size_t elsize = PyArray_ITEMSIZE(arr); + size_t len = elsize / sizeof(@type@); + @type@ *pl, *pr, *pw, *vp; + int err = 0; + + pl = start; + pr = pl + num*len; + pw = (@type@ *) malloc((num/2) * elsize); + if (pw == NULL) { + PyErr_NoMemory(); + err = -NPY_ENOMEM; + goto fail_0; + } + vp = (@type@ *) malloc(elsize); + if (vp == NULL) { + PyErr_NoMemory(); + err = -NPY_ENOMEM; + goto fail_1; + } + mergesort0_@suff@(pl, pr, pw, vp, len); + + free(vp); +fail_1: + free(pw); +fail_0: + return err; +} + + +static void +amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, size_t len) +{ + @type@ *vp; + npy_intp vi, *pi, *pj, *pk, *pm; + + if (pr - pl > SMALL_MERGESORT) { + /* merge sort */ + pm = pl + ((pr - pl) >> 1); + amergesort0_@suff@(pl, pm, v, pw, len); + amergesort0_@suff@(pm, pr, v, pw, len); + for (pi = pw, pj = pl; pj < pm;) { + *pi++ = *pj++; + } + pi = pw + (pm - pl); + pj = pw; + pk = pl; + while (pj < pi && pm < pr) { + if (@TYPE@_LT(v + (*pm)*len, v + (*pj)*len, len)) { + *pk++ = *pm++; + } + else { + *pk++ = *pj++; + } + } + while (pj < pi) { + *pk++ = *pj++; + } + } + else { + /* insertion sort */ + for (pi = pl + 1; pi < pr; ++pi) { + vi = *pi; + vp = v + vi*len; + pj = pi; + pk = pi - 1; + while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) { + *pj-- = *pk--; + } + *pj = vi; + } + } +} + + +int +amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, PyArrayObject *arr) +{ + size_t elsize = PyArray_ITEMSIZE(arr); + size_t len = elsize / sizeof(@type@); + npy_intp *pl, *pr, *pw; + + pl = tosort; + pr = pl + num; + pw = (npy_intp *) malloc((num/2) * sizeof(npy_intp)); + if (pw == NULL) { + PyErr_NoMemory(); + return -NPY_ENOMEM; + } + amergesort0_@suff@(pl, pr, v, pw, len); + free(pw); + + return 0; +} + +/**end repeat**/ + + +/* + ***************************************************************************** + ** GENERIC SORT ** + ***************************************************************************** + */ + + +static void +npy_mergesort0(char *pl, char *pr, char *pw, char *vp, size_t size, npy_comparator cmp) +{ + char *pi, *pj, *pk, *pm; + + if ((size_t)(pr - pl) > SMALL_MERGESORT*size) { + /* merge sort */ + pm = pl + (((pr - pl)/size) >> 1)*size; + npy_mergesort0(pl, pm, pw, vp, size, 
cmp); + npy_mergesort0(pm, pr, pw, vp, size, cmp); + GENERIC_COPY(pw, pl, pm - pl); + pi = pw + (pm - pl); + pj = pw; + pk = pl; + while (pj < pi && pm < pr) { + if (GENERIC_LT(pm, pj, cmp)) { + GENERIC_COPY(pk, pm, size); + pm += size; + pk += size; + } + else { + GENERIC_COPY(pk, pj, size); + pj += size; + pk += size; + } + } + GENERIC_COPY(pk, pj, pi - pj); + } + else { + /* insertion sort */ + for (pi = pl + size; pi < pr; pi += size) { + GENERIC_COPY(vp, pi, size); + pj = pi; + pk = pi - size; + while (pj > pl && GENERIC_LT(vp, pk, cmp)) { + GENERIC_COPY(pj, pk, size); + pj -= size; + pk -= size; + } + GENERIC_COPY(pj, vp, size); + } + } +} + + +/* + * This sort has almost the same signature as libc qsort and is intended to + * provide a mergesort for array types that don't have type specific sorts. + * The difference in the signature is an error return, as it might be the + * case that a memory allocation fails. + */ +int +npy_mergesort(void *base, size_t num, size_t size, npy_comparator cmp) +{ + char *pl, *pr, *pw, *vp; + int err = 0; + + pl = (char *)base; + pr = pl + num*size; + pw = (char *) malloc((num/2) * size); + if (pw == NULL) { + err = -NPY_ENOMEM; + goto fail_0; + } + vp = (char *) malloc(size); + if (vp == NULL) { + err = -NPY_ENOMEM; + goto fail_1; + } + npy_mergesort0(pl, pr, pw, vp, size, cmp); + + free(vp); +fail_1: + free(pw); +fail_0: + return err; +} diff --git a/numpy/core/src/npysort/npysort_common.h b/numpy/core/src/npysort/npysort_common.h index bc1d278b0..318a80222 100644 --- a/numpy/core/src/npysort/npysort_common.h +++ b/numpy/core/src/npysort/npysort_common.h @@ -1,15 +1,9 @@ #ifndef __NPY_SORT_COMMON_H__ #define __NPY_SORT_COMMON_H__ -/* -#include "Python.h" -#include "numpy/npy_common.h" -#include "numpy/npy_math.h" -#include "npy_config.h" -*/ +#include <stdlib.h> #include <numpy/ndarraytypes.h> - /* ***************************************************************************** ** SWAP MACROS ** @@ -155,14 +149,17 @@ npy_half_lt_nonan(npy_half h1, npy_half h2) if (h1&0x8000u) { if (h2&0x8000u) { return (h1&0x7fffu) > (h2&0x7fffu); - } else { + } + else { /* Signed zeros are equal, have to check for it */ return (h1 != 0x8000u) || (h2 != 0x0000u); } - } else { + } + else { if (h2&0x8000u) { return 0; - } else { + } + else { return (h1&0x7fffu) < (h2&0x7fffu); } } @@ -176,7 +173,8 @@ HALF_LT(npy_half a, npy_half b) if (npy_half_isnan(b)) { ret = !npy_half_isnan(a); - } else { + } + else { ret = !npy_half_isnan(a) && npy_half_lt_nonan(a, b); } @@ -254,14 +252,6 @@ CLONGDOUBLE_LT(npy_clongdouble a, npy_clongdouble b) } -/* The PyObject functions are stubs for later use */ -NPY_INLINE static int -PyObject_LT(PyObject *pa, PyObject *pb) -{ - return 0; -} - - NPY_INLINE static void STRING_COPY(char *s1, char *s2, size_t len) { @@ -333,4 +323,29 @@ UNICODE_LT(npy_ucs4 *s1, npy_ucs4 *s2, size_t len) return ret; } + +NPY_INLINE static void +GENERIC_COPY(char *a, char *b, size_t len) +{ + memcpy(a, b, len); +} + + +NPY_INLINE static void +GENERIC_SWAP(char *a, char *b, size_t len) +{ + while(len--) { + const char t = *a; + *a++ = *b; + *b++ = t; + } +} + + +NPY_INLINE static int +GENERIC_LT(char *a, char *b, int (*cmp)(const void *, const void *)) +{ + return cmp(a, b) < 0; +} + #endif diff --git a/numpy/core/src/npysort/quicksort.c.src b/numpy/core/src/npysort/quicksort.c.src new file mode 100644 index 000000000..2225887b1 --- /dev/null +++ b/numpy/core/src/npysort/quicksort.c.src @@ -0,0 +1,363 @@ +/* -*- c -*- */ + +/* + * The purpose of this module is to add 
faster sort functions + * that are type-specific. This is done by altering the + * function table for the builtin descriptors. + * + * These sorting functions are copied almost directly from numarray + * with a few modifications (complex comparisons compare the imaginary + * part if the real parts are equal, for example), and the names + * are changed. + * + * The original sorting code is due to Charles R. Harris who wrote + * it for numarray. + */ + +/* + * Quick sort is usually the fastest, but the worst case scenario can + * be slower than the merge and heap sorts. The merge sort requires + * extra memory and so for large arrays may not be useful. + * + * The merge sort is *stable*, meaning that equal components + * are unmoved from their entry versions, so it can be used to + * implement lexigraphic sorting on multiple keys. + * + * The heap sort is included for completeness. + */ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include <stdlib.h> +#include "npy_sort.h" +#include "npysort_common.h" + +#define NOT_USED NPY_UNUSED(unused) +#define PYA_QS_STACK 100 +#define SMALL_QUICKSORT 15 +#define SMALL_MERGESORT 20 +#define SMALL_STRING 16 + + +/* + ***************************************************************************** + ** NUMERIC SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG, + * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE# + * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong, + * longlong, ulonglong, half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, + * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong, + * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat, + * npy_cdouble, npy_clongdouble# + */ + +int +quicksort_@suff@(@type@ *start, npy_intp num, void *NOT_USED) +{ + @type@ vp; + @type@ *pl = start; + @type@ *pr = start + num - 1; + @type@ *stack[PYA_QS_STACK]; + @type@ **sptr = stack; + @type@ *pm, *pi, *pj, *pk; + + for (;;) { + while ((pr - pl) > SMALL_QUICKSORT) { + /* quicksort partition */ + pm = pl + ((pr - pl) >> 1); + if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl); + if (@TYPE@_LT(*pr, *pm)) @TYPE@_SWAP(*pr, *pm); + if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl); + vp = *pm; + pi = pl; + pj = pr - 1; + @TYPE@_SWAP(*pm, *pj); + for (;;) { + do ++pi; while (@TYPE@_LT(*pi, vp)); + do --pj; while (@TYPE@_LT(vp, *pj)); + if (pi >= pj) { + break; + } + @TYPE@_SWAP(*pi,*pj); + } + pk = pr - 1; + @TYPE@_SWAP(*pi, *pk); + /* push largest partition on stack */ + if (pi - pl < pr - pi) { + *sptr++ = pi + 1; + *sptr++ = pr; + pr = pi - 1; + } + else { + *sptr++ = pl; + *sptr++ = pi - 1; + pl = pi + 1; + } + } + + /* insertion sort */ + for (pi = pl + 1; pi <= pr; ++pi) { + vp = *pi; + pj = pi; + pk = pi - 1; + while (pj > pl && @TYPE@_LT(vp, *pk)) { + *pj-- = *pk--; + } + *pj = vp; + } + if (sptr == stack) { + break; + } + pr = *(--sptr); + pl = *(--sptr); + } + + return 0; +} + + +int +aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, void *NOT_USED) +{ + @type@ vp; + npy_intp *pl = tosort; + npy_intp *pr = tosort + num - 1; + npy_intp *stack[PYA_QS_STACK]; + npy_intp **sptr = stack; + npy_intp *pm, *pi, *pj, *pk, vi; + + for (;;) { + while ((pr - pl) > SMALL_QUICKSORT) { + /* quicksort partition */ + pm = pl + ((pr - pl) >> 1); + if 
(@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl); + if (@TYPE@_LT(v[*pr],v[*pm])) INTP_SWAP(*pr, *pm); + if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl); + vp = v[*pm]; + pi = pl; + pj = pr - 1; + INTP_SWAP(*pm, *pj); + for (;;) { + do ++pi; while (@TYPE@_LT(v[*pi], vp)); + do --pj; while (@TYPE@_LT(vp, v[*pj])); + if (pi >= pj) { + break; + } + INTP_SWAP(*pi, *pj); + } + pk = pr - 1; + INTP_SWAP(*pi,*pk); + /* push largest partition on stack */ + if (pi - pl < pr - pi) { + *sptr++ = pi + 1; + *sptr++ = pr; + pr = pi - 1; + } + else { + *sptr++ = pl; + *sptr++ = pi - 1; + pl = pi + 1; + } + } + + /* insertion sort */ + for (pi = pl + 1; pi <= pr; ++pi) { + vi = *pi; + vp = v[vi]; + pj = pi; + pk = pi - 1; + while (pj > pl && @TYPE@_LT(vp, v[*pk])) { + *pj-- = *pk--; + } + *pj = vi; + } + if (sptr == stack) { + break; + } + pr = *(--sptr); + pl = *(--sptr); + } + + return 0; +} + +/**end repeat**/ + + +/* + ***************************************************************************** + ** STRING SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #TYPE = STRING, UNICODE# + * #suff = string, unicode# + * #type = npy_char, npy_ucs4# + */ + +int +quicksort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr) +{ + const size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); + @type@ *vp = malloc(PyArray_ITEMSIZE(arr)); + @type@ *pl = start; + @type@ *pr = start + (num - 1)*len; + @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk; + + for (;;) { + while ((size_t)(pr - pl) > SMALL_QUICKSORT*len) { + /* quicksort partition */ + pm = pl + (((pr - pl)/len) >> 1)*len; + if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len); + if (@TYPE@_LT(pr, pm, len)) @TYPE@_SWAP(pr, pm, len); + if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len); + @TYPE@_COPY(vp, pm, len); + pi = pl; + pj = pr - len; + @TYPE@_SWAP(pm, pj, len); + for (;;) { + do pi += len; while (@TYPE@_LT(pi, vp, len)); + do pj -= len; while (@TYPE@_LT(vp, pj, len)); + if (pi >= pj) { + break; + } + @TYPE@_SWAP(pi, pj, len); + } + pk = pr - len; + @TYPE@_SWAP(pi, pk, len); + /* push largest partition on stack */ + if (pi - pl < pr - pi) { + *sptr++ = pi + len; + *sptr++ = pr; + pr = pi - len; + } + else { + *sptr++ = pl; + *sptr++ = pi - len; + pl = pi + len; + } + } + + /* insertion sort */ + for (pi = pl + len; pi <= pr; pi += len) { + @TYPE@_COPY(vp, pi, len); + pj = pi; + pk = pi - len; + while (pj > pl && @TYPE@_LT(vp, pk, len)) { + @TYPE@_COPY(pj, pk, len); + pj -= len; + pk -= len; + } + @TYPE@_COPY(pj, vp, len); + } + if (sptr == stack) { + break; + } + pr = *(--sptr); + pl = *(--sptr); + } + + free(vp); + return 0; +} + + +int +aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, PyArrayObject *arr) +{ + size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); + @type@ *vp; + npy_intp *pl = tosort; + npy_intp *pr = tosort + num - 1; + npy_intp *stack[PYA_QS_STACK]; + npy_intp **sptr=stack; + npy_intp *pm, *pi, *pj, *pk, vi; + + for (;;) { + while ((pr - pl) > SMALL_QUICKSORT) { + /* quicksort partition */ + pm = pl + ((pr - pl) >> 1); + if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl); + if (@TYPE@_LT(v + (*pr)*len, v + (*pm)*len, len)) INTP_SWAP(*pr, *pm); + if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl); + vp = v + (*pm)*len; + pi = pl; + pj = pr - 1; + INTP_SWAP(*pm,*pj); + for (;;) { + do ++pi; while (@TYPE@_LT(v + (*pi)*len, vp, len)); + do --pj; while (@TYPE@_LT(vp, v + (*pj)*len, len)); + if (pi >= pj) 
{ + break; + } + INTP_SWAP(*pi,*pj); + } + pk = pr - 1; + INTP_SWAP(*pi,*pk); + /* push largest partition on stack */ + if (pi - pl < pr - pi) { + *sptr++ = pi + 1; + *sptr++ = pr; + pr = pi - 1; + } + else { + *sptr++ = pl; + *sptr++ = pi - 1; + pl = pi + 1; + } + } + + /* insertion sort */ + for (pi = pl + 1; pi <= pr; ++pi) { + vi = *pi; + vp = v + vi*len; + pj = pi; + pk = pi - 1; + while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) { + *pj-- = *pk--; + } + *pj = vi; + } + if (sptr == stack) { + break; + } + pr = *(--sptr); + pl = *(--sptr); + } + + return 0; +} + +/**end repeat**/ + + +/* + ***************************************************************************** + ** GENERIC SORT ** + ***************************************************************************** + */ + + +/* + * This sort has almost the same signature as libc qsort and is intended to + * supply an error return for compatibility with the other generic sort + * kinds. + */ +int +npy_quicksort(void *base, size_t num, size_t size, npy_comparator cmp) +{ + qsort(base, num, size, cmp); + return 0; +} diff --git a/numpy/core/src/npysort/sort.c.src b/numpy/core/src/npysort/sort.c.src deleted file mode 100644 index 618259f96..000000000 --- a/numpy/core/src/npysort/sort.c.src +++ /dev/null @@ -1,794 +0,0 @@ -/* -*- c -*- */ - -/* - * The purpose of this module is to add faster sort functions - * that are type-specific. This is done by altering the - * function table for the builtin descriptors. - * - * These sorting functions are copied almost directly from numarray - * with a few modifications (complex comparisons compare the imaginary - * part if the real parts are equal, for example), and the names - * are changed. - * - * The original sorting code is due to Charles R. Harris who wrote - * it for numarray. - */ - -/* - * Quick sort is usually the fastest, but the worst case scenario can - * be slower than the merge and heap sorts. The merge sort requires - * extra memory and so for large arrays may not be useful. - * - * The merge sort is *stable*, meaning that equal components - * are unmoved from their entry versions, so it can be used to - * implement lexigraphic sorting on multiple keys. - * - * The heap sort is included for completeness. 
- */ - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include <stdlib.h> -#include "npy_sort.h" -#include "npysort_common.h" - -#define NOT_USED NPY_UNUSED(unused) -#define PYA_QS_STACK 100 -#define SMALL_QUICKSORT 15 -#define SMALL_MERGESORT 20 -#define SMALL_STRING 16 - - -/* - ***************************************************************************** - ** NUMERIC SORTS ** - ***************************************************************************** - */ - - -/**begin repeat - * - * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG, - * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE# - * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong, - * longlong, ulonglong, half, float, double, longdouble, - * cfloat, cdouble, clongdouble# - * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, - * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat, - * npy_cdouble, npy_clongdouble# - */ - -int -quicksort_@suff@(@type@ *start, npy_intp num, void *NOT_USED) -{ - @type@ *pl = start; - @type@ *pr = start + num - 1; - @type@ vp; - @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk; - - for (;;) { - while ((pr - pl) > SMALL_QUICKSORT) { - /* quicksort partition */ - pm = pl + ((pr - pl) >> 1); - if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl); - if (@TYPE@_LT(*pr, *pm)) @TYPE@_SWAP(*pr, *pm); - if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl); - vp = *pm; - pi = pl; - pj = pr - 1; - @TYPE@_SWAP(*pm, *pj); - for (;;) { - do ++pi; while (@TYPE@_LT(*pi, vp)); - do --pj; while (@TYPE@_LT(vp, *pj)); - if (pi >= pj) { - break; - } - @TYPE@_SWAP(*pi,*pj); - } - pk = pr - 1; - @TYPE@_SWAP(*pi, *pk); - /* push largest partition on stack */ - if (pi - pl < pr - pi) { - *sptr++ = pi + 1; - *sptr++ = pr; - pr = pi - 1; - } - else { - *sptr++ = pl; - *sptr++ = pi - 1; - pl = pi + 1; - } - } - - /* insertion sort */ - for (pi = pl + 1; pi <= pr; ++pi) { - vp = *pi; - pj = pi; - pk = pi - 1; - while (pj > pl && @TYPE@_LT(vp, *pk)) { - *pj-- = *pk--; - } - *pj = vp; - } - if (sptr == stack) { - break; - } - pr = *(--sptr); - pl = *(--sptr); - } - - return 0; -} - -int -aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, void *NOT_USED) -{ - @type@ vp; - npy_intp *pl, *pr; - npy_intp *stack[PYA_QS_STACK], **sptr=stack, *pm, *pi, *pj, *pk, vi; - - pl = tosort; - pr = tosort + num - 1; - - for (;;) { - while ((pr - pl) > SMALL_QUICKSORT) { - /* quicksort partition */ - pm = pl + ((pr - pl) >> 1); - if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm,*pl); - if (@TYPE@_LT(v[*pr],v[*pm])) INTP_SWAP(*pr,*pm); - if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm,*pl); - vp = v[*pm]; - pi = pl; - pj = pr - 1; - INTP_SWAP(*pm,*pj); - for (;;) { - do ++pi; while (@TYPE@_LT(v[*pi],vp)); - do --pj; while (@TYPE@_LT(vp,v[*pj])); - if (pi >= pj) { - break; - } - INTP_SWAP(*pi,*pj); - } - pk = pr - 1; - INTP_SWAP(*pi,*pk); - /* push largest partition on stack */ - if (pi - pl < pr - pi) { - *sptr++ = pi + 1; - *sptr++ = pr; - pr = pi - 1; - } - else { - *sptr++ = pl; - *sptr++ = pi - 1; - pl = pi + 1; - } - } - - /* insertion sort */ - for (pi = pl + 1; pi <= pr; ++pi) { - vi = *pi; - vp = v[vi]; - pj = pi; - pk = pi - 1; - while (pj > pl && @TYPE@_LT(vp, v[*pk])) { - *pj-- = *pk--; - } - *pj = vi; - } - if (sptr == stack) { - break; - } - pr = *(--sptr); - pl = *(--sptr); - } - - return 0; -} - - -int -heapsort_@suff@(@type@ *start, npy_intp n, void *NOT_USED) 
-{ - @type@ tmp, *a; - npy_intp i,j,l; - - /* The array needs to be offset by one for heapsort indexing */ - a = start - 1; - - for (l = n>>1; l > 0; --l) { - tmp = a[l]; - for (i = l, j = l<<1; j <= n;) { - if (j < n && @TYPE@_LT(a[j], a[j+1])) { - j += 1; - } - if (@TYPE@_LT(tmp, a[j])) { - a[i] = a[j]; - i = j; - j += j; - } - else { - break; - } - } - a[i] = tmp; - } - - for (; n > 1;) { - tmp = a[n]; - a[n] = a[1]; - n -= 1; - for (i = 1, j = 2; j <= n;) { - if (j < n && @TYPE@_LT(a[j], a[j+1])) { - j++; - } - if (@TYPE@_LT(tmp, a[j])) { - a[i] = a[j]; - i = j; - j += j; - } - else { - break; - } - } - a[i] = tmp; - } - - return 0; -} - -int -aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, void *NOT_USED) -{ - npy_intp *a, i,j,l, tmp; - /* The arrays need to be offset by one for heapsort indexing */ - a = tosort - 1; - - for (l = n>>1; l > 0; --l) { - tmp = a[l]; - for (i = l, j = l<<1; j <= n;) { - if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) { - j += 1; - } - if (@TYPE@_LT(v[tmp], v[a[j]])) { - a[i] = a[j]; - i = j; - j += j; - } - else { - break; - } - } - a[i] = tmp; - } - - for (; n > 1;) { - tmp = a[n]; - a[n] = a[1]; - n -= 1; - for (i = 1, j = 2; j <= n;) { - if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) { - j++; - } - if (@TYPE@_LT(v[tmp], v[a[j]])) { - a[i] = a[j]; - i = j; - j += j; - } - else { - break; - } - } - a[i] = tmp; - } - - return 0; -} - -static void -mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw) -{ - @type@ vp, *pi, *pj, *pk, *pm; - - if (pr - pl > SMALL_MERGESORT) { - /* merge sort */ - pm = pl + ((pr - pl) >> 1); - mergesort0_@suff@(pl, pm, pw); - mergesort0_@suff@(pm, pr, pw); - for (pi = pw, pj = pl; pj < pm;) { - *pi++ = *pj++; - } - pj = pw; - pk = pl; - while (pj < pi && pm < pr) { - if (@TYPE@_LT(*pm, *pj)) { - *pk = *pm++; - } - else { - *pk = *pj++; - } - pk++; - } - while(pj < pi) { - *pk++ = *pj++; - } - } - else { - /* insertion sort */ - for (pi = pl + 1; pi < pr; ++pi) { - vp = *pi; - pj = pi; - pk = pi -1; - while (pj > pl && @TYPE@_LT(vp, *pk)) { - *pj-- = *pk--; - } - *pj = vp; - } - } -} - -int -mergesort_@suff@(@type@ *start, npy_intp num, void *NOT_USED) -{ - @type@ *pl, *pr, *pw; - - pl = start; - pr = pl + num; - pw = (@type@ *) PyDataMem_NEW((num/2)*sizeof(@type@)); - if (!pw) { - PyErr_NoMemory(); - return -1; - } - mergesort0_@suff@(pl, pr, pw); - - PyDataMem_FREE(pw); - return 0; -} - -static void -amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw) -{ - @type@ vp; - npy_intp vi, *pi, *pj, *pk, *pm; - - if (pr - pl > SMALL_MERGESORT) { - /* merge sort */ - pm = pl + ((pr - pl + 1)>>1); - amergesort0_@suff@(pl, pm-1, v, pw); - amergesort0_@suff@(pm, pr, v, pw); - for (pi = pw, pj = pl; pj < pm; ++pi, ++pj) { - *pi = *pj; - } - for (pk = pw, pm = pl; pk < pi && pj <= pr; ++pm) { - if (@TYPE@_LT(v[*pj], v[*pk])) { - *pm = *pj; - ++pj; - } - else { - *pm = *pk; - ++pk; - } - } - for (; pk < pi; ++pm, ++pk) { - *pm = *pk; - } - } - else { - /* insertion sort */ - for (pi = pl + 1; pi <= pr; ++pi) { - vi = *pi; - vp = v[vi]; - for (pj = pi, pk = pi - 1; pj > pl && @TYPE@_LT(vp, v[*pk]); --pj, --pk) { - *pj = *pk; - } - *pj = vi; - } - } -} - -int -amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, void *NOT_USED) -{ - npy_intp *pl, *pr, *pw; - - pl = tosort; pr = pl + num - 1; - pw = PyDimMem_NEW((1+num/2)); - - if (!pw) { - PyErr_NoMemory(); - return -1; - } - - amergesort0_@suff@(pl, pr, v, pw); - PyDimMem_FREE(pw); - - return 0; -} - - -/**end repeat**/ - -/* - 
***************************************************************************** - ** STRING SORTS ** - ***************************************************************************** - */ - - -/**begin repeat - * - * #TYPE = STRING, UNICODE# - * #suff = string, unicode# - * #type = npy_char, npy_ucs4# - */ - -static void -mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len) -{ - @type@ *pi, *pj, *pk, *pm; - - if ((size_t)(pr - pl) > SMALL_MERGESORT*len) { - /* merge sort */ - pm = pl + (((pr - pl)/len) >> 1)*len; - mergesort0_@suff@(pl, pm, pw, vp, len); - mergesort0_@suff@(pm, pr, pw, vp, len); - @TYPE@_COPY(pw, pl, pm - pl); - pi = pw + (pm - pl); - pj = pw; - pk = pl; - while (pj < pi && pm < pr) { - if (@TYPE@_LT(pm, pj, len)) { - @TYPE@_COPY(pk, pm, len); - pm += len; - } - else { - @TYPE@_COPY(pk, pj, len); - pj += len; - } - pk += len; - } - @TYPE@_COPY(pk, pj, pi - pj); - } - else { - /* insertion sort */ - for (pi = pl + len; pi < pr; pi += len) { - @TYPE@_COPY(vp, pi, len); - pj = pi; - pk = pi - len; - while (pj > pl && @TYPE@_LT(vp, pk, len)) { - @TYPE@_COPY(pj, pk, len); - pj -= len; - pk -= len; - } - @TYPE@_COPY(pj, vp, len); - } - } -} - -int -mergesort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr) -{ - const size_t elsize = PyArray_ITEMSIZE(arr); - const size_t len = elsize / sizeof(@type@); - @type@ *pl, *pr, *pw, *vp; - int err = 0; - - pl = start; - pr = pl + num*len; - pw = (@type@ *) PyDataMem_NEW((num/2)*elsize); - if (!pw) { - PyErr_NoMemory(); - err = -1; - goto fail_0; - } - vp = (@type@ *) PyDataMem_NEW(elsize); - if (!vp) { - PyErr_NoMemory(); - err = -1; - goto fail_1; - } - mergesort0_@suff@(pl, pr, pw, vp, len); - - PyDataMem_FREE(vp); -fail_1: - PyDataMem_FREE(pw); -fail_0: - return err; -} - -int -quicksort_@suff@(@type@ *start, npy_intp num, PyArrayObject *arr) -{ - const size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); - @type@ *vp = malloc(PyArray_ITEMSIZE(arr)); - @type@ *pl = start; - @type@ *pr = start + (num - 1)*len; - @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk; - - for (;;) { - while ((size_t)(pr - pl) > SMALL_QUICKSORT*len) { - /* quicksort partition */ - pm = pl + (((pr - pl)/len) >> 1)*len; - if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len); - if (@TYPE@_LT(pr, pm, len)) @TYPE@_SWAP(pr, pm, len); - if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len); - @TYPE@_COPY(vp, pm, len); - pi = pl; - pj = pr - len; - @TYPE@_SWAP(pm, pj, len); - for (;;) { - do pi += len; while (@TYPE@_LT(pi, vp, len)); - do pj -= len; while (@TYPE@_LT(vp, pj, len)); - if (pi >= pj) { - break; - } - @TYPE@_SWAP(pi, pj, len); - } - pk = pr - len; - @TYPE@_SWAP(pi, pk, len); - /* push largest partition on stack */ - if (pi - pl < pr - pi) { - *sptr++ = pi + len; - *sptr++ = pr; - pr = pi - len; - } - else { - *sptr++ = pl; - *sptr++ = pi - len; - pl = pi + len; - } - } - - /* insertion sort */ - for (pi = pl + len; pi <= pr; pi += len) { - @TYPE@_COPY(vp, pi, len); - pj = pi; - pk = pi - len; - while (pj > pl && @TYPE@_LT(vp, pk, len)) { - @TYPE@_COPY(pj, pk, len); - pj -= len; - pk -= len; - } - @TYPE@_COPY(pj, vp, len); - } - if (sptr == stack) { - break; - } - pr = *(--sptr); - pl = *(--sptr); - } - - free(vp); - return 0; -} - - -int -heapsort_@suff@(@type@ *start, npy_intp n, PyArrayObject *arr) -{ - size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); - @type@ *tmp = malloc(PyArray_ITEMSIZE(arr)); - @type@ *a = start - len; - npy_intp i,j,l; - - for (l = n>>1; l > 0; --l) { - @TYPE@_COPY(tmp, a + l*len, len); - 
for (i = l, j = l<<1; j <= n;) { - if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len)) - j += 1; - if (@TYPE@_LT(tmp, a + j*len, len)) { - @TYPE@_COPY(a + i*len, a + j*len, len); - i = j; - j += j; - } - else { - break; - } - } - @TYPE@_COPY(a + i*len, tmp, len); - } - - for (; n > 1;) { - @TYPE@_COPY(tmp, a + n*len, len); - @TYPE@_COPY(a + n*len, a + len, len); - n -= 1; - for (i = 1, j = 2; j <= n;) { - if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len)) - j++; - if (@TYPE@_LT(tmp, a + j*len, len)) { - @TYPE@_COPY(a + i*len, a + j*len, len); - i = j; - j += j; - } - else { - break; - } - } - @TYPE@_COPY(a + i*len, tmp, len); - } - - free(tmp); - return 0; -} - - -int -aheapsort_@suff@(@type@ *v, npy_intp *tosort, npy_intp n, PyArrayObject *arr) -{ - size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); - npy_intp *a, i,j,l, tmp; - - /* The array needs to be offset by one for heapsort indexing */ - a = tosort - 1; - - for (l = n>>1; l > 0; --l) { - tmp = a[l]; - for (i = l, j = l<<1; j <= n;) { - if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len)) - j += 1; - if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) { - a[i] = a[j]; - i = j; - j += j; - } - else { - break; - } - } - a[i] = tmp; - } - - for (; n > 1;) { - tmp = a[n]; - a[n] = a[1]; - n -= 1; - for (i = 1, j = 2; j <= n;) { - if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len)) - j++; - if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) { - a[i] = a[j]; - i = j; - j += j; - } - else { - break; - } - } - a[i] = tmp; - } - - return 0; -} - - -int -aquicksort_@suff@(@type@ *v, npy_intp* tosort, npy_intp num, PyArrayObject *arr) -{ - size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@); - @type@ *vp; - npy_intp *pl = tosort; - npy_intp *pr = tosort + num - 1; - npy_intp *stack[PYA_QS_STACK]; - npy_intp **sptr=stack; - npy_intp *pm, *pi, *pj, *pk, vi; - - for (;;) { - while ((pr - pl) > SMALL_QUICKSORT) { - /* quicksort partition */ - pm = pl + ((pr - pl) >> 1); - if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl); - if (@TYPE@_LT(v + (*pr)*len, v + (*pm)*len, len)) INTP_SWAP(*pr, *pm); - if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl); - vp = v + (*pm)*len; - pi = pl; - pj = pr - 1; - INTP_SWAP(*pm,*pj); - for (;;) { - do ++pi; while (@TYPE@_LT(v + (*pi)*len, vp, len)); - do --pj; while (@TYPE@_LT(vp, v + (*pj)*len, len)); - if (pi >= pj) { - break; - } - INTP_SWAP(*pi,*pj); - } - pk = pr - 1; - INTP_SWAP(*pi,*pk); - /* push largest partition on stack */ - if (pi - pl < pr - pi) { - *sptr++ = pi + 1; - *sptr++ = pr; - pr = pi - 1; - } - else { - *sptr++ = pl; - *sptr++ = pi - 1; - pl = pi + 1; - } - } - - /* insertion sort */ - for (pi = pl + 1; pi <= pr; ++pi) { - vi = *pi; - vp = v + vi*len; - pj = pi; - pk = pi - 1; - while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) { - *pj-- = *pk--; - } - *pj = vi; - } - if (sptr == stack) { - break; - } - pr = *(--sptr); - pl = *(--sptr); - } - - return 0; -} - - -static void -amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, int len) -{ - @type@ *vp; - npy_intp vi, *pi, *pj, *pk, *pm; - - if (pr - pl > SMALL_MERGESORT) { - /* merge sort */ - pm = pl + ((pr - pl) >> 1); - amergesort0_@suff@(pl, pm, v, pw, len); - amergesort0_@suff@(pm, pr, v, pw, len); - for (pi = pw, pj = pl; pj < pm;) { - *pi++ = *pj++; - } - pj = pw; - pk = pl; - while (pj < pi && pm < pr) { - if (@TYPE@_LT(v + (*pm)*len, v + (*pj)*len, len)) { - *pk = *pm++; - } else { - *pk = *pj++; - } - pk++; - } - while (pj < pi) { - *pk++ = *pj++; - } - } else { - 
/* insertion sort */ - for (pi = pl + 1; pi < pr; ++pi) { - vi = *pi; - vp = v + vi*len; - pj = pi; - pk = pi -1; - while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) { - *pj-- = *pk--; - } - *pj = vi; - } - } -} - - -int -amergesort_@suff@(@type@ *v, npy_intp *tosort, npy_intp num, PyArrayObject *arr) -{ - const size_t elsize = PyArray_ITEMSIZE(arr); - const size_t len = elsize / sizeof(@type@); - npy_intp *pl, *pr, *pw; - - pl = tosort; - pr = pl + num; - pw = PyDimMem_NEW(num/2); - if (!pw) { - PyErr_NoMemory(); - return -1; - } - amergesort0_@suff@(pl, pr, v, pw, len); - - PyDimMem_FREE(pw); - return 0; -} -/**end repeat**/ diff --git a/numpy/core/src/private/npy_pycompat.h b/numpy/core/src/private/npy_pycompat.h new file mode 100644 index 000000000..6b49c7476 --- /dev/null +++ b/numpy/core/src/private/npy_pycompat.h @@ -0,0 +1,24 @@ +#ifndef _NPY_PYCOMPAT_H_ +#define _NPY_PYCOMPAT_H_ + +#include "numpy/npy_3kcompat.h" + +/* + * Accessing items of ob_base + */ + +#if (PY_VERSION_HEX < 0x02060000) +#define Py_TYPE(o) (((PyObject*)(o))->ob_type) +#define Py_REFCNT(o) (((PyObject*)(o))->ob_refcnt) +#define Py_SIZE(o) (((PyVarObject*)(o))->ob_size) +#endif + +/* + * PyIndex_Check + */ +#if (PY_VERSION_HEX < 0x02050000) +#undef PyIndex_Check +#define PyIndex_Check(o) 0 +#endif + +#endif /* _NPY_COMPAT_H_ */ diff --git a/numpy/core/src/private/npy_sort.h b/numpy/core/src/private/npy_sort.h index 94d831f86..120e7efc2 100644 --- a/numpy/core/src/private/npy_sort.h +++ b/numpy/core/src/private/npy_sort.h @@ -6,6 +6,10 @@ #include <numpy/npy_common.h> #include <numpy/ndarraytypes.h> +#define NPY_ENOMEM 1 +#define NPY_ECOMP 2 + +typedef int (*npy_comparator)(const void *, const void *); int quicksort_bool(npy_bool *vec, npy_intp cnt, void *null); int heapsort_bool(npy_bool *vec, npy_intp cnt, void *null); @@ -166,4 +170,9 @@ int aquicksort_unicode(npy_ucs4 *vec, npy_intp *ind, npy_intp cnt, PyArrayObject int aheapsort_unicode(npy_ucs4 *vec, npy_intp *ind, npy_intp cnt, PyArrayObject *arr); int amergesort_unicode(npy_ucs4 *vec, npy_intp *ind, npy_intp cnt, PyArrayObject *arr); + +int npy_quicksort(void *base, size_t num, size_t size, npy_comparator cmp); +int npy_heapsort(void *base, size_t num, size_t size, npy_comparator cmp); +int npy_mergesort(void *base, size_t num, size_t size, npy_comparator cmp); + #endif diff --git a/numpy/core/src/scalarmathmodule.c.src b/numpy/core/src/scalarmathmodule.c.src index d9f7abc4e..4b56c1510 100644 --- a/numpy/core/src/scalarmathmodule.c.src +++ b/numpy/core/src/scalarmathmodule.c.src @@ -13,7 +13,7 @@ #include "numpy/ufuncobject.h" #include "numpy/arrayscalars.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/halffloat.h" diff --git a/numpy/core/src/umath/funcs.inc.src b/numpy/core/src/umath/funcs.inc.src index f3fefcfc5..6d46efc44 100644 --- a/numpy/core/src/umath/funcs.inc.src +++ b/numpy/core/src/umath/funcs.inc.src @@ -7,7 +7,7 @@ */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" /* diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 7eede45d6..040486a32 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -16,7 +16,7 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "ufunc_object.h" diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 3e0306bd2..2712f0474 100644 --- 
a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -34,7 +34,7 @@ #define NO_IMPORT_ARRAY #endif -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c index da1b2e811..372edc07a 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -19,7 +19,7 @@ #define NO_IMPORT_ARRAY #endif -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "numpy/ufuncobject.h" #include "ufunc_type_resolution.h" diff --git a/numpy/core/src/umath/umath_tests.c.src b/numpy/core/src/umath/umath_tests.c.src index 43583d119..0ba13f15f 100644 --- a/numpy/core/src/umath/umath_tests.c.src +++ b/numpy/core/src/umath/umath_tests.c.src @@ -11,7 +11,7 @@ #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" -#include "numpy/npy_3kcompat.h" +#include "npy_pycompat.h" #include "npy_config.h" diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 82ead8958..3427800a2 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -10,7 +10,8 @@ from numpy.testing.utils import WarningManager from numpy.compat import asbytes, getexception, strchar from test_print import in_foreign_locale from numpy.core.multiarray_tests import ( - test_neighborhood_iterator, test_neighborhood_iterator_oob + test_neighborhood_iterator, test_neighborhood_iterator_oob, + test_pydatamem_seteventhook_start, test_pydatamem_seteventhook_end, ) from numpy.testing import ( TestCase, run_module_suite, assert_, assert_raises, @@ -490,7 +491,7 @@ class TestMethods(TestCase): # of sorted items must be greater than ~50 to check the actual # algorithm because quick and merge sort fall over to insertion # sort for small arrays. - a = np.arange(100) + a = np.arange(101) b = a[::-1].copy() for kind in ['q','m','h'] : msg = "scalar sort, kind=%s" % kind @@ -526,7 +527,7 @@ class TestMethods(TestCase): # test string sorts. s = 'aaaaaaaa' - a = np.array([s + chr(i) for i in range(100)]) + a = np.array([s + chr(i) for i in range(101)]) b = a[::-1].copy() for kind in ['q', 'm', 'h'] : msg = "string sort, kind=%s" % kind @@ -537,9 +538,9 @@ class TestMethods(TestCase): c.sort(kind=kind) assert_equal(c, a, msg) - # test unicode sort. + # test unicode sorts. s = 'aaaaaaaa' - a = np.array([s + chr(i) for i in range(100)], dtype=np.unicode) + a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode) b = a[::-1].copy() for kind in ['q', 'm', 'h'] : msg = "unicode sort, kind=%s" % kind @@ -550,7 +551,55 @@ class TestMethods(TestCase): c.sort(kind=kind) assert_equal(c, a, msg) - # todo, check object array sorts. + # test object array sorts. + a = np.empty((101,), dtype=np.object) + a[:] = range(101) + b = a[::-1] + for kind in ['q', 'h', 'm'] : + msg = "object sort, kind=%s" % kind + c = a.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) + c = b.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) + + # test record array sorts. + dt = np.dtype([('f',float),('i',int)]) + a = array([(i,i) for i in range(101)], dtype = dt) + b = a[::-1] + for kind in ['q', 'h', 'm'] : + msg = "object sort, kind=%s" % kind + c = a.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) + c = b.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) + + # test datetime64 sorts. 
+ a = np.arange(0, 101, dtype='datetime64[D]') + b = a[::-1] + for kind in ['q', 'h', 'm'] : + msg = "datetime64 sort, kind=%s" % kind + c = a.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) + c = b.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) + + # test timedelta64 sorts. + a = np.arange(0, 101, dtype='timedelta64[D]') + b = a[::-1] + for kind in ['q', 'h', 'm'] : + msg = "timedelta64 sort, kind=%s" % kind + c = a.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) + c = b.copy(); + c.sort(kind=kind) + assert_equal(c, a, msg) # check axis handling. This should be the same for all type # specific sorts, so we only check it for one type and one kind @@ -566,10 +615,6 @@ class TestMethods(TestCase): d = a.copy() d.sort() assert_equal(d, c, "test sort with default axis") - # using None is known fail at this point - # d = a.copy() - # d.sort(axis=None) - #assert_equal(d, c, "test sort with axis=None") def test_sort_order(self): @@ -612,7 +657,7 @@ class TestMethods(TestCase): # of sorted items must be greater than ~50 to check the actual # algorithm because quick and merge sort fall over to insertion # sort for small arrays. - a = np.arange(100) + a = np.arange(101) b = a[::-1].copy() for kind in ['q','m','h'] : msg = "scalar argsort, kind=%s" % kind @@ -636,10 +681,10 @@ class TestMethods(TestCase): # test string argsorts. s = 'aaaaaaaa' - a = np.array([s + chr(i) for i in range(100)]) + a = np.array([s + chr(i) for i in range(101)]) b = a[::-1].copy() - r = arange(100) - rr = r[::-1].copy() + r = np.arange(101) + rr = r[::-1] for kind in ['q', 'm', 'h'] : msg = "string argsort, kind=%s" % kind assert_equal(a.copy().argsort(kind=kind), r, msg) @@ -647,16 +692,57 @@ class TestMethods(TestCase): # test unicode argsorts. s = 'aaaaaaaa' - a = np.array([s + chr(i) for i in range(100)], dtype=np.unicode) - b = a[::-1].copy() - r = arange(100) - rr = r[::-1].copy() + a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode) + b = a[::-1] + r = np.arange(101) + rr = r[::-1] for kind in ['q', 'm', 'h'] : msg = "unicode argsort, kind=%s" % kind assert_equal(a.copy().argsort(kind=kind), r, msg) assert_equal(b.copy().argsort(kind=kind), rr, msg) - # todo, check object array argsorts. + # test object array argsorts. + a = np.empty((101,), dtype=np.object) + a[:] = range(101) + b = a[::-1] + r = np.arange(101) + rr = r[::-1] + for kind in ['q', 'm', 'h'] : + msg = "object argsort, kind=%s" % kind + assert_equal(a.copy().argsort(kind=kind), r, msg) + assert_equal(b.copy().argsort(kind=kind), rr, msg) + + # test structured array argsorts. + dt = np.dtype([('f',float),('i',int)]) + a = array([(i,i) for i in range(101)], dtype = dt) + b = a[::-1] + r = np.arange(101) + rr = r[::-1] + for kind in ['q', 'm', 'h'] : + msg = "structured array argsort, kind=%s" % kind + assert_equal(a.copy().argsort(kind=kind), r, msg) + assert_equal(b.copy().argsort(kind=kind), rr, msg) + + # test datetime64 argsorts. + a = np.arange(0, 101, dtype='datetime64[D]') + b = a[::-1] + r = np.arange(101) + rr = r[::-1] + for kind in ['q', 'h', 'm'] : + msg = "datetime64 argsort, kind=%s" % kind + assert_equal(a.copy().argsort(kind=kind), r, msg) + assert_equal(b.copy().argsort(kind=kind), rr, msg) + + # test timedelta64 argsorts. 
+ a = np.arange(0, 101, dtype='timedelta64[D]') + b = a[::-1] + r = np.arange(101) + rr = r[::-1] + for kind in ['q', 'h', 'm'] : + msg = "timedelta64 argsort, kind=%s" % kind + assert_equal(a.copy().argsort(kind=kind), r, msg) + assert_equal(b.copy().argsort(kind=kind), rr, msg) + # check axis handling. This should be the same for all type # specific argsorts, so we only check it for one type and one kind @@ -827,19 +913,19 @@ class TestMethods(TestCase): # All the different functions raise a warning, but not an error, and # 'a' is not modified: assert_equal(collect_warning_types(a.diagonal().__setitem__, 0, 10), - [DeprecationWarning]) + [FutureWarning]) assert_equal(a, np.arange(9).reshape(3, 3)) assert_equal(collect_warning_types(np.diagonal(a).__setitem__, 0, 10), - [DeprecationWarning]) + [FutureWarning]) assert_equal(a, np.arange(9).reshape(3, 3)) assert_equal(collect_warning_types(np.diag(a).__setitem__, 0, 10), - [DeprecationWarning]) + [FutureWarning]) assert_equal(a, np.arange(9).reshape(3, 3)) # Views also warn d = np.diagonal(a) d_view = d.view() assert_equal(collect_warning_types(d_view.__setitem__, 0, 10), - [DeprecationWarning]) + [FutureWarning]) # But the write goes through: assert_equal(d[0], 10) # Only one warning per call to diagonal, though (even if there are @@ -861,21 +947,21 @@ class TestMethods(TestCase): buf_or_memoryview[0] = "x" assert_equal(collect_warning_types(get_data_and_write, lambda d: d.data), - [DeprecationWarning]) + [FutureWarning]) if hasattr(np, "getbuffer"): assert_equal(collect_warning_types(get_data_and_write, np.getbuffer), - [DeprecationWarning]) + [FutureWarning]) # PEP 3118: if have_memoryview: assert_equal(collect_warning_types(get_data_and_write, memoryview), - [DeprecationWarning]) + [FutureWarning]) # Void dtypes can give us a read-write buffer, but only in Python 2: import sys if sys.version_info[0] < 3: aV = np.empty((3, 3), dtype="V10") assert_equal(collect_warning_types(aV.diagonal().item, 0), - [DeprecationWarning]) + [FutureWarning]) # XX it seems that direct indexing of a void object returns a void # scalar, which ignores not just WARN_ON_WRITE but even WRITEABLE. # i.e. 
in this: @@ -887,7 +973,7 @@ class TestMethods(TestCase): # __array_interface also lets a data pointer get away from us log = collect_warning_types(getattr, a.diagonal(), "__array_interface__") - assert_equal(log, [DeprecationWarning]) + assert_equal(log, [FutureWarning]) # ctypeslib goes via __array_interface__: try: # may not exist in python 2.4: @@ -896,10 +982,10 @@ class TestMethods(TestCase): pass else: log = collect_warning_types(np.ctypeslib.as_ctypes, a.diagonal()) - assert_equal(log, [DeprecationWarning]) + assert_equal(log, [FutureWarning]) # __array_struct__ log = collect_warning_types(getattr, a.diagonal(), "__array_struct__") - assert_equal(log, [DeprecationWarning]) + assert_equal(log, [FutureWarning]) # Make sure that our recommendation to silence the warning by copying # the array actually works: @@ -932,7 +1018,7 @@ class TestMethods(TestCase): ro_diag.flags.writeable = False assert_equal(collect_warning_types(getattr, ro_diag, "__array_struct__"), []) - + def test_ravel(self): a = np.array([[0,1],[2,3]]) @@ -1092,7 +1178,7 @@ class TestFancyIndexing(TestCase): x = xorig.copy() x[m3] = 10 assert_array_equal(x, array([[1,10,3,4],[5,6,7,8]])) - + class TestStringCompare(TestCase): def test_string(self): @@ -1700,7 +1786,7 @@ class TestFlat(TestCase): self.b = a[::2,::2] self.a0 = a0 self.b0 = a0[::2,::2] - + def test_contiguous(self): testpassed = False try: @@ -1734,7 +1820,7 @@ class TestFlat(TestCase): assert d.flags.updateifcopy is False assert e.flags.updateifcopy is False assert f.flags.updateifcopy is True - assert f.base is self.b0 + assert f.base is self.b0 class TestResize(TestCase): def test_basic(self): @@ -2649,6 +2735,16 @@ def test_flat_element_deletion(): except: raise AssertionError +class TestMemEventHook(TestCase): + def test_mem_seteventhook(self): + # The actual tests are within the C code in + # multiarray/multiarray_tests.c.src + test_pydatamem_seteventhook_start() + # force an allocation and free of a numpy array + a = np.zeros(10) + del a + test_pydatamem_seteventhook_end() + if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_multiarray_assignment.py b/numpy/core/tests/test_multiarray_assignment.py new file mode 100644 index 000000000..29ddcf906 --- /dev/null +++ b/numpy/core/tests/test_multiarray_assignment.py @@ -0,0 +1,78 @@ +import numpy as np +from numpy.testing import TestCase + +ndims = 2 +size = 10 +shape = tuple([size] * ndims) + + +def _indices_for_nelems(nelems): + """Returns slices of length nelems, from start onwards, in direction sign.""" + + if nelems == 0: + return [size // 2] # int index + + res = [] + for step in (1, 2): + for sign in (-1, 1): + start = size // 2 - nelems * step * sign // 2 + stop = start + nelems * step * sign + res.append(slice(start, stop, step * sign)) + + return res + + +def _indices_for_axis(): + """Returns (src, dst) pairs of indices.""" + + res = [] + for nelems in (0, 2, 3): + ind = _indices_for_nelems(nelems) + + # no itertools.product available in Py2.4 + res.extend([(a, b) for a in ind for b in ind]) # all assignments of size "nelems" + + return res + + +def _indices(ndims): + """Returns ((axis0_src, axis0_dst), (axis1_src, axis1_dst), ... 
) index pairs.""" + + ind = _indices_for_axis() + + # no itertools.product available in Py2.4 + + res = [[]] + for i in xrange(ndims): + newres = [] + for elem in ind: + for others in res: + newres.append([elem] + others) + res = newres + + return res + + +def _check_assignment(srcidx, dstidx): + """Check assignment arr[dstidx] = arr[srcidx] works.""" + + arr = np.arange(np.product(shape)).reshape(shape) + + cpy = arr.copy() + + cpy[dstidx] = arr[srcidx] + arr[dstidx] = arr[srcidx] + + assert np.all(arr == cpy), 'assigning arr[%s] = arr[%s]' % (dstidx, srcidx) + + +def test_overlapping_assignments(): + """Test automatically generated assignments which overlap in memory.""" + + inds = _indices(ndims) + + for ind in inds: + srcidx = tuple([a[0] for a in ind]) + dstidx = tuple([a[1] for a in ind]) + + yield _check_assignment, srcidx, dstidx diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 6aed19ca6..892b1fae6 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -1,10 +1,17 @@ import sys +import platform from numpy.testing import * import numpy.core.umath as ncu import numpy as np +def on_powerpc(): + """ True if we are running on a Power PC platform.""" + return platform.processor() == 'powerpc' or \ + platform.machine().startswith('ppc') + + class _FilterInvalids(object): def setUp(self): self.olderr = np.seterr(invalid='ignore') @@ -1118,7 +1125,8 @@ def test_nextafter(): def test_nextafterf(): return _test_nextafter(np.float32) -@dec.knownfailureif(sys.platform == 'win32', "Long double support buggy on win32") +@dec.knownfailureif(sys.platform == 'win32' or on_powerpc(), + "Long double support buggy on win32 and PPC, ticket 1664.") def test_nextafterl(): return _test_nextafter(np.longdouble) @@ -1143,7 +1151,8 @@ def test_spacing(): def test_spacingf(): return _test_spacing(np.float32) -@dec.knownfailureif(sys.platform == 'win32', "Long double support buggy on win32") +@dec.knownfailureif(sys.platform == 'win32' or on_powerpc(), + "Long double support buggy on win32 and PPC, ticket 1664.") def test_spacingl(): return _test_spacing(np.longdouble) diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index 7921b4116..2f2a4bc57 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -203,13 +203,17 @@ class LineSplitter(object): self._handyman = _handyman # def _delimited_splitter(self, line): - line = line.split(self.comments)[0].strip(asbytes(" \r\n")) + if self.comments is not None: + line = line.split(self.comments)[0] + line = line.strip(asbytes(" \r\n")) if not line: return [] return line.split(self.delimiter) # def _fixedwidth_splitter(self, line): - line = line.split(self.comments)[0].strip(asbytes("\r\n")) + if self.comments is not None: + line = line.split(self.comments)[0] + line = line.strip(asbytes("\r\n")) if not line: return [] fixed = self.delimiter @@ -217,7 +221,8 @@ class LineSplitter(object): return [line[s] for s in slices] # def _variablewidth_splitter(self, line): - line = line.split(self.comments)[0] + if self.comments is not None: + line = line.split(self.comments)[0] if not line: return [] slices = self.delimiter diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index b1e891f77..b63003f80 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1291,7 +1291,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, """ # Py3 data conversions to bytes, for convenience - comments = asbytes(comments) + if comments is not None: + comments = asbytes(comments) if 
isinstance(delimiter, unicode): delimiter = asbytes(delimiter) if isinstance(missing, unicode): diff --git a/numpy/lib/src/_compiled_base.c b/numpy/lib/src/_compiled_base.c index c31ee5cd8..d389b7f8e 100644 --- a/numpy/lib/src/_compiled_base.c +++ b/numpy/lib/src/_compiled_base.c @@ -7,6 +7,10 @@ #include "numpy/ufuncobject.h" #include "string.h" +#if (PY_VERSION_HEX < 0x02060000) +#define Py_TYPE(o) (((PyObject*)(o))->ob_type) +#endif + static npy_intp incr_slot_(double x, double *bins, npy_intp lbins) { @@ -670,7 +674,10 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) /* only pre-calculate slopes if there are relatively few of them. */ if (lenxp <= lenx) { - slopes = (double *) PyDataMem_NEW((lenxp - 1)*sizeof(double)); + slopes = (double *) PyArray_malloc((lenxp - 1)*sizeof(double)); + if (! slopes) { + goto fail; + } NPY_BEGIN_ALLOW_THREADS; for (i = 0; i < lenxp - 1; i++) { slopes[i] = (dy[i + 1] - dy[i])/(dx[i + 1] - dx[i]); @@ -692,7 +699,7 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) } } NPY_END_ALLOW_THREADS; - PyDataMem_FREE(slopes); + PyArray_free(slopes); } else { NPY_BEGIN_ALLOW_THREADS; diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py index e40c155a4..b0d2ca7c3 100644 --- a/numpy/lib/tests/test_arraysetops.py +++ b/numpy/lib/tests/test_arraysetops.py @@ -8,31 +8,62 @@ from numpy.lib.arraysetops import * import warnings -class TestAso(TestCase): - def test_unique( self ): - a = np.array( [5, 7, 1, 2, 1, 5, 7] ) - - ec = np.array( [1, 2, 5, 7] ) - c = unique( a ) - assert_array_equal( c, ec ) - - vals, indices = unique( a, return_index=True ) +class TestSetOps(TestCase): - ed = np.array( [2, 3, 0, 1] ) - assert_array_equal(vals, ec) - assert_array_equal(indices, ed) - - vals, ind0, ind1 = unique( a, return_index=True, - return_inverse=True ) - + def test_unique( self ): - ee = np.array( [2, 3, 0, 1, 0, 2, 3] ) - assert_array_equal(vals, ec) - assert_array_equal(ind0, ed) - assert_array_equal(ind1, ee) + def check_all(a, b, i1, i2, dt): + msg = "check values failed for type '%s'" % dt + v = unique(a) + assert_array_equal(v, b, msg) + + msg = "check indexes failed for type '%s'" % dt + v, j = unique(a, 1, 0) + assert_array_equal(v, b, msg) + assert_array_equal(j, i1, msg) + + msg = "check reverse indexes failed for type '%s'" % dt + v, j = unique(a, 0, 1) + assert_array_equal(v, b, msg) + assert_array_equal(j, i2, msg) + + msg = "check with all indexes failed for type '%s'" % dt + v, j1, j2 = unique(a, 1, 1) + assert_array_equal(v, b, msg) + assert_array_equal(j1, i1, msg) + assert_array_equal(j2, i2, msg) + + a = [5, 7, 1, 2, 1, 5, 7]*10 + b = [1, 2, 5, 7] + i1 = [2, 3, 0, 1] + i2 = [2, 3, 0, 1, 0, 2, 3]*10 + + # test for numeric arrays + types = [] + types.extend(np.typecodes['AllInteger']) + types.extend(np.typecodes['AllFloat']) + types.append('datetime64[D]') + types.append('timedelta64[D]') + for dt in types: + aa = np.array(a, dt) + bb = np.array(b, dt) + check_all(aa, bb, i1, i2, dt) + + # test for object arrays + dt = 'O' + aa = np.empty(len(a), dt) + aa[:] = a + bb = np.empty(len(b), dt) + bb[:] = b + check_all(aa, bb, i1, i2, dt) + + # test for structured arrays + dt = [('', 'i'), ('', 'i')] + aa = np.array(zip(a,a), dt) + bb = np.array(zip(b,b), dt) + check_all(aa, bb, i1, i2, dt) - assert_array_equal([], unique([])) def test_intersect1d( self ): # unique inputs diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index c539c040a..f8caeedb6 100644 --- 
a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -187,6 +187,7 @@ class TestSavezLoad(RoundtripTest, TestCase): assert_(not fp.closed) finally: + fp.close() os.remove(tmp) def test_closing_fid(self): @@ -1426,8 +1427,6 @@ M 33 21.99 usecols=("A", "C", "E"), names=True) assert_equal(test.dtype.names, ctrl_names) - - def test_fixed_width_names(self): "Test fix-width w/ names" data = " A B C\n 0 1 2.3\n 45 67 9." @@ -1451,6 +1450,14 @@ M 33 21.99 test = np.ndfromtxt(StringIO(data), **kwargs) assert_equal(test, ctrl) + def test_comments_is_none(self): + # Github issue 329 (None was previously being converted to 'None'). + test = np.genfromtxt(StringIO("test1,testNonetherestofthedata"), + dtype=None, comments=None, delimiter=',') + assert_equal(test[1], asbytes('testNonetherestofthedata')) + test = np.genfromtxt(StringIO("test1, testNonetherestofthedata"), + dtype=None, comments=None, delimiter=',') + assert_equal(test[1], asbytes(' testNonetherestofthedata')) def test_recfromtxt(self): # @@ -1471,7 +1478,6 @@ M 33 21.99 assert_equal(test.mask, control.mask) assert_equal(test.A, [0, 2]) - def test_recfromcsv(self): # data = StringIO('A,B\n0,1\n2,3') diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py index f25452064..a7f1c074d 100644 --- a/numpy/linalg/linalg.py +++ b/numpy/linalg/linalg.py @@ -1420,8 +1420,8 @@ def matrix_rank(M, tol=None): """ Return matrix rank of array using SVD method - Rank of the array is the number of SVD singular values of the - array that are greater than `tol`. + Rank of the array is the number of SVD singular values of the array that are + greater than `tol`. Parameters ---------- @@ -1431,27 +1431,53 @@ def matrix_rank(M, tol=None): threshold below which SVD values are considered zero. If `tol` is None, and ``S`` is an array with singular values for `M`, and ``eps`` is the epsilon value for datatype of ``S``, then `tol` is - set to ``S.max() * eps``. + set to ``S.max() * max(M.shape) * eps``. Notes ----- - Golub and van Loan [1]_ define "numerical rank deficiency" as using - tol=eps*S[0] (where S[0] is the maximum singular value and thus the - 2-norm of the matrix). This is one definition of rank deficiency, - and the one we use here. When floating point roundoff is the main - concern, then "numerical rank deficiency" is a reasonable choice. In - some cases you may prefer other definitions. The most useful measure - of the tolerance depends on the operations you intend to use on your - matrix. For example, if your data come from uncertain measurements - with uncertainties greater than floating point epsilon, choosing a - tolerance near that uncertainty may be preferable. The tolerance - may be absolute if the uncertainties are absolute rather than - relative. + The default threshold to detect rank deficiency is a test on the magnitude + of the singular values of `M`. By default, we identify singular values less + than ``S.max() * max(M.shape) * eps`` as indicating rank deficiency (with + the symbols defined above). This is the algorithm MATLAB uses [1]. It also + appears in *Numerical recipes* in the discussion of SVD solutions for linear + least squares [2]. + + This default threshold is designed to detect rank deficiency accounting for + the numerical errors of the SVD computation. Imagine that there is a column + in `M` that is an exact (in floating point) linear combination of other + columns in `M`. 
Computing the SVD on `M` will not produce a singular value + exactly equal to 0 in general: any difference of the smallest SVD value from + 0 will be caused by numerical imprecision in the calculation of the SVD. + Our threshold for small SVD values takes this numerical imprecision into + account, and the default threshold will detect such numerical rank + deficiency. The threshold may declare a matrix `M` rank deficient even if + the linear combination of some columns of `M` is not exactly equal to + another column of `M` but only numerically very close to another column of + `M`. + + We chose our default threshold because it is in wide use. Other thresholds + are possible. For example, elsewhere in the 2007 edition of *Numerical + recipes* there is an alternative threshold of ``S.max() * + np.finfo(M.dtype).eps / 2. * np.sqrt(m + n + 1.)``. The authors describe + this threshold as being based on "expected roundoff error" (p 71). + + The thresholds above deal with floating point roundoff error in the + calculation of the SVD. However, you may have more information about the + sources of error in `M` that would make you consider other tolerance values + to detect *effective* rank deficiency. The most useful measure of the + tolerance depends on the operations you intend to use on your matrix. For + example, if your data come from uncertain measurements with uncertainties + greater than floating point epsilon, choosing a tolerance near that + uncertainty may be preferable. The tolerance may be absolute if the + uncertainties are absolute rather than relative. References ---------- - .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*. - Baltimore: Johns Hopkins University Press, 1996. + .. [1] MATLAB reference documentation, "Rank" + http://www.mathworks.com/help/techdoc/ref/rank.html + .. [2] W. H. Press, S. A. Teukolsky, W. T. Vetterling and B. P. Flannery, + "Numerical Recipes (3rd edition)", Cambridge University Press, 2007, + page 795. 
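As a rough illustration of the default threshold described above (not taken from the patch; the matrix and the variable names are invented for the example), the rank count reduces to comparing the singular values against ``S.max() * max(M.shape) * eps``:

    import numpy as np

    # Illustrative matrix: the third column is exactly the sum of the first two,
    # so the true rank is 2 even though no computed singular value is exactly 0.
    M = np.array([[1., 2., 3.],
                  [4., 5., 9.],
                  [7., 8., 15.],
                  [2., 1., 3.]])
    S = np.linalg.svd(M, compute_uv=False)
    tol = S.max() * max(M.shape) * np.finfo(S.dtype).eps  # default threshold
    rank = int((S > tol).sum())  # 2, matching np.linalg.matrix_rank(M)
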
Examples -------- @@ -1465,7 +1491,6 @@ def matrix_rank(M, tol=None): 1 >>> matrix_rank(np.zeros((4,))) 0 - """ M = asarray(M) if M.ndim > 2: @@ -1474,7 +1499,7 @@ def matrix_rank(M, tol=None): return int(not all(M==0)) S = svd(M, compute_uv=False) if tol is None: - tol = S.max() * finfo(S.dtype).eps + tol = S.max() * max(M.shape) * finfo(S.dtype).eps return sum(S > tol) diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py index 2a2ec3da5..623939863 100644 --- a/numpy/linalg/tests/test_linalg.py +++ b/numpy/linalg/tests/test_linalg.py @@ -3,7 +3,9 @@ import sys import numpy as np -from numpy.testing import * +from numpy.testing import (TestCase, assert_, assert_equal, assert_raises, + assert_array_equal, assert_almost_equal, + run_module_suite) from numpy import array, single, double, csingle, cdouble, dot, identity from numpy import multiply, atleast_2d, inf, asarray, matrix from numpy import linalg @@ -451,6 +453,19 @@ class TestMatrixRank(object): yield assert_equal, matrix_rank(1), 1 +def test_reduced_rank(): + # Test matrices with reduced rank + rng = np.random.RandomState(20120714) + for i in range(100): + # Make a rank deficient matrix + X = rng.normal(size=(40, 10)) + X[:, 0] = X[:, 1] + X[:, 2] + # Assert that matrix_rank detected deficiency + assert_equal(matrix_rank(X), 9) + X[:, 3] = X[:, 4] + X[:, 5] + assert_equal(matrix_rank(X), 8) + + class TestQR(TestCase): def test_qr_empty(self): a = np.zeros((0,2)) diff --git a/numpy/testing/decorators.py b/numpy/testing/decorators.py index a7cc9a847..053b99211 100644 --- a/numpy/testing/decorators.py +++ b/numpy/testing/decorators.py @@ -210,7 +210,7 @@ def knownfailureif(fail_condition, msg=None): from noseclasses import KnownFailureTest def knownfailer(*args, **kwargs): if fail_val(): - raise KnownFailureTest, msg + raise KnownFailureTest(msg) else: return f(*args, **kwargs) return nose.tools.make_decorator(f)(knownfailer) diff --git a/tools/allocation_tracking/alloc_hook.pyx b/tools/allocation_tracking/alloc_hook.pyx new file mode 100644 index 000000000..d1e656f90 --- /dev/null +++ b/tools/allocation_tracking/alloc_hook.pyx @@ -0,0 +1,42 @@ +# A cython wrapper for using python functions as callbacks for +# PyDataMem_SetEventHook. 
+ +cimport numpy as np + +cdef extern from "Python.h": + object PyLong_FromVoidPtr(void *) + void *PyLong_AsVoidPtr(object) + +ctypedef void PyDataMem_EventHookFunc(void *inp, void *outp, size_t size, + void *user_data) +cdef extern from "numpy/arrayobject.h": + PyDataMem_EventHookFunc * \ + PyDataMem_SetEventHook(PyDataMem_EventHookFunc *newhook, + void *user_data, void **old_data) + +np.import_array() + +cdef void pyhook(void *old, void *new, size_t size, void *user_data): + cdef object pyfunc = <object> user_data + pyfunc(PyLong_FromVoidPtr(old), + PyLong_FromVoidPtr(new), + size) + +class NumpyAllocHook(object): + def __init__(self, callback): + self.callback = callback + + def __enter__(self): + cdef void *old_hook, *old_data + old_hook = <void *> \ + PyDataMem_SetEventHook(<PyDataMem_EventHookFunc *> pyhook, + <void *> self.callback, + <void **> &old_data) + self.old_hook = PyLong_FromVoidPtr(old_hook) + self.old_data = PyLong_FromVoidPtr(old_data) + + def __exit__(self): + PyDataMem_SetEventHook(<PyDataMem_EventHookFunc *> \ + PyLong_AsVoidPtr(self.old_hook), + <void *> PyLong_AsVoidPtr(self.old_data), + <void **> 0) diff --git a/tools/allocation_tracking/setup.py b/tools/allocation_tracking/setup.py new file mode 100644 index 000000000..4462f9f4e --- /dev/null +++ b/tools/allocation_tracking/setup.py @@ -0,0 +1,9 @@ +from distutils.core import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext +import numpy + +setup( + cmdclass = {'build_ext': build_ext}, + ext_modules = [Extension("alloc_hook", ["alloc_hook.pyx"], + include_dirs=[numpy.get_include()])]) diff --git a/tools/allocation_tracking/sorttable.js b/tools/allocation_tracking/sorttable.js new file mode 100644 index 000000000..25bccb2b6 --- /dev/null +++ b/tools/allocation_tracking/sorttable.js @@ -0,0 +1,493 @@ +/* + SortTable + version 2 + 7th April 2007 + Stuart Langridge, http://www.kryogenix.org/code/browser/sorttable/ + + Instructions: + Download this file + Add <script src="sorttable.js"></script> to your HTML + Add class="sortable" to any table you'd like to make sortable + Click on the headers to sort + + Thanks to many, many people for contributions and suggestions. + Licenced as X11: http://www.kryogenix.org/code/browser/licence.html + This basically means: do what you want with it. +*/ + + +var stIsIE = /*@cc_on!@*/false; + +sorttable = { + init: function() { + // quit if this function has already been called + if (arguments.callee.done) return; + // flag this function so we don't do the same thing twice + arguments.callee.done = true; + // kill the timer + if (_timer) clearInterval(_timer); + + if (!document.createElement || !document.getElementsByTagName) return; + + sorttable.DATE_RE = /^(\d\d?)[\/\.-](\d\d?)[\/\.-]((\d\d)?\d\d)$/; + + forEach(document.getElementsByTagName('table'), function(table) { + if (table.className.search(/\bsortable\b/) != -1) { + sorttable.makeSortable(table); + } + }); + + }, + + makeSortable: function(table) { + if (table.getElementsByTagName('thead').length == 0) { + // table doesn't have a tHead. Since it should have, create one and + // put the first table row in it. 
+ the = document.createElement('thead'); + the.appendChild(table.rows[0]); + table.insertBefore(the,table.firstChild); + } + // Safari doesn't support table.tHead, sigh + if (table.tHead == null) table.tHead = table.getElementsByTagName('thead')[0]; + + if (table.tHead.rows.length != 1) return; // can't cope with two header rows + + // Sorttable v1 put rows with a class of "sortbottom" at the bottom (as + // "total" rows, for example). This is B&R, since what you're supposed + // to do is put them in a tfoot. So, if there are sortbottom rows, + // for backwards compatibility, move them to tfoot (creating it if needed). + sortbottomrows = []; + for (var i=0; i<table.rows.length; i++) { + if (table.rows[i].className.search(/\bsortbottom\b/) != -1) { + sortbottomrows[sortbottomrows.length] = table.rows[i]; + } + } + if (sortbottomrows) { + if (table.tFoot == null) { + // table doesn't have a tfoot. Create one. + tfo = document.createElement('tfoot'); + table.appendChild(tfo); + } + for (var i=0; i<sortbottomrows.length; i++) { + tfo.appendChild(sortbottomrows[i]); + } + delete sortbottomrows; + } + + // work through each column and calculate its type + headrow = table.tHead.rows[0].cells; + for (var i=0; i<headrow.length; i++) { + // manually override the type with a sorttable_type attribute + if (!headrow[i].className.match(/\bsorttable_nosort\b/)) { // skip this col + mtch = headrow[i].className.match(/\bsorttable_([a-z0-9]+)\b/); + if (mtch) { override = mtch[1]; } + if (mtch && typeof sorttable["sort_"+override] == 'function') { + headrow[i].sorttable_sortfunction = sorttable["sort_"+override]; + } else { + headrow[i].sorttable_sortfunction = sorttable.guessType(table,i); + } + // make it clickable to sort + headrow[i].sorttable_columnindex = i; + headrow[i].sorttable_tbody = table.tBodies[0]; + dean_addEvent(headrow[i],"click", function(e) { + + if (this.className.search(/\bsorttable_sorted\b/) != -1) { + // if we're already sorted by this column, just + // reverse the table, which is quicker + sorttable.reverse(this.sorttable_tbody); + this.className = this.className.replace('sorttable_sorted', + 'sorttable_sorted_reverse'); + this.removeChild(document.getElementById('sorttable_sortfwdind')); + sortrevind = document.createElement('span'); + sortrevind.id = "sorttable_sortrevind"; + sortrevind.innerHTML = stIsIE ? ' <font face="webdings">5</font>' : ' ▴'; + this.appendChild(sortrevind); + return; + } + if (this.className.search(/\bsorttable_sorted_reverse\b/) != -1) { + // if we're already sorted by this column in reverse, just + // re-reverse the table, which is quicker + sorttable.reverse(this.sorttable_tbody); + this.className = this.className.replace('sorttable_sorted_reverse', + 'sorttable_sorted'); + this.removeChild(document.getElementById('sorttable_sortrevind')); + sortfwdind = document.createElement('span'); + sortfwdind.id = "sorttable_sortfwdind"; + sortfwdind.innerHTML = stIsIE ? 
' <font face="webdings">6</font>' : ' ▾'; + this.appendChild(sortfwdind); + return; + } + + // remove sorttable_sorted classes + theadrow = this.parentNode; + forEach(theadrow.childNodes, function(cell) { + if (cell.nodeType == 1) { // an element + cell.className = cell.className.replace('sorttable_sorted_reverse',''); + cell.className = cell.className.replace('sorttable_sorted',''); + } + }); + sortfwdind = document.getElementById('sorttable_sortfwdind'); + if (sortfwdind) { sortfwdind.parentNode.removeChild(sortfwdind); } + sortrevind = document.getElementById('sorttable_sortrevind'); + if (sortrevind) { sortrevind.parentNode.removeChild(sortrevind); } + + this.className += ' sorttable_sorted'; + sortfwdind = document.createElement('span'); + sortfwdind.id = "sorttable_sortfwdind"; + sortfwdind.innerHTML = stIsIE ? ' <font face="webdings">6</font>' : ' ▾'; + this.appendChild(sortfwdind); + + // build an array to sort. This is a Schwartzian transform thing, + // i.e., we "decorate" each row with the actual sort key, + // sort based on the sort keys, and then put the rows back in order + // which is a lot faster because you only do getInnerText once per row + row_array = []; + col = this.sorttable_columnindex; + rows = this.sorttable_tbody.rows; + for (var j=0; j<rows.length; j++) { + row_array[row_array.length] = [sorttable.getInnerText(rows[j].cells[col]), rows[j]]; + } + /* If you want a stable sort, uncomment the following line */ + //sorttable.shaker_sort(row_array, this.sorttable_sortfunction); + /* and comment out this one */ + row_array.sort(this.sorttable_sortfunction); + + tb = this.sorttable_tbody; + for (var j=0; j<row_array.length; j++) { + tb.appendChild(row_array[j][1]); + } + + delete row_array; + }); + } + } + }, + + guessType: function(table, column) { + // guess the type of a column based on its first non-blank row + sortfn = sorttable.sort_alpha; + for (var i=0; i<table.tBodies[0].rows.length; i++) { + text = sorttable.getInnerText(table.tBodies[0].rows[i].cells[column]); + if (text != '') { + if (text.match(/^-?[£$¤]?[\d,.]+%?$/)) { + return sorttable.sort_numeric; + } + // check for a date: dd/mm/yyyy or dd/mm/yy + // can have / or . or - as separator + // can be mm/dd as well + possdate = text.match(sorttable.DATE_RE) + if (possdate) { + // looks like a date + first = parseInt(possdate[1]); + second = parseInt(possdate[2]); + if (first > 12) { + // definitely dd/mm + return sorttable.sort_ddmm; + } else if (second > 12) { + return sorttable.sort_mmdd; + } else { + // looks like a date, but we can't tell which, so assume + // that it's dd/mm (English imperialism!) and keep looking + sortfn = sorttable.sort_ddmm; + } + } + } + } + return sortfn; + }, + + getInnerText: function(node) { + // gets the text we want to use for sorting for a cell. + // strips leading and trailing whitespace. + // this is *not* a generic getInnerText function; it's special to sorttable. + // for example, you can override the cell text with a customkey attribute. + // it also gets .value for <input> fields. 
+ + hasInputs = (typeof node.getElementsByTagName == 'function') && + node.getElementsByTagName('input').length; + + if (node.getAttribute("sorttable_customkey") != null) { + return node.getAttribute("sorttable_customkey"); + } + else if (typeof node.textContent != 'undefined' && !hasInputs) { + return node.textContent.replace(/^\s+|\s+$/g, ''); + } + else if (typeof node.innerText != 'undefined' && !hasInputs) { + return node.innerText.replace(/^\s+|\s+$/g, ''); + } + else if (typeof node.text != 'undefined' && !hasInputs) { + return node.text.replace(/^\s+|\s+$/g, ''); + } + else { + switch (node.nodeType) { + case 3: + if (node.nodeName.toLowerCase() == 'input') { + return node.value.replace(/^\s+|\s+$/g, ''); + } + case 4: + return node.nodeValue.replace(/^\s+|\s+$/g, ''); + break; + case 1: + case 11: + var innerText = ''; + for (var i = 0; i < node.childNodes.length; i++) { + innerText += sorttable.getInnerText(node.childNodes[i]); + } + return innerText.replace(/^\s+|\s+$/g, ''); + break; + default: + return ''; + } + } + }, + + reverse: function(tbody) { + // reverse the rows in a tbody + newrows = []; + for (var i=0; i<tbody.rows.length; i++) { + newrows[newrows.length] = tbody.rows[i]; + } + for (var i=newrows.length-1; i>=0; i--) { + tbody.appendChild(newrows[i]); + } + delete newrows; + }, + + /* sort functions + each sort function takes two parameters, a and b + you are comparing a[0] and b[0] */ + sort_numeric: function(a,b) { + aa = parseFloat(a[0].replace(/[^0-9.-]/g,'')); + if (isNaN(aa)) aa = 0; + bb = parseFloat(b[0].replace(/[^0-9.-]/g,'')); + if (isNaN(bb)) bb = 0; + return aa-bb; + }, + sort_alpha: function(a,b) { + if (a[0]==b[0]) return 0; + if (a[0]<b[0]) return -1; + return 1; + }, + sort_ddmm: function(a,b) { + mtch = a[0].match(sorttable.DATE_RE); + y = mtch[3]; m = mtch[2]; d = mtch[1]; + if (m.length == 1) m = '0'+m; + if (d.length == 1) d = '0'+d; + dt1 = y+m+d; + mtch = b[0].match(sorttable.DATE_RE); + y = mtch[3]; m = mtch[2]; d = mtch[1]; + if (m.length == 1) m = '0'+m; + if (d.length == 1) d = '0'+d; + dt2 = y+m+d; + if (dt1==dt2) return 0; + if (dt1<dt2) return -1; + return 1; + }, + sort_mmdd: function(a,b) { + mtch = a[0].match(sorttable.DATE_RE); + y = mtch[3]; d = mtch[2]; m = mtch[1]; + if (m.length == 1) m = '0'+m; + if (d.length == 1) d = '0'+d; + dt1 = y+m+d; + mtch = b[0].match(sorttable.DATE_RE); + y = mtch[3]; d = mtch[2]; m = mtch[1]; + if (m.length == 1) m = '0'+m; + if (d.length == 1) d = '0'+d; + dt2 = y+m+d; + if (dt1==dt2) return 0; + if (dt1<dt2) return -1; + return 1; + }, + + shaker_sort: function(list, comp_func) { + // A stable sort function to allow multi-level sorting of data + // see: http://en.wikipedia.org/wiki/Cocktail_sort + // thanks to Joseph Nahmias + var b = 0; + var t = list.length - 1; + var swap = true; + + while(swap) { + swap = false; + for(var i = b; i < t; ++i) { + if ( comp_func(list[i], list[i+1]) > 0 ) { + var q = list[i]; list[i] = list[i+1]; list[i+1] = q; + swap = true; + } + } // for + t--; + + if (!swap) break; + + for(var i = t; i > b; --i) { + if ( comp_func(list[i], list[i-1]) < 0 ) { + var q = list[i]; list[i] = list[i-1]; list[i-1] = q; + swap = true; + } + } // for + b++; + + } // while(swap) + } +} + +/* ****************************************************************** + Supporting functions: bundled here to avoid depending on a library + ****************************************************************** */ + +// Dean Edwards/Matthias Miller/John Resig + +/* for Mozilla/Opera9 */ +if 
(document.addEventListener) { + document.addEventListener("DOMContentLoaded", sorttable.init, false); +} + +/* for Internet Explorer */ +/*@cc_on @*/ +/*@if (@_win32) + document.write("<script id=__ie_onload defer src=javascript:void(0)><\/script>"); + var script = document.getElementById("__ie_onload"); + script.onreadystatechange = function() { + if (this.readyState == "complete") { + sorttable.init(); // call the onload handler + } + }; +/*@end @*/ + +/* for Safari */ +if (/WebKit/i.test(navigator.userAgent)) { // sniff + var _timer = setInterval(function() { + if (/loaded|complete/.test(document.readyState)) { + sorttable.init(); // call the onload handler + } + }, 10); +} + +/* for other browsers */ +window.onload = sorttable.init; + +// written by Dean Edwards, 2005 +// with input from Tino Zijdel, Matthias Miller, Diego Perini + +// http://dean.edwards.name/weblog/2005/10/add-event/ + +function dean_addEvent(element, type, handler) { + if (element.addEventListener) { + element.addEventListener(type, handler, false); + } else { + // assign each event handler a unique ID + if (!handler.$$guid) handler.$$guid = dean_addEvent.guid++; + // create a hash table of event types for the element + if (!element.events) element.events = {}; + // create a hash table of event handlers for each element/event pair + var handlers = element.events[type]; + if (!handlers) { + handlers = element.events[type] = {}; + // store the existing event handler (if there is one) + if (element["on" + type]) { + handlers[0] = element["on" + type]; + } + } + // store the event handler in the hash table + handlers[handler.$$guid] = handler; + // assign a global event handler to do all the work + element["on" + type] = handleEvent; + } +}; +// a counter used to create unique IDs +dean_addEvent.guid = 1; + +function removeEvent(element, type, handler) { + if (element.removeEventListener) { + element.removeEventListener(type, handler, false); + } else { + // delete the event handler from the hash table + if (element.events && element.events[type]) { + delete element.events[type][handler.$$guid]; + } + } +}; + +function handleEvent(event) { + var returnValue = true; + // grab the event object (IE uses a global event object) + event = event || fixEvent(((this.ownerDocument || this.document || this).parentWindow || window).event); + // get a reference to the hash table of event handlers + var handlers = this.events[event.type]; + // execute each event handler + for (var i in handlers) { + this.$$handleEvent = handlers[i]; + if (this.$$handleEvent(event) === false) { + returnValue = false; + } + } + return returnValue; +}; + +function fixEvent(event) { + // add W3C standard event methods + event.preventDefault = fixEvent.preventDefault; + event.stopPropagation = fixEvent.stopPropagation; + return event; +}; +fixEvent.preventDefault = function() { + this.returnValue = false; +}; +fixEvent.stopPropagation = function() { + this.cancelBubble = true; +} + +// Dean's forEach: http://dean.edwards.name/base/forEach.js +/* + forEach, version 1.0 + Copyright 2006, Dean Edwards + License: http://www.opensource.org/licenses/mit-license.php +*/ + +// array-like enumeration +if (!Array.forEach) { // mozilla already supports this + Array.forEach = function(array, block, context) { + for (var i = 0; i < array.length; i++) { + block.call(context, array[i], i, array); + } + }; +} + +// generic enumeration +Function.prototype.forEach = function(object, block, context) { + for (var key in object) { + if (typeof this.prototype[key] == 
"undefined") { + block.call(context, object[key], key, object); + } + } +}; + +// character enumeration +String.forEach = function(string, block, context) { + Array.forEach(string.split(""), function(chr, index) { + block.call(context, chr, index, string); + }); +}; + +// globally resolve forEach enumeration +var forEach = function(object, block, context) { + if (object) { + var resolve = Object; // default + if (object instanceof Function) { + // functions have a "length" property + resolve = Function; + } else if (object.forEach instanceof Function) { + // the object implements a custom forEach method so use that + object.forEach(block, context); + return; + } else if (typeof object == "string") { + // the object is a string + resolve = String; + } else if (typeof object.length == "number") { + // the object is array-like + resolve = Array; + } + resolve.forEach(object, block, context); + } +}; + diff --git a/tools/allocation_tracking/track_allocations.py b/tools/allocation_tracking/track_allocations.py new file mode 100644 index 000000000..e9d937817 --- /dev/null +++ b/tools/allocation_tracking/track_allocations.py @@ -0,0 +1,131 @@ +import numpy as np +import inspect +from alloc_hook import NumpyAllocHook + +class AllocationTracker(object): + def __init__(self, threshold=0): + '''track numpy allocations of size threshold bytes or more.''' + + self.threshold = threshold + + # The total number of bytes currently allocated with size above + # threshold + self.total_bytes = 0 + + # We buffer requests line by line and move them into the allocation + # trace when a new line occurs + self.current_line = None + self.pending_allocations = [] + + self.blocksizes = {} + + # list of (lineinfo, bytes allocated, bytes freed, # allocations, # + # frees, maximum memory usage, long-lived bytes allocated) + self.allocation_trace = [] + + self.numpy_hook = NumpyAllocHook(self.hook) + + def __enter__(self): + self.numpy_hook.__enter__() + + def __exit__(self, type, value, traceback): + self.check_line_changed() # forces pending events to be handled + self.numpy_hook.__exit__() + + def hook(self, inptr, outptr, size): + if outptr == 0: # it's a free + self.free_cb(inptr) + elif inptr != 0: # realloc + self.realloc_cb(inptr, outptr, size) + else: # malloc + self.alloc_cb(outptr, size) + + def alloc_cb(self, ptr, size): + if size >= self.threshold: + self.check_line_changed() + self.blocksizes[ptr] = size + self.pending_allocations.append(size) + + def free_cb(self, ptr): + size = self.blocksizes.pop(ptr, 0) + if size: + self.check_line_changed() + self.pending_allocations.append(-size) + + def realloc_cb(self, newptr, oldptr, size): + if (size >= self.threshold) or (oldptr in self.blocksizes): + self.check_line_changed() + oldsize = self.blocksizes.pop(oldptr, 0) + self.pending_allocations.append(size - oldsize) + self.blocksizes[newptr] = size + + def get_code_line(self): + # first frame is this line, then check_line_changed(), then 2 callbacks, + # then actual code. 
+ try: + return inspect.stack()[4][1:] + except: + return inspect.stack()[0][1:] + + def check_line_changed(self): + line = self.get_code_line() + if line != self.current_line and (self.current_line is not None): + # move pending events into the allocation_trace + max_size = self.total_bytes + bytes_allocated = 0 + bytes_freed = 0 + num_allocations = 0 + num_frees = 0 + before_size = self.total_bytes + for allocation in self.pending_allocations: + self.total_bytes += allocation + if allocation > 0: + bytes_allocated += allocation + num_allocations += 1 + else: + bytes_freed += -allocation + num_frees += 1 + max_size = max(max_size, self.total_bytes) + long_lived = max(self.total_bytes - before_size, 0) + self.allocation_trace.append((self.current_line, bytes_allocated, + bytes_freed, num_allocations, + num_frees, max_size, long_lived)) + # clear pending allocations + self.pending_allocations = [] + # move to the new line + self.current_line = line + + def write_html(self, filename): + f = open(filename, "w") + f.write('<HTML><HEAD><script src="sorttable.js"></script></HEAD><BODY>\n') + f.write('<TABLE class="sortable" width=100%>\n') + f.write("<TR>\n") + cols = "event#,lineinfo,bytes allocated,bytes freed,#allocations,#frees,max memory usage,long lived bytes".split(',') + for header in cols: + f.write(" <TH>{0}</TH>".format(header)) + f.write("\n</TR>\n") + for idx, event in enumerate(self.allocation_trace): + f.write("<TR>\n") + event = [idx] + list(event) + for col, val in zip(cols, event): + if col == 'lineinfo': + # special handling + try: + filename, line, module, code, index = val + val = "{0}({1}): {2}".format(filename, line, code[index]) + except: + # sometimes this info is not available (from eval()?) + val = str(val) + f.write(" <TD>{0}</TD>".format(val)) + f.write("\n</TR>\n") + f.write("</TABLE></BODY></HTML>\n") + f.close() + + +if __name__ == '__main__': + tracker = AllocationTracker(1000) + with tracker: + for i in range(100): + np.zeros(i * 100) + np.zeros(i * 200) + tracker.write_html("allocations.html") |
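As a usage sketch for the new allocation-tracking tool above (not part of the change set; the threshold, the workload and the file layout are illustrative, and the Cython extension is assumed to have been built in place with the bundled setup.py), a profiling run might look like:

    # from within tools/allocation_tracking, after:
    #   python setup.py build_ext --inplace
    import numpy as np
    from track_allocations import AllocationTracker

    def workload():
        # hypothetical computation whose temporaries we want to attribute to lines
        a = np.random.rand(1000, 1000)
        return np.dot(a, a.T).sum()

    tracker = AllocationTracker(threshold=100 * 1024)  # ignore blocks under 100 KiB
    with tracker:
        workload()
    # Each row of the report covers one source line: bytes allocated/freed,
    # allocation counts and peak usage; keep sorttable.js next to the HTML
    # file so the column headers stay click-sortable.
    tracker.write_html("allocations.html")
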