diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2019-02-06 19:01:59 -0700 |
---|---|---|
committer | Charles Harris <charlesr.harris@gmail.com> | 2019-02-07 14:54:27 -0700 |
commit | d3eb626ef41e1302631b5154037567b9cc02630d (patch) | |
tree | 4dd09f4f86cf1f422a45b2d7c901516a3b3dcd06 | |
parent | de1ca61e038548c73ede1e8bd11a9c7dfa02794d (diff) | |
download | numpy-d3eb626ef41e1302631b5154037567b9cc02630d.tar.gz |
BUG: Add timsort without breaking the API.
In order to maintain forward compatibility it is necessary to keep the
size of PyArray_ArrFuncs struct fixed. The usual trick of adding new
elements to the end of the structure is not available in this case
because the struct may be instantiated by user types and we have no way
to know whether the new or old struct is in play.
The solution adopted here is to reuse the (a)mergesort slots for stable
sorts of all kinds, with the actual kind set when the struct is
initialized. The '(a)mergesort' option thus becomes an alias for
'stable', but we keep it for backwards compatibility.
-rw-r--r-- | numpy/core/fromnumeric.py | 37 | ||||
-rw-r--r-- | numpy/core/include/numpy/ndarraytypes.h | 11 | ||||
-rw-r--r-- | numpy/core/src/multiarray/arraytypes.c.src | 4 | ||||
-rw-r--r-- | numpy/core/src/multiarray/conversion_utils.c | 21 | ||||
-rw-r--r-- | numpy/core/src/multiarray/item_selection.c | 20 | ||||
-rw-r--r-- | numpy/core/tests/test_multiarray.py | 6 |
6 files changed, 63 insertions, 36 deletions
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index f336ae248..acfe92a63 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -828,8 +828,16 @@ def sort(a, axis=-1, kind='quicksort', order=None): axis : int or None, optional Axis along which to sort. If None, the array is flattened before sorting. The default is -1, which sorts along the last axis. - kind : {'quicksort', 'mergesort', 'heapsort', 'timsort', 'stable'}, optional - Sorting algorithm. Default is 'quicksort'. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. The default is 'quicksort'. Note that both 'stable' + and 'mergesort' use timsort under the covers and, in general, the + actual implementation will vary with datatype. The 'mergesort' option + is retained for backwards compatibility. + + .. versionchanged:: 1.17.0. + The 'stable' option was added together with stable sorting + algorithms other than 'mergesort'. + order : str or list of str, optional When `a` is an array with fields defined, this argument specifies which fields to compare first, second, etc. A single field can @@ -855,18 +863,22 @@ def sort(a, axis=-1, kind='quicksort', order=None): The various sorting algorithms are characterized by their average speed, worst case performance, work space size, and whether they are stable. A stable sort keeps items with the same key in the same relative - order. The four available algorithms have the following + order. The four algorithms implemented in NumPy have the following properties: =========== ======= ============= ============ ======== kind speed worst case work space stable =========== ======= ============= ============ ======== 'quicksort' 1 O(n^2) 0 no - 'mergesort' 2 O(n*log(n)) ~n/2 yes 'heapsort' 3 O(n*log(n)) 0 no + 'mergesort' 2 O(n*log(n)) ~n/2 yes 'timsort' 2 O(n*log(n)) ~n/2 yes =========== ======= ============= ============ ======== + .. 
note:: The datatype determines which of 'mergesort' or 'timsort' + is actually used, even if 'mergesort' is specified. User selection + at a finer scale is not currently available. + All the sort algorithms make temporary copies of the data when sorting along any but the last axis. Consequently, sorting along the last axis is faster and uses less space than sorting along @@ -895,7 +907,10 @@ def sort(a, axis=-1, kind='quicksort', order=None): worst case O(n*log(n)). 'stable' automatically choses the best stable sorting algorithm - for the data type being sorted. It is currently mapped to timsort. + for the data type being sorted. It, along with 'mergesort' is + currently mapped to timsort. API forward compatibility currently limits the + ability to select the implementation and it is hardwired for the different + data types. .. versionadded:: 1.17.0 Timsort is added for better performance on already or nearly @@ -967,8 +982,16 @@ def argsort(a, axis=-1, kind='quicksort', order=None): axis : int or None, optional Axis along which to sort. The default is -1 (the last axis). If None, the flattened array is used. - kind : {'quicksort', 'mergesort', 'heapsort', 'timsort', 'stable'}, optional - Sorting algorithm. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. The default is 'quicksort'. Note that both 'stable' + and 'mergesort' use timsort under the covers and, in general, the + actual implementation will vary with datatype. The 'mergesort' option + is retained for backwards compatibility. + + .. versionchanged:: 1.17.0. + The 'stable' option was added together with stable sorting + algorithms other than 'mergesort'. + order : str or list of str, optional When `a` is an array with fields defined, this argument specifies which fields to compare first, second, etc. 
A single field can diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 62895049c..1221aeece 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -156,13 +156,20 @@ enum NPY_TYPECHAR { NPY_COMPLEXLTR = 'c' }; +/* + * Changing this may break Numpy API compatibility + * due to changing offsets in PyArray_ArrFuncs, so be + * careful. Here we have reused the mergesort slot for + * any kind of stable sort, the actual implementation will + * depend on the data type. + */ typedef enum { NPY_QUICKSORT=0, NPY_HEAPSORT=1, NPY_MERGESORT=2, - NPY_TIMSORT=3, + NPY_STABLESORT=2, } NPY_SORTKIND; -#define NPY_NSORTS (NPY_TIMSORT + 1) +#define NPY_NSORTS (NPY_STABLESORT + 1) typedef enum { diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index ddef0de52..49819ca4a 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -4320,13 +4320,11 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = { { quicksort_@suff@, heapsort_@suff@, - mergesort_@suff@, timsort_@suff@ }, { aquicksort_@suff@, aheapsort_@suff@, - amergesort_@suff@, atimsort_@suff@ }, #else @@ -4463,13 +4461,11 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = { { quicksort_@suff@, heapsort_@suff@, - mergesort_@suff@, timsort_@suff@ }, { aquicksort_@suff@, aheapsort_@suff@, - amergesort_@suff@, atimsort_@suff@ }, #else diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index 437ea78d8..fa8de8b37 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -419,16 +419,23 @@ PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind) *sortkind = NPY_HEAPSORT; } else if (str[0] == 'm' || str[0] == 'M') { - *sortkind = NPY_MERGESORT; - } - else if (str[0] == 't' || str[0] == 'T'){ - *sortkind = NPY_TIMSORT; + /* + * Mergesort is 
an alias for NPY_STABLESORT. + * That maintains backwards compatibility while + * allowing other types of stable sorts to be used. + */ + *sortkind = NPY_STABLESORT; } else if (str[0] == 's' || str[0] == 'S') { - /* available options: mergesort and timsort - * among which timsort is assumed to be better + /* + * NPY_STABLESORT is one of + * + * - mergesort + * - timsort + * + * Which one is used depends on the data type. */ - *sortkind = NPY_TIMSORT; + *sortkind = NPY_STABLESORT; } else { PyErr_Format(PyExc_ValueError, diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 7f560363c..4888224f3 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -1132,10 +1132,7 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which) case NPY_HEAPSORT: sort = npy_heapsort; break; - case NPY_MERGESORT: - sort = npy_mergesort; - break; - case NPY_TIMSORT: + case NPY_STABLESORT: sort = npy_timsort; break; } @@ -1286,10 +1283,7 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) case NPY_HEAPSORT: argsort = npy_aheapsort; break; - case NPY_MERGESORT: - argsort = npy_amergesort; - break; - case NPY_TIMSORT: + case NPY_STABLESORT: argsort = npy_atimsort; break; } @@ -1431,7 +1425,7 @@ PyArray_LexSort(PyObject *sort_keys, int axis) goto fail; } } - if (!PyArray_DESCR(mps[i])->f->argsort[NPY_MERGESORT] + if (!PyArray_DESCR(mps[i])->f->argsort[NPY_STABLESORT] && !PyArray_DESCR(mps[i])->f->compare) { PyErr_Format(PyExc_TypeError, "item %zd type does not have compare function", i); @@ -1527,9 +1521,9 @@ PyArray_LexSort(PyObject *sort_keys, int axis) int rcode; elsize = PyArray_DESCR(mps[j])->elsize; astride = PyArray_STRIDES(mps[j])[axis]; - argsort = PyArray_DESCR(mps[j])->f->argsort[NPY_MERGESORT]; + argsort = PyArray_DESCR(mps[j])->f->argsort[NPY_STABLESORT]; if(argsort == NULL) { - argsort = npy_amergesort; + argsort = npy_atimsort; } 
_unaligned_strided_byte_copy(valbuffer, (npy_intp) elsize, its[j]->dataptr, astride, N, elsize); @@ -1566,9 +1560,9 @@ PyArray_LexSort(PyObject *sort_keys, int axis) } for (j = 0; j < n; j++) { int rcode; - argsort = PyArray_DESCR(mps[j])->f->argsort[NPY_MERGESORT]; + argsort = PyArray_DESCR(mps[j])->f->argsort[NPY_STABLESORT]; if(argsort == NULL) { - argsort = npy_amergesort; + argsort = npy_atimsort; } rcode = argsort(its[j]->dataptr, (npy_intp *)rit->dataptr, N, mps[j]); diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 62184de4e..cf197df38 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -1395,10 +1395,10 @@ class TestZeroSizeFlexible(object): sort_func(zs, kind=kind, **kwargs) def test_sort(self): - self._test_sort_partition('sort', kinds='qhmt') + self._test_sort_partition('sort', kinds='qhs') def test_argsort(self): - self._test_sort_partition('argsort', kinds='qhmt') + self._test_sort_partition('argsort', kinds='qhs') def test_partition(self): self._test_sort_partition('partition', kinds=['introselect'], kth=2) @@ -1450,7 +1450,7 @@ class TestZeroSizeFlexible(object): class TestMethods(object): - sort_kinds = [r'm', 'q', 'h', 't'] + sort_kinds = ['quicksort', 'heapsort', 'stable'] def test_compress(self): tgt = [[5, 6, 7, 8, 9]] |