13 files changed, 1153 insertions, 0 deletions
diff --git a/benchmarks/README.rst b/benchmarks/README.rst
new file mode 100644
index 000000000..2ed5d150f
--- /dev/null
+++ b/benchmarks/README.rst
@@ -0,0 +1,62 @@
+..  -*- rst -*-
+
+================
+NumPy benchmarks
+================
+
+Benchmarking NumPy with Airspeed Velocity.
+
+
+Usage
+-----
+
+Airspeed Velocity manages building and Python virtualenvs by itself,
+unless told otherwise. Some of the benchmarking features in
+``runtests.py`` also tell ASV to use the NumPy compiled by
+``runtests.py``. To run the benchmarks, you do not need to install a
+development version of NumPy to your current Python environment.
+
+Run a benchmark against currently checked out NumPy version (don't
+record the result)::
+
+    python runtests.py --bench bench_core
+
+Compare change in benchmark results to another version::
+
+    python runtests.py --bench-compare v1.6.2 bench_core
+
+Run ASV commands (record results and generate HTML)::
+
+    cd benchmarks
+    asv run --skip-existing-commits --steps 10 ALL
+    asv publish
+    asv preview
+
+More on how to use ``asv`` can be found in `ASV documentation`_.
+Command-line help is available as usual via ``asv --help`` and
+``asv run --help``.
+
+.. _ASV documentation: https://spacetelescope.github.io/asv/
+
+
+Writing benchmarks
+------------------
+
+See `ASV documentation`_ for basics on how to write benchmarks.
+
+Some things to consider:
+
+- The benchmark suite should be importable with any NumPy version.
+
+- The benchmark parameters etc. should not depend on which NumPy version
+  is installed.
+
+- Try to keep the runtime of the benchmark reasonable.
+
+- Prefer ASV's ``time_`` methods for benchmarking times rather than cooking up
+  time measurements via ``time.clock``, even if it requires some juggling when
+  writing the benchmark.
+
+- Preparing arrays etc. should generally be put in the ``setup`` method rather
+  than the ``time_`` methods, to avoid counting preparation time together with
+  the time of the benchmarked operation.
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 000000000..d837b0d67
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,85 @@
+{
+    // The version of the config file format.  Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "numpy",
+
+    // The project's homepage
+    "project_url": "http://numpy.org/",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": "..",
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "tip" (for mercurial).
+    "branches": ["master"],
+
+    // The DVCS being used.  If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    "dvcs": "git",
+
+    // The tool to use to create environments.  May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "virtualenv",
+
+    // the base URL to show a commit for the project.
+    "show_commit_url": "https://github.com/numpy/numpy/commit/",
+
+    // The Pythons you'd like to test against.  If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    "pythons": ["2.7"],
+
+    // The matrix of dependencies to test.  Each key is the name of a
+    // package (in PyPI) and the values are version numbers.  An empty
+    // list indicates to just test against the default (latest)
+    // version.
+    "matrix": {
+        "six": []
+    },
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in.  If not provided, defaults to "benchmarks"
+    "benchmark_dir": "benchmarks",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in.  If not provided, defaults to "env"
+    "env_dir": "env",
+
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in.  If not provided, defaults to "results".
+    "results_dir": "results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to.  If not provided, defaults to "html".
+    "html_dir": "html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache wheels of the recent builds in each
+    // environment, making them faster to install next time.  This is
+    // the number of builds to keep, per environment.
+    "wheel_cache_size": 2,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // }
+}
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
new file mode 100644
index 000000000..e8a859ff4
--- /dev/null
+++ b/benchmarks/benchmarks/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import absolute_import, division, print_function
+
+from . import common
diff --git a/benchmarks/benchmarks/bench_app.py b/benchmarks/benchmarks/bench_app.py
new file mode 100644
index 000000000..ccf6e4c4a
--- /dev/null
+++ b/benchmarks/benchmarks/bench_app.py
@@ -0,0 +1,89 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark
+
+import numpy as np
+
+from six.moves import xrange
+
+
+class LaplaceInplace(Benchmark):
+    params = ['inplace', 'normal']
+    param_names = ['update']
+
+    def setup(self, update):
+        N = 150
+        Niter = 1000
+        dx = 0.1
+        dy = 0.1
+        dx2 = (dx * dx)
+        dy2 = (dy * dy)
+
+        def num_update(u, dx2, dy2):
+            u[1:(-1), 1:(-1)] = ((((u[2:, 1:(-1)] + u[:(-2), 1:(-1)]) * dy2) +
+                                  ((u[1:(-1), 2:] + u[1:(-1), :(-2)]) * dx2))
+                                 / (2 * (dx2 + dy2)))
+
+        def num_inplace(u, dx2, dy2):
+            tmp = u[:(-2), 1:(-1)].copy()
+            np.add(tmp, u[2:, 1:(-1)], out=tmp)
+            np.multiply(tmp, dy2, out=tmp)
+            tmp2 = u[1:(-1), 2:].copy()
+            np.add(tmp2, u[1:(-1), :(-2)], out=tmp2)
+            np.multiply(tmp2, dx2, out=tmp2)
+            np.add(tmp, tmp2, out=tmp)
+            np.multiply(tmp, (1.0 / (2.0 * (dx2 + dy2))),
+                        out=u[1:(-1), 1:(-1)])
+
+        def laplace(N, Niter=100, func=num_update, args=()):
+            u = np.zeros([N, N], order='C')
+            u[0] = 1
+            for i in range(Niter):
+                func(u, *args)
+            return u
+
+        func = {'inplace': num_inplace, 'normal': num_update}[update]
+
+        def run():
+            laplace(N, Niter, func, args=(dx2, dy2))
+
+        self.run = run
+
+    def time_it(self, update):
+        self.run()
+
+
+class MaxesOfDots(Benchmark):
+    def setup(self):
+        np.random.seed(1)  # fixed seed so every run times the same data
+        nsubj = 5
+        nfeat = 100
+        ntime = 200
+
+        self.arrays = [np.random.normal(size=(ntime, nfeat))
+                       for i in xrange(nsubj)]
+
+    def maxes_of_dots(self, arrays):
+        """
+        A magical feature score for each feature in each dataset
+        :ref:`Haxby et al., Neuron (2011) <HGC+11>`.
+        If arrays are column-wise zscore-d before computation it
+        results in characterizing each column in each array with
+        sum of maximal correlations of that column with columns
+        in other arrays.
+
+        Arrays must agree only on the first dimension.
+
+        For numpy this is a joint benchmark of dot products and max()
+        on a set of arrays.
+        """
+        feature_scores = ([0] * len(arrays))
+        for (i, sd) in enumerate(arrays):
+            for (j, sd2) in enumerate(arrays[(i + 1):]):  # sd2 is arrays[i + j + 1]
+                corr_temp = np.dot(sd.T, sd2)
+                feature_scores[i] += np.max(corr_temp, axis=1)
+                feature_scores[((j + i) + 1)] += np.max(corr_temp, axis=0)
+        return feature_scores
+
+    def time_it(self):
+        self.maxes_of_dots(self.arrays)
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
new file mode 100644
index 000000000..6701917cc
--- /dev/null
+++ b/benchmarks/benchmarks/bench_core.py
@@ -0,0 +1,132 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark
+
+import numpy as np
+
+
+class Core(Benchmark):
+    def setup(self):
+        self.l100 = range(100)
+        self.l50 = range(50)
+        self.l = [np.arange(1000), np.arange(1000)]
+        self.l10x10 = np.ones((10, 10))
+
+    def time_array_1(self):
+        np.array(1)
+
+    def time_array_empty(self):
+        np.array([])
+
+    def time_array_l1(self):
+        np.array([1])
+
+    def time_array_l100(self):
+        np.array(self.l100)
+
+    def time_array_l(self):
+        np.array(self.l)
+
+    def time_vstack_l(self):
+        np.vstack(self.l)
+
+    def time_hstack_l(self):
+        np.hstack(self.l)
+
+    def time_dstack_l(self):
+        np.dstack(self.l)
+
+    def time_arange_100(self):
+        np.arange(100)
+
+    def time_zeros_100(self):
+        np.zeros(100)
+
+    def time_ones_100(self):
+        np.ones(100)
+
+    def time_empty_100(self):
+        np.empty(100)
+
+    def time_eye_100(self):
+        np.eye(100)
+
+    def time_identity_100(self):
+        np.identity(100)
+
+    def time_eye_3000(self):
+        np.eye(3000)
+
+    def time_identity_3000(self):
+        np.identity(3000)
+
+    def time_diag_l100(self):
+        np.diag(self.l100)
+
+    def time_diagflat_l100(self):
+        np.diagflat(self.l100)
+
+    def time_diagflat_l50_l50(self):
+        np.diagflat([self.l50, self.l50])
+
+    def time_triu_l10x10(self):
+        np.triu(self.l10x10)
+
+    def time_tril_l10x10(self):
+        np.tril(self.l10x10)
+
+
+class MA(Benchmark):
+    def setup(self):
+        self.l100 = range(100)
+        self.t100 = ([True] * 100)
+
+    def time_masked_array(self):
+        np.ma.masked_array()
+
+    def time_masked_array_l100(self):
+        np.ma.masked_array(self.l100)
+
+    def time_masked_array_l100_t100(self):
+        np.ma.masked_array(self.l100, self.t100)
+
+
+class CorrConv(Benchmark):
+    params = [[50, 1000, 1e5],
+              [10, 100, 1000, 1e4],
+              ['valid', 'same', 'full']]
+    param_names = ['size1', 'size2', 'mode']
+
+    def setup(self, size1, size2, mode):
+        self.x1 = np.linspace(0, 1, num=size1)
+        self.x2 = np.cos(np.linspace(0, 2*np.pi, num=size2))
+
+    def time_correlate(self, size1, size2, mode):
+        np.correlate(self.x1, self.x2, mode=mode)
+
+    def time_convolve(self, size1, size2, mode):
+        np.convolve(self.x1, self.x2, mode=mode)
+
+
+class CountNonzero(Benchmark):  # times np.count_nonzero with and without axis=
+    param_names = ['numaxes', 'size', 'dtype']
+    params = [
+        [1, 2, 3],
+        [100, 10000, 1000000],
+        [bool, int, str, object]
+    ]
+
+    def setup(self, numaxes, size, dtype):  # numaxes is the length of axis 0 here
+        self.x = np.empty(shape=(
+            numaxes, size), dtype=dtype)  # always 2-D; contents are uninitialized
+
+    def time_count_nonzero(self, numaxes, size, dtype):
+        np.count_nonzero(self.x)
+
+    def time_count_nonzero_axis(self, numaxes, size, dtype):
+        np.count_nonzero(self.x, axis=self.x.ndim - 1)  # reduce over the last axis
+
+    def time_count_nonzero_multi_axis(self, numaxes, size, dtype):
+        if self.x.ndim >= 2:  # NOTE(review): always true, setup builds a 2-D array
+            np.count_nonzero(self.x, axis=(
+                self.x.ndim - 1, self.x.ndim - 2))
diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py
new file mode 100644
index 000000000..23103ba66
--- /dev/null
+++ b/benchmarks/benchmarks/bench_function_base.py
@@ -0,0 +1,126 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark
+
+import numpy as np
+
+
+class Bincount(Benchmark):
+    def setup(self):
+        self.d = np.arange(80000, dtype=np.intp)
+        self.e = self.d.astype(np.float64)
+
+    def time_bincount(self):
+        np.bincount(self.d)
+
+    def time_weights(self):
+        np.bincount(self.d, weights=self.e)
+
+
+class Median(Benchmark):
+    def setup(self):
+        self.e = np.arange(10000, dtype=np.float32)
+        self.o = np.arange(10001, dtype=np.float32)
+
+    def time_even(self):
+        np.median(self.e)
+
+    def time_odd(self):
+        np.median(self.o)
+
+    def time_even_inplace(self):
+        np.median(self.e, overwrite_input=True)
+
+    def time_odd_inplace(self):
+        np.median(self.o, overwrite_input=True)
+
+    def time_even_small(self):
+        np.median(self.e[:500], overwrite_input=True)
+
+    def time_odd_small(self):
+        np.median(self.o[:500], overwrite_input=True)
+
+
+class Percentile(Benchmark):
+    def setup(self):
+        self.e = np.arange(10000, dtype=np.float32)
+        self.o = np.arange(10001, dtype=np.float32)
+
+    def time_quartile(self):
+        np.percentile(self.e, [25, 75])
+
+    def time_percentile(self):
+        np.percentile(self.e, [25, 35, 55, 65, 75])
+
+
+class Select(Benchmark):
+    def setup(self):
+        self.d = np.arange(20000)
+        self.e = self.d.copy()
+        self.cond = [(self.d > 4), (self.d < 2)]
+        self.cond_large = [(self.d > 4), (self.d < 2)] * 10
+
+    def time_select(self):
+        np.select(self.cond, [self.d, self.e])
+
+    def time_select_larger(self):
+        np.select(self.cond_large, ([self.d, self.e] * 10))
+
+
+class Sort(Benchmark):
+    def setup(self):
+        self.e = np.arange(10000, dtype=np.float32)
+        self.o = np.arange(10001, dtype=np.float32)
+        np.random.seed(25)
+        np.random.shuffle(self.o)
+        # quicksort implementations can have issues with equal elements
+        self.equal = np.ones(10000)
+        self.many_equal = np.sort(np.arange(10000) % 10)
+
+        # quicksort median of 3 worst case
+        self.worst = np.arange(1000000)
+        x = self.worst
+        while x.size > 3:
+            mid = x.size // 2
+            x[mid], x[-2] = x[-2], x[mid]
+            x = x[:-2]
+
+    def time_sort(self):
+        np.sort(self.e)
+
+    def time_sort_random(self):
+        np.sort(self.o)
+
+    def time_sort_inplace(self):
+        self.e.sort()
+
+    def time_sort_equal(self):
+        self.equal.sort()
+
+    def time_sort_many_equal(self):
+        self.many_equal.sort()
+
+    def time_sort_worst(self):
+        np.sort(self.worst)
+
+    def time_argsort(self):
+        self.e.argsort()
+
+    def time_argsort_random(self):
+        self.o.argsort()
+
+
+class Where(Benchmark):
+    def setup(self):
+        self.d = np.arange(20000)
+        self.e = self.d.copy()
+        self.cond = (self.d > 5000)
+
+    def time_1(self):
+        np.where(self.cond)
+
+    def time_2(self):
+        np.where(self.cond, self.d, self.e)
+
+    def time_2_broadcast(self):
+        np.where(self.cond, self.d, 0)
diff --git a/benchmarks/benchmarks/bench_indexing.py b/benchmarks/benchmarks/bench_indexing.py
new file mode 100644
index 000000000..a62a2050e
--- /dev/null
+++ b/benchmarks/benchmarks/bench_indexing.py
@@ -0,0 +1,81 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark, get_squares_, get_indexes_, get_indexes_rand_
+
+from os.path import join as pjoin
+import shutil
+import sys
+import six
+from numpy import memmap, float32, array
+import numpy as np
+from tempfile import mkdtemp
+
+
+class Indexing(Benchmark):
+    params = [["indexes_", "indexes_rand_"],
+              ['I', ':,I', 'np.ix_(I, I)'],
+              ['', '=1']]
+    param_names = ['indexes', 'sel', 'op']
+
+    def setup(self, indexes, sel, op):
+        sel = sel.replace('I', indexes)
+
+        ns = {'squares_': get_squares_(),
+              'np': np,
+              'indexes_': get_indexes_(),
+              'indexes_rand_': get_indexes_rand_()}
+
+        if sys.version_info[0] >= 3:
+            code = "def run():\n    for a in squares_.values(): a[%s]%s"
+        else:
+            code = "def run():\n    for a in squares_.itervalues(): a[%s]%s"
+        code = code % (sel, op)
+
+        six.exec_(code, ns)
+        self.func = ns['run']
+
+    def time_op(self, indexes, sel, op):
+        self.func()
+
+
+class IndexingSeparate(Benchmark):
+    def setup(self):
+        self.tmp_dir = mkdtemp()
+        self.fp = memmap(pjoin(self.tmp_dir, 'tmp.dat'),
+                         dtype=float32, mode='w+', shape=(50, 60))
+        self.indexes = array([3, 4, 6, 10, 20])
+
+    def teardown(self):
+        del self.fp
+        shutil.rmtree(self.tmp_dir)
+
+    def time_mmap_slicing(self):
+        for i in range(1000):
+            self.fp[5:10]
+
+    def time_mmap_fancy_indexing(self):
+        for i in range(1000):
+            self.fp[self.indexes]
+
+
+class IndexingStructured0D(Benchmark):
+    def setup(self):
+        self.dt = np.dtype([('a', 'f4', 256)])
+
+        self.A = np.zeros((), self.dt)
+        self.B = self.A.copy()
+
+        self.a = np.zeros(1, self.dt)[0]
+        self.b = self.a.copy()
+
+    def time_array_slice(self):
+        self.B['a'][:] = self.A['a']
+
+    def time_array_all(self):
+        self.B['a'] = self.A['a']
+
+    def time_scalar_slice(self):
+        self.b['a'][:] = self.a['a']
+
+    def time_scalar_all(self):
+        self.b['a'] = self.a['a']
diff --git a/benchmarks/benchmarks/bench_io.py b/benchmarks/benchmarks/bench_io.py
new file mode 100644
index 000000000..782d4ab30
--- /dev/null
+++ b/benchmarks/benchmarks/bench_io.py
@@ -0,0 +1,64 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark, get_squares
+
+import numpy as np
+
+
+class Copy(Benchmark):
+    params = ["int8", "int16", "float32", "float64",
+              "complex64", "complex128"]
+    param_names = ['type']
+
+    def setup(self, typename):
+        dtype = np.dtype(typename)
+        self.d = np.arange((50 * 500), dtype=dtype).reshape((500, 50))
+        self.e = np.arange((50 * 500), dtype=dtype).reshape((50, 500))
+        self.e_d = self.e.reshape(self.d.shape)
+        self.dflat = np.arange((50 * 500), dtype=dtype)
+
+    def time_memcpy(self, typename):
+        self.d[...] = self.e_d
+
+    def time_cont_assign(self, typename):
+        self.d[...] = 1
+
+    def time_strided_copy(self, typename):
+        self.d[...] = self.e.T
+
+    def time_strided_assign(self, typename):
+        self.dflat[::2] = 2
+
+
+class CopyTo(Benchmark):  # times np.copyto with and without a where= mask
+    def setup(self):
+        self.d = np.ones(50000)
+        self.e = self.d.copy()
+        self.m = (self.d == 1)  # all True, since d is all ones
+        self.im = (~ self.m)  # all False
+        self.m8 = self.m.copy()
+        self.m8[::8] = (~ self.m[::8])  # False at every 8th position, True elsewhere
+        self.im8 = (~ self.m8)  # True only at every 8th position
+
+    def time_copyto(self):
+        np.copyto(self.d, self.e)
+
+    def time_copyto_sparse(self):  # NOTE(review): mask is all True -- confirm naming
+        np.copyto(self.d, self.e, where=self.m)
+
+    def time_copyto_dense(self):  # NOTE(review): mask is all False -- confirm naming
+        np.copyto(self.d, self.e, where=self.im)
+
+    def time_copyto_8_sparse(self):
+        np.copyto(self.d, self.e, where=self.m8)
+
+    def time_copyto_8_dense(self):
+        np.copyto(self.d, self.e, where=self.im8)
+
+
+class Savez(Benchmark):
+    def setup(self):
+        self.squares = get_squares()
+
+    def time_vb_savez_squares(self):
+        np.savez('tmp.npz', self.squares)
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
new file mode 100644
index 000000000..a65d510be
--- /dev/null
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -0,0 +1,109 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark, get_squares_, get_indexes_rand, TYPES1
+
+import numpy as np
+
+
+class Eindot(Benchmark):
+    def setup(self):
+        self.a = np.arange(60000.0).reshape(150, 400)
+        self.ac = self.a.copy()
+        self.at = self.a.T
+        self.atc = self.a.T.copy()
+        self.b = np.arange(240000.0).reshape(400, 600)
+        self.c = np.arange(600)
+        self.d = np.arange(400)
+
+        self.a3 = np.arange(480000.).reshape(60, 80, 100)
+        self.b3 = np.arange(192000.).reshape(80, 60, 40)
+
+    def time_dot_a_b(self):
+        np.dot(self.a, self.b)
+
+    def time_dot_d_dot_b_c(self):
+        np.dot(self.d, np.dot(self.b, self.c))
+
+    def time_dot_trans_a_at(self):
+        np.dot(self.a, self.at)
+
+    def time_dot_trans_a_atc(self):
+        np.dot(self.a, self.atc)
+
+    def time_dot_trans_at_a(self):
+        np.dot(self.at, self.a)
+
+    def time_dot_trans_atc_a(self):
+        np.dot(self.atc, self.a)
+
+    def time_einsum_i_ij_j(self):
+        np.einsum('i,ij,j', self.d, self.b, self.c)
+
+    def time_einsum_ij_jk_a_b(self):
+        np.einsum('ij,jk', self.a, self.b)
+
+    def time_einsum_ijk_jil_kl(self):
+        np.einsum('ijk,jil->kl', self.a3, self.b3)
+
+    def time_inner_trans_a_a(self):
+        np.inner(self.a, self.a)
+
+    def time_inner_trans_a_ac(self):
+        np.inner(self.a, self.ac)
+
+    def time_matmul_a_b(self):
+        np.matmul(self.a, self.b)
+
+    def time_matmul_d_matmul_b_c(self):
+        np.matmul(self.d, np.matmul(self.b, self.c))
+
+    def time_matmul_trans_a_at(self):
+        np.matmul(self.a, self.at)
+
+    def time_matmul_trans_a_atc(self):
+        np.matmul(self.a, self.atc)
+
+    def time_matmul_trans_at_a(self):
+        np.matmul(self.at, self.a)
+
+    def time_matmul_trans_atc_a(self):
+        np.matmul(self.atc, self.a)
+
+    def time_tensordot_a_b_axes_1_0_0_1(self):
+        np.tensordot(self.a3, self.b3, axes=([1, 0], [0, 1]))
+
+
+class Linalg(Benchmark):  # times one np.linalg function per (op, dtype) pair
+    params = [['svd', 'pinv', 'det', 'norm'],
+              TYPES1]
+    param_names = ['op', 'type']
+
+    def setup(self, op, typename):
+        np.seterr(all='ignore')  # NOTE(review): global setting, never restored here
+
+        self.func = getattr(np.linalg, op)
+
+        if op == 'cholesky':  # NOTE(review): 'cholesky' is not in params above
+            # we need a positive definite
+            self.a = np.dot(get_squares_()[typename],
+                            get_squares_()[typename].T)
+        else:
+            self.a = get_squares_()[typename]
+
+        # check that dtype is supported at all
+        try:
+            self.func(self.a[:2, :2])  # cheap probe on a 2x2 corner
+        except TypeError:
+            raise NotImplementedError()  # unsupported dtype for this op
+
+    def time_op(self, op, typename):
+        self.func(self.a)
+
+
+class Lstsq(Benchmark):
+    def setup(self):
+        self.a = get_squares_()['float64']
+        self.b = get_indexes_rand()[:100].astype(np.float64)
+
+    def time_numpy_linalg_lstsq_a__b_float64(self):
+        np.linalg.lstsq(self.a, self.b)
diff --git a/benchmarks/benchmarks/bench_random.py b/benchmarks/benchmarks/bench_random.py
new file mode 100644
index 000000000..18444b9a1
--- /dev/null
+++ b/benchmarks/benchmarks/bench_random.py
@@ -0,0 +1,67 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark
+
+import numpy as np
+from numpy.lib import NumpyVersion
+
+
+class Random(Benchmark):
+    params = ['normal', 'uniform', 'weibull 1', 'binomial 10 0.5',
+              'poisson 10']
+
+    def setup(self, name):
+        items = name.split()
+        name = items.pop(0)
+        params = [float(x) for x in items]
+
+        self.func = getattr(np.random, name)
+        self.params = tuple(params) + ((100, 100),)
+
+    def time_rng(self, name):
+        self.func(*self.params)
+
+
+class Shuffle(Benchmark):
+    def setup(self):
+        self.a = np.arange(100000)
+
+    def time_100000(self):
+        np.random.shuffle(self.a)
+
+
+class Randint(Benchmark):
+
+    def time_randint_fast(self):
+        """Compare to uint32 below"""
+        np.random.randint(0, 2**30, size=10**5)
+
+    def time_randint_slow(self):
+        """Compare to uint32 below"""
+        np.random.randint(0, 2**30 + 1, size=10**5)
+
+
+class Randint_dtype(Benchmark):
+    high = {
+        'bool': 1,
+        'uint8': 2**7,
+        'uint16': 2**15,
+        'uint32': 2**31,
+        'uint64': 2**63
+        }
+
+    param_names = ['dtype']
+    params = ['bool', 'uint8', 'uint16', 'uint32', 'uint64']
+
+    def setup(self, name):
+        if NumpyVersion(np.__version__) < '1.11.0.dev0':
+            raise NotImplementedError
+
+    def time_randint_fast(self, name):
+        high = self.high[name]
+        np.random.randint(0, high, size=10**5, dtype=name)
+
+    def time_randint_slow(self, name):
+        high = self.high[name]
+        np.random.randint(0, high + 1, size=10**5, dtype=name)
+
diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py
new file mode 100644
index 000000000..704023528
--- /dev/null
+++ b/benchmarks/benchmarks/bench_reduce.py
@@ -0,0 +1,67 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark, TYPES1, get_squares
+
+import numpy as np
+
+
+class AddReduce(Benchmark):
+    def setup(self):
+        self.squares = get_squares().values()
+
+    def time_axis_0(self):
+        [np.add.reduce(a, axis=0) for a in self.squares]
+
+    def time_axis_1(self):
+        [np.add.reduce(a, axis=1) for a in self.squares]
+
+
+class AddReduceSeparate(Benchmark):
+    params = [[0, 1], TYPES1]
+    param_names = ['axis', 'type']
+
+    def setup(self, axis, typename):
+        self.a = get_squares()[typename]
+
+    def time_reduce(self, axis, typename):
+        np.add.reduce(self.a, axis=axis)
+
+
+class AnyAll(Benchmark):
+    def setup(self):
+        self.zeros = np.zeros(100000, np.bool)
+        self.ones = np.ones(100000, np.bool)
+
+    def time_all_fast(self):
+        self.zeros.all()
+
+    def time_all_slow(self):
+        self.ones.all()
+
+    def time_any_fast(self):
+        self.ones.any()
+
+    def time_any_slow(self):
+        self.zeros.any()
+
+
+class MinMax(Benchmark):
+    params = [np.float32, np.float64, np.intp]
+    param_names = ['dtype']
+
+    def setup(self, dtype):
+        self.d = np.ones(20000, dtype=dtype)
+
+    def time_min(self, dtype):
+        np.min(self.d)
+
+    def time_max(self, dtype):
+        np.max(self.d)
+
+
+class SmallReduction(Benchmark):
+    def setup(self):
+        self.d = np.ones(100, dtype=np.float32)
+
+    def time_small(self):
+        np.sum(self.d)
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
new file mode 100644
index 000000000..1baee1340
--- /dev/null
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -0,0 +1,152 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark, get_squares_
+
+import numpy as np
+
+
+ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin',
+          'arcsinh', 'arctan', 'arctan2', 'arctanh', 'bitwise_and',
+          'bitwise_not', 'bitwise_or', 'bitwise_xor', 'cbrt', 'ceil',
+          'conj', 'conjugate', 'copysign', 'cos', 'cosh', 'deg2rad',
+          'degrees', 'divide', 'equal', 'exp', 'exp2', 'expm1',
+          'fabs', 'floor', 'floor_divide', 'fmax', 'fmin', 'fmod',
+          'frexp', 'greater', 'greater_equal', 'hypot', 'invert',
+          'isfinite', 'isinf', 'isnan', 'ldexp', 'left_shift', 'less',
+          'less_equal', 'log', 'log10', 'log1p', 'log2', 'logaddexp',
+          'logaddexp2', 'logical_and', 'logical_not', 'logical_or',
+          'logical_xor', 'maximum', 'minimum', 'mod', 'modf',
+          'multiply', 'negative', 'nextafter', 'not_equal', 'power',
+          'rad2deg', 'radians', 'reciprocal', 'remainder',
+          'right_shift', 'rint', 'sign', 'signbit', 'sin', 'sinh',
+          'spacing', 'sqrt', 'square', 'subtract', 'tan', 'tanh',
+          'true_divide', 'trunc']
+
+for name in dir(np):
+    if isinstance(getattr(np, name, None), np.ufunc) and name not in ufuncs:
+        print("Missing ufunc %r" % (name,))
+
+
+class Broadcast(Benchmark):
+    def setup(self):
+        self.d = np.ones((50000, 100), dtype=np.float64)
+        self.e = np.ones((100,), dtype=np.float64)
+
+    def time_broadcast(self):
+        self.d - self.e
+
+
+class UFunc(Benchmark):
+    params = [ufuncs]
+    param_names = ['ufunc']
+    timeout = 10
+
+    def setup(self, ufuncname):
+        np.seterr(all='ignore')
+        try:
+            self.f = getattr(np, ufuncname)
+        except AttributeError:
+            raise NotImplementedError()
+        self.args = []
+        for t, a in get_squares_().items():
+            arg = (a,) * self.f.nin
+            try:
+                self.f(*arg)
+            except TypeError:
+                continue
+            self.args.append(arg)
+
+    def time_ufunc_types(self, ufuncname):
+        [self.f(*arg) for arg in self.args]
+
+
+class Custom(Benchmark):
+    def setup(self):
+        self.b = np.ones(20000, dtype=np.bool)
+
+    def time_nonzero(self):
+        np.nonzero(self.b)
+
+    def time_not_bool(self):
+        (~self.b)
+
+    def time_and_bool(self):
+        (self.b & self.b)
+
+    def time_or_bool(self):
+        (self.b | self.b)
+
+
+class CustomInplace(Benchmark):
+    def setup(self):
+        self.c = np.ones(500000, dtype=np.int8)
+        self.i = np.ones(150000, dtype=np.int32)
+        self.f = np.zeros(150000, dtype=np.float32)
+        self.d = np.zeros(75000, dtype=np.float64)
+        # fault memory
+        self.f *= 1.
+        self.d *= 1.
+
+    def time_char_or(self):
+        np.bitwise_or(self.c, 0, out=self.c)
+        np.bitwise_or(0, self.c, out=self.c)
+
+    def time_char_or_temp(self):
+        0 | self.c | 0
+
+    def time_int_or(self):
+        np.bitwise_or(self.i, 0, out=self.i)
+        np.bitwise_or(0, self.i, out=self.i)
+
+    def time_int_or_temp(self):
+        0 | self.i | 0
+
+    def time_float_add(self):
+        np.add(self.f, 1., out=self.f)
+        np.add(1., self.f, out=self.f)
+
+    def time_float_add_temp(self):
+        1. + self.f + 1.
+
+    def time_double_add(self):
+        np.add(self.d, 1., out=self.d)
+        np.add(1., self.d, out=self.d)
+
+    def time_double_add_temp(self):
+        1. + self.d + 1.
+
+
+class CustomScalar(Benchmark):
+    params = [np.float32, np.float64]
+    param_names = ['dtype']
+
+    def setup(self, dtype):
+        self.d = np.ones(20000, dtype=dtype)
+
+    def time_add_scalar2(self, dtype):
+        np.add(self.d, 1)
+
+    def time_divide_scalar2(self, dtype):
+        np.divide(self.d, 1)
+
+    def time_divide_scalar2_inplace(self, dtype):
+        np.divide(self.d, 1, out=self.d)
+
+    def time_less_than_scalar2(self, dtype):
+        (self.d < 1)
+
+
+class Scalar(Benchmark):
+    def setup(self):
+        self.x = np.asarray(1.0)
+        self.y = np.asarray((1.0 + 1j))
+        self.z = complex(1.0, 1.0)
+
+    def time_add_scalar(self):
+        (self.x + self.x)
+
+    def time_add_scalar_conv(self):
+        (self.x + 1.0)
+
+    def time_add_scalar_conv_complex(self):
+        (self.y + self.z)
diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py
new file mode 100644
index 000000000..18a09fd40
--- /dev/null
+++ b/benchmarks/benchmarks/common.py
@@ -0,0 +1,116 @@
+from __future__ import absolute_import, division, print_function
+
+import numpy
+import random
+
+# Various pre-crafted datasets/variables for testing
+# !!! Must not be changed -- only appended !!!
+# while testing numpy we better not rely on numpy to produce random
+# sequences
+random.seed(1)
+# but will seed it nevertheless
+numpy.random.seed(1)
+
+nx, ny = 1000, 1000
+# reduced squares based on indexes_rand, primarily for testing more
+# time-consuming functions (ufunc, linalg, etc)
+nxs, nys = 100, 100
+
+# a set of interesting types to test
+TYPES1 = [
+    'int16', 'float16',
+    'int32', 'float32',
+    'int64', 'float64',  'complex64',
+    'longfloat', 'complex128',
+]
+if 'complex256' in numpy.typeDict:
+    TYPES1.append('complex256')
+
+
+def memoize(func):
+    result = []
+    def wrapper():
+        if not result:
+            result.append(func())
+        return result[0]
+    return wrapper
+
+
+# values which will be used to construct our sample data matrices
+# replicate 10 times to speed up initial imports of this helper
+# and generate some redundancy
+
+@memoize
+def get_values():
+    rnd = numpy.random.RandomState(1)
+    values = numpy.tile(rnd.uniform(0, 100, size=nx*ny//10), 10)
+    return values
+
+
+@memoize
+def get_squares():  # dict: dtype name -> (nx, ny) array built from get_values()
+    values = get_values()
+    squares = {t: numpy.array(values,
+                              dtype=getattr(numpy, t)).reshape((nx, ny))
+               for t in TYPES1}
+
+    # adjust complex ones to have non-degenerate imaginary part -- use
+    # original data transposed for that
+    for t, v in squares.items():
+        if t.startswith('complex'):
+            v += v.T*1j  # in-place, so the array stored in the dict is updated
+    return squares
+
+
+@memoize
+def get_squares_():
+    # smaller squares
+    squares_ = {t: s[:nxs, :nys] for t, s in get_squares().items()}
+    return squares_
+
+
+@memoize
+def get_vectors():
+    # vectors
+    vectors = {t: s[0] for t, s in get_squares().items()}
+    return vectors
+
+
+@memoize
+def get_indexes():
+    indexes = list(range(nx))
+    # so we do not have all items
+    indexes.pop(5)
+    indexes.pop(95)
+
+    indexes = numpy.array(indexes)
+    return indexes
+
+
+@memoize
+def get_indexes_rand():
+    rnd = random.Random(1)
+
+    indexes_rand = get_indexes().tolist()       # copy
+    rnd.shuffle(indexes_rand)         # in-place shuffle
+    indexes_rand = numpy.array(indexes_rand)
+    return indexes_rand
+
+
+@memoize
+def get_indexes_():
+    # smaller versions
+    indexes = get_indexes()
+    indexes_ = indexes[indexes < nxs]
+    return indexes_
+
+
+@memoize
+def get_indexes_rand_():
+    indexes_rand = get_indexes_rand()
+    indexes_rand_ = indexes_rand[indexes_rand < nxs]
+    return indexes_rand_
+
+
+class Benchmark(object):
+    goal_time = 0.25