{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Cython for NumPy users" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To follow the tutorial, see https://cython.readthedocs.io/en/latest/src/userguide/numpy_tutorial.html" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: Disabling color, you really want to install colorlog.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0.29a0\n" ] } ], "source": [ "from __future__ import print_function\n", "%load_ext cython\n", "import Cython\n", "print(Cython.__version__)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "array_1 = np.random.uniform(0, 1000, size=(3000, 2000)).astype(np.intc)\n", "array_2 = np.random.uniform(0, 1000, size=(3000, 2000)).astype(np.intc)\n", "a = 4\n", "b = 3\n", "c = 9" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The first Cython program" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Numpy version" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def compute_np(array_1, array_2, a, b, c):\n", " return np.clip(array_1, 2, 10) * a + array_2 * b + c" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "103 ms ± 2.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "timeit_result = %timeit -o compute_np(array_1, array_2, a, b, c)\n", "np_time = timeit_result.average" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "np_result = compute_np(array_1, array_2, a, b, c)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Pure Python version" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def clip(a, min_value, max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "def compute(array_1, array_2, a, b, c):\n", " \"\"\"\n", " This function must implement the formula\n", " np.clip(array_1, 2, 10) * a + array_2 * b + c\n", "\n", " array_1 and array_2 are 2D.\n", " \"\"\"\n", " x_max = array_1.shape[0]\n", " y_max = array_1.shape[1]\n", " \n", " assert array_1.shape == array_2.shape\n", "\n", " result = np.zeros((x_max, y_max), dtype=array_1.dtype)\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "assert np.all(compute(array_1, array_2, a, b, c) == np_result)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1min 10s ± 844 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "timeit_result = %timeit -o compute(array_1, array_2, a, b, c)\n", "py_time = timeit_result.average" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### We make a function to be able to easily compare timings with the NumPy version and the pure Python version." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def compare_time(current, reference, name):\n", " ratio = reference/current\n", " if ratio > 1:\n", " word = \"faster\"\n", " else:\n", " ratio = 1 / ratio \n", " word = \"slower\"\n", " \n", " print(\"We are\", \"{0:.1f}\".format(ratio), \"times\", word, \"than the\", name, \"version.\")\n", "\n", "def print_report(compute_function):\n", " assert np.all(compute_function(array_1, array_2, a, b, c) == np_result)\n", " timeit_result = %timeit -o compute_function(array_1, array_2, a, b, c)\n", " run_time = timeit_result.average\n", " compare_time(run_time, py_time, \"pure Python\")\n", " compare_time(run_time, np_time, \"NumPy\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Pure Python version compiled with Cython:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%cython -a\n", "import numpy as np\n", "\n", "\n", "def clip(a, min_value, max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "def compute(array_1, array_2, a, b, c):\n", " \"\"\"\n", " This function must implement the formula\n", " np.clip(array_1, 2, 10) * a + array_2 * b + c\n", "\n", " array_1 and array_2 are 2D.\n", " \"\"\"\n", " x_max = array_1.shape[0]\n", " y_max = array_1.shape[1]\n", " \n", " assert array_1.shape == array_2.shape\n", "\n", " result = np.zeros((x_max, y_max), dtype=array_1.dtype)\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "56.5 s ± 587 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", "We are 1.2 times faster than the pure Python version.\n", "We are 546.0 times slower than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding types:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%cython -a\n", "import numpy as np\n", "\n", "# We now need to fix a datatype for our arrays. I've used the variable\n", "# DTYPE for this, which is assigned to the usual NumPy runtime\n", "# type info object.\n", "DTYPE = np.intc\n", "\n", "# cdef means here that this function is a plain C function (so faster).\n", "# To get all the benefits, we type the arguments and the return value as int.\n", "cdef int clip(int a, int min_value, int max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "def compute(array_1, array_2, int a, int b, int c):\n", " \n", " # The \"cdef\" keyword is also used within functions to type variables. It\n", " # can only be used at the top indentation level (there are non-trivial\n", " # problems with allowing them in other places, though we'd love to see\n", " # good and thought out proposals for it).\n", " cdef Py_ssize_t x_max = array_1.shape[0]\n", " cdef Py_ssize_t y_max = array_1.shape[1]\n", " \n", " assert array_1.shape == array_2.shape\n", " assert array_1.dtype == DTYPE\n", " assert array_2.dtype == DTYPE\n", "\n", " result = np.zeros((x_max, y_max), dtype=DTYPE)\n", " \n", " # It is very important to type ALL your variables. You do not get any\n", " # warnings if not, only much slower code (they are implicitly typed as\n", " # Python objects).\n", " # For the \"tmp\" variable, we want to use the same data type as is\n", " # stored in the array, so we use int because it correspond to np.intc.\n", " # NB! An important side-effect of this is that if \"tmp\" overflows its\n", " # datatype size, it will simply wrap around like in C, rather than raise\n", " # an error like in Python.\n", "\n", " cdef int tmp\n", " cdef Py_ssize_t x, y\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "26.5 s ± 422 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", "We are 2.7 times faster than the pure Python version.\n", "We are 256.2 times slower than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Efficient indexing with memoryviews:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%cython -a\n", "import numpy as np\n", "\n", "DTYPE = np.intc\n", "\n", "\n", "cdef int clip(int a, int min_value, int max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):\n", " \n", " cdef Py_ssize_t x_max = array_1.shape[0]\n", " cdef Py_ssize_t y_max = array_1.shape[1]\n", " \n", " assert tuple(array_1.shape) == tuple(array_2.shape)\n", "\n", " result = np.zeros((x_max, y_max), dtype=DTYPE)\n", " cdef int[:, :] result_view = result\n", "\n", " cdef int tmp\n", " cdef Py_ssize_t x, y\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result_view[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "22.9 ms ± 197 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", "We are 3081.0 times faster than the pure Python version.\n", "We are 4.5 times faster than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tuning indexing further:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%cython -a\n", "import numpy as np\n", "cimport cython\n", "\n", "DTYPE = np.intc\n", "\n", "\n", "cdef int clip(int a, int min_value, int max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "@cython.boundscheck(False)\n", "@cython.wraparound(False)\n", "def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):\n", " \n", " cdef Py_ssize_t x_max = array_1.shape[0]\n", " cdef Py_ssize_t y_max = array_1.shape[1]\n", " \n", " assert tuple(array_1.shape) == tuple(array_2.shape)\n", "\n", " result = np.zeros((x_max, y_max), dtype=DTYPE)\n", " cdef int[:, :] result_view = result\n", "\n", " cdef int tmp\n", " cdef Py_ssize_t x, y\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result_view[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "16.8 ms ± 25.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", "We are 4200.7 times faster than the pure Python version.\n", "We are 6.2 times faster than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Declaring the NumPy arrays as contiguous." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "%%cython\n", "import numpy as np\n", "cimport cython\n", "\n", "DTYPE = np.intc\n", "\n", "\n", "cdef int clip(int a, int min_value, int max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "@cython.boundscheck(False)\n", "@cython.wraparound(False)\n", "def compute(int[:, ::1] array_1, int[:, ::1] array_2, int a, int b, int c):\n", " \n", " cdef Py_ssize_t x_max = array_1.shape[0]\n", " cdef Py_ssize_t y_max = array_1.shape[1]\n", " \n", " assert tuple(array_1.shape) == tuple(array_2.shape)\n", "\n", " result = np.zeros((x_max, y_max), dtype=DTYPE)\n", " cdef int[:, ::1] result_view = result\n", "\n", " cdef int tmp\n", " cdef Py_ssize_t x, y\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result_view[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11.1 ms ± 30.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", "We are 6350.9 times faster than the pure Python version.\n", "We are 9.3 times faster than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Making the function cleaner" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%cython -a\n", "# cython: infer_types=True\n", "import numpy as np\n", "cimport cython\n", "\n", "DTYPE = np.intc\n", "\n", "\n", "cdef int clip(int a, int min_value, int max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "@cython.boundscheck(False)\n", "@cython.wraparound(False)\n", "def compute(int[:, ::1] array_1, int[:, ::1] array_2, int a, int b, int c):\n", " \n", " x_max = array_1.shape[0]\n", " y_max = array_1.shape[1]\n", " \n", " assert tuple(array_1.shape) == tuple(array_2.shape)\n", "\n", " result = np.zeros((x_max, y_max), dtype=DTYPE)\n", " cdef int[:, ::1] result_view = result\n", "\n", " cdef int tmp\n", " cdef Py_ssize_t x, y\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result_view[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11.5 ms ± 261 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", "We are 6131.2 times faster than the pure Python version.\n", "We are 9.0 times faster than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### More generic code:" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%cython\n", "# cython: infer_types=True\n", "import numpy as np\n", "cimport cython\n", "\n", "ctypedef fused my_type:\n", " int\n", " double\n", " long long\n", "\n", "\n", "cdef my_type clip(my_type a, my_type min_value, my_type max_value):\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "@cython.boundscheck(False)\n", "@cython.wraparound(False)\n", "def compute(my_type[:, ::1] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):\n", " \n", " x_max = array_1.shape[0]\n", " y_max = array_1.shape[1]\n", " \n", " assert tuple(array_1.shape) == tuple(array_2.shape)\n", " \n", " if my_type is int:\n", " dtype = np.intc\n", " elif my_type is double:\n", " dtype = np.double\n", " elif my_type is cython.longlong:\n", " dtype = np.double\n", " \n", " result = np.zeros((x_max, y_max), dtype=dtype)\n", " cdef my_type[:, ::1] result_view = result\n", "\n", " cdef my_type tmp\n", " cdef Py_ssize_t x, y\n", "\n", " for x in range(x_max):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result_view[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "arr_1_float = array_1.astype(np.float64)\n", "arr_2_float = array_2.astype(np.float64)\n", "\n", "float_cython_result = compute(arr_1_float, arr_2_float, a, b, c)\n", "float_numpy_result = compute_np(arr_1_float, arr_2_float, a, b, c)\n", "\n", "assert np.all(float_cython_result == float_numpy_result)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11.5 ms ± 258 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", "We are 6153.1 times faster than the pure Python version.\n", "We are 9.0 times faster than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using multiple threads" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%cython --force\n", "# distutils: extra_compile_args=-fopenmp\n", "# distutils: extra_link_args=-fopenmp\n", "import numpy as np\n", "cimport cython\n", "from cython.parallel import prange\n", "\n", "ctypedef fused my_type:\n", " int\n", " double\n", " long long\n", "\n", "\n", "# We declare our plain c function nogil\n", "cdef my_type clip(my_type a, my_type min_value, my_type max_value) nogil:\n", " return min(max(a, min_value), max_value)\n", "\n", "\n", "@cython.boundscheck(False)\n", "@cython.wraparound(False)\n", "def compute(my_type[:, ::1] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):\n", " \n", " cdef Py_ssize_t x_max = array_1.shape[0]\n", " cdef Py_ssize_t y_max = array_1.shape[1]\n", " \n", " assert tuple(array_1.shape) == tuple(array_2.shape)\n", " \n", " if my_type is int:\n", " dtype = np.intc\n", " elif my_type is double:\n", " dtype = np.double\n", " elif my_type is cython.longlong:\n", " dtype = np.longlong\n", " \n", " result = np.zeros((x_max, y_max), dtype=dtype)\n", " cdef my_type[:, ::1] result_view = result\n", "\n", " cdef my_type tmp\n", " cdef Py_ssize_t x, y\n", "\n", " # We use prange here.\n", " for x in prange(x_max, nogil=True):\n", " for y in range(y_max):\n", "\n", " tmp = clip(array_1[x, y], 2, 10)\n", " tmp = tmp * a + array_2[x, y] * b\n", " result_view[x, y] = tmp + c\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9.33 ms ± 412 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", "We are 7559.0 times faster than the pure Python version.\n", "We are 11.1 times faster than the NumPy version.\n" ] } ], "source": [ "print_report(compute)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }