7 files changed, 188 insertions, 10 deletions
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 126b861bf..69e690f28 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -177,6 +177,28 @@ NPY_INPLACE npy_long npy_lcml(npy_long a, npy_long b);
 NPY_INPLACE npy_longlong npy_gcdll(npy_longlong a, npy_longlong b);
 NPY_INPLACE npy_longlong npy_lcmll(npy_longlong a, npy_longlong b);
 
+NPY_INPLACE npy_ubyte npy_rshiftuhh(npy_ubyte a, npy_ubyte b);
+NPY_INPLACE npy_ubyte npy_lshiftuhh(npy_ubyte a, npy_ubyte b);
+NPY_INPLACE npy_ushort npy_rshiftuh(npy_ushort a, npy_ushort b);
+NPY_INPLACE npy_ushort npy_lshiftuh(npy_ushort a, npy_ushort b);
+NPY_INPLACE npy_uint npy_rshiftu(npy_uint a, npy_uint b);
+NPY_INPLACE npy_uint npy_lshiftu(npy_uint a, npy_uint b);
+NPY_INPLACE npy_ulong npy_rshiftul(npy_ulong a, npy_ulong b);
+NPY_INPLACE npy_ulong npy_lshiftul(npy_ulong a, npy_ulong b);
+NPY_INPLACE npy_ulonglong npy_rshiftull(npy_ulonglong a, npy_ulonglong b);
+NPY_INPLACE npy_ulonglong npy_lshiftull(npy_ulonglong a, npy_ulonglong b);
+
+NPY_INPLACE npy_byte npy_rshifthh(npy_byte a, npy_byte b);
+NPY_INPLACE npy_byte npy_lshifthh(npy_byte a, npy_byte b);
+NPY_INPLACE npy_short npy_rshifth(npy_short a, npy_short b);
+NPY_INPLACE npy_short npy_lshifth(npy_short a, npy_short b);
+NPY_INPLACE npy_int npy_rshift(npy_int a, npy_int b);
+NPY_INPLACE npy_int npy_lshift(npy_int a, npy_int b);
+NPY_INPLACE npy_long npy_rshiftl(npy_long a, npy_long b);
+NPY_INPLACE npy_long npy_lshiftl(npy_long a, npy_long b);
+NPY_INPLACE npy_longlong npy_rshiftll(npy_longlong a, npy_longlong b);
+NPY_INPLACE npy_longlong npy_lshiftll(npy_longlong a, npy_longlong b);
+
 /*
  * avx function has a common API for both sin & cos. This enum is used to
  * distinguish between the two
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 5ac7752cc..63b515b18 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -463,6 +463,12 @@ def configuration(parent_package='',top_path=None):
             rep = check_long_double_representation(config_cmd)
             moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1))
 
+            if check_for_right_shift_internal_compiler_error(config_cmd):
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_LONG_right_shift')
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONG_right_shift')
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_LONGLONG_right_shift')
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONGLONG_right_shift')
+
             # Py3K check
             if sys.version_info[0] >= 3:
                 moredefs.append(('NPY_PY3K', 1))
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index a3f7acd6d..84b78b585 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -5,6 +5,7 @@ import sys
 import warnings
 import copy
 import binascii
+import textwrap
 
 from numpy.distutils.misc_util import mingw32
 
@@ -415,3 +416,41 @@ def long_double_representation(lines):
     else:
         # We never detected the after_sequence
         raise ValueError("Could not lock sequences (%s)" % saw)
+
+
+def check_for_right_shift_internal_compiler_error(cmd):
+    """
+    On our arm CI, this fails with an internal compilation error
+
+    The failure looks like the following, and can be reproduced on ARM64 GCC 5.4:
+
+        <source>: In function 'right_shift':
+        <source>:4:20: internal compiler error: in expand_shift_1, at expmed.c:2349
+               ip1[i] = ip1[i] >> in2;
+                      ^
+        Please submit a full bug report,
+        with preprocessed source if appropriate.
+        See <http://gcc.gnu.org/bugs.html> for instructions.
+        Compiler returned: 1
+
+    This function returns True if this compiler bug is present, and we need to
+    turn off optimization for the function
+    """
+    cmd._check_compiler()
+    has_optimize = cmd.try_compile(textwrap.dedent("""\
+        __attribute__((optimize("O3"))) void right_shift() {}
+        """), None, None)
+    if not has_optimize:
+        return False
+
+    no_err = cmd.try_compile(textwrap.dedent("""\
+        typedef long the_type;  /* fails also for unsigned and long long */
+        __attribute__((optimize("O3"))) void right_shift(the_type in2, the_type *ip1, int n) {
+            for (int i = 0; i < n; i++) {
+                if (in2 < (the_type)sizeof(the_type) * 8) {
+                    ip1[i] = ip1[i] >> in2;
+                }
+            }
+        }
+        """), None, None)
+    return not no_err
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index fa820baac..18b6d1434 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -716,3 +716,44 @@ npy_@func@@c@(@type@ a, @type@ b)
     return npy_@func@u@c@(a < 0 ? -a : a, b < 0 ? -b : b);
 }
 /**end repeat**/
+
+/* Unlike LCM and GCD, we need byte and short variants for the shift operators,
+ * since the result is dependent on the width of the type
+ */
+/**begin repeat
+ *
+ * #type = byte, short, int, long, longlong#
+ * #c = hh,h,,l,ll#
+ */
+/**begin repeat1
+ *
+ * #u         = u,#
+ * #is_signed = 0,1#
+ */
+NPY_INPLACE npy_@u@@type@
+npy_lshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return a << b;
+    }
+    else {
+        return 0;
+    }
+}
+NPY_INPLACE npy_@u@@type@
+npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return a >> b;
+    }
+#if @is_signed@
+    else if (a < 0) {
+        return (npy_@u@@type@)-1;  /* preserve the sign bit */
+    }
+#endif
+    else {
+        return 0;
+    }
+}
+/**end repeat1**/
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 2028a5712..5443223ab 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -699,6 +699,7 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
  * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
  *          npy_double, npy_double, npy_double, npy_double#
  * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
  */
 
 #define @TYPE@_floor_divide @TYPE@_divide
@@ -776,16 +777,15 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 
 /**begin repeat2
  * Arithmetic
- * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
- *          left_shift, right_shift#
- * #OP = +, -,*, &, |, ^, <<, >>#
+ * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = +, -, *, &, |, ^#
  */
 
 #if @CHK@
 NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 @TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    if(IS_BINARY_REDUCE) {
+    if (IS_BINARY_REDUCE) {
         BINARY_REDUCE_LOOP(@type@) {
             io1 @OP@= *(@type@ *)ip2;
         }
@@ -799,6 +799,47 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 
 /**end repeat2**/
 
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ *       which is undefined in C.
+ */
+
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_left_shift@isa@(char **args, npy_intp *dimensions, npy_intp *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#undef INT_left_shift_needs_clear_floatstatus
+#undef UINT_left_shift_needs_clear_floatstatus
+
+NPY_NO_EXPORT
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+NPY_GCC_OPT_3
+#endif
+void
+@TYPE@_right_shift@isa@(char **args, npy_intp *dimensions, npy_intp *steps,
+                   void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+}
+
+
 /**begin repeat2
  * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
  *         logical_and, logical_or#
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index a7987acda..df440e095 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -246,25 +246,26 @@ static void
 /**end repeat**/
 
 
-
-/* QUESTION:  Should we check for overflow / underflow in (l,r)shift? */
-
 /**begin repeat
  * #name = byte, ubyte, short, ushort, int, uint,
  *         long, ulong, longlong, ulonglong#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #suffix = hh,uhh,h,uh,,u,l,ul,ll,ull#
  */
 
 /**begin repeat1
- * #oper = and, xor, or, lshift, rshift#
- * #op = &, ^, |, <<, >>#
+ * #oper = and, xor, or#
+ * #op = &, ^, |#
  */
 
 #define @name@_ctype_@oper@(arg1, arg2, out) *(out) = (arg1) @op@ (arg2)
 
 /**end repeat1**/
 
+#define @name@_ctype_lshift(arg1, arg2, out) *(out) = npy_lshift@suffix@(arg1, arg2)
+#define @name@_ctype_rshift(arg1, arg2, out) *(out) = npy_rshift@suffix@(arg1, arg2)
+
 /**end repeat**/
 
 /**begin repeat
@@ -570,7 +571,7 @@ static void
  * 1) Convert the types to the common type if both are scalars (0 return)
  * 2) If both are not scalars use ufunc machinery (-2 return)
  * 3) If both are scalars but cannot be cast to the right type
- * return NotImplmented (-1 return)
+ * return NotImplemented (-1 return)
  *
  * 4) Perform the function on the C-type.
  * 5) If an error condition occurred, check to see
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index ebba457e3..854df5590 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -664,3 +664,31 @@ class TestAbs(object):
 
     def test_numpy_abs(self):
         self._test_abs_func(np.abs)
+
+
+class TestBitShifts(object):
+
+    @pytest.mark.parametrize('type_code', np.typecodes['AllInteger'])
+    @pytest.mark.parametrize('op',
+        [operator.rshift, operator.lshift], ids=['>>', '<<'])
+    def test_shift_all_bits(self, type_code, op):
+        """ Shifts where the shift amount is the width of the type or wider """
+        # gh-2449
+        dt = np.dtype(type_code)
+        nbits = dt.itemsize * 8
+        for val in [5, -5]:
+            for shift in [nbits, nbits + 4]:
+                val_scl = dt.type(val)
+                shift_scl = dt.type(shift)
+                res_scl = op(val_scl, shift_scl)
+                if val_scl < 0 and op is operator.rshift:
+                    # sign bit is preserved
+                    assert_equal(res_scl, -1)
+                else:
+                    assert_equal(res_scl, 0)
+
+                # Result on scalars should be the same as on arrays
+                val_arr = np.array([val]*32, dtype=dt)
+                shift_arr = np.array([shift]*32, dtype=dt)
+                res_arr = op(val_arr, shift_arr)
+                assert_equal(res_arr, res_scl)