Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/code_generators/generate_umath.py |   8
-rw-r--r-- | numpy/core/include/numpy/npy_common.h        |   8
-rw-r--r-- | numpy/core/include/numpy/npy_math.h          |  26
-rw-r--r-- | numpy/core/setup_common.py                   |   5
-rw-r--r-- | numpy/core/src/umath/cpuid.c                 |  22
-rw-r--r-- | numpy/core/src/umath/loops.c.src             |  33
-rw-r--r-- | numpy/core/src/umath/loops.h.src             |   4
-rw-r--r-- | numpy/core/src/umath/simd.inc.src            | 315
-rw-r--r-- | numpy/core/tests/test_umath.py               |  41
9 files changed, 410 insertions, 52 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1747272..ae871ea6f 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -662,6 +662,8 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cos'),
           None,
+          TD('e', f='cos', astype={'e':'f'}),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='cos', astype={'e':'f'}),
           TD(P, f='cos'),
           ),
@@ -669,6 +671,8 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sin'),
           None,
+          TD('e', f='sin', astype={'e':'f'}),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
          TD(inexact, f='sin', astype={'e':'f'}),
           TD(P, f='sin'),
           ),
@@ -705,7 +709,7 @@ defdict = {
           docstrings.get('numpy.core.umath.exp'),
           None,
           TD('e', f='exp', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='exp', astype={'e':'f'}),
           TD(P, f='exp'),
           ),
@@ -728,7 +732,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log'),
           None,
           TD('e', f='log', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='log', astype={'e':'f'}),
           TD(P, f='log'),
           ),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 108c0a202..27b83f7b5 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -44,10 +44,14 @@
 #else
 #define NPY_GCC_TARGET_AVX
 #endif
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+#define HAVE_ATTRIBUTE_TARGET_FMA
+#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
+#endif
+
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
 #define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
 #else
 #define NPY_GCC_TARGET_AVX2
 #endif
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 6a78ff3c2..126b861bf 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -144,7 +144,22 @@ NPY_INLINE static float __npy_nzerof(void)
 #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
 #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
 #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-
+/*
+ * Constants used in vector implementation of sinf/cosf(x)
+ */
+#define NPY_TWO_O_PIf 0x1.45f306p-1f
+#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
+#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
+#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
+#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
+#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
+#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
+#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
+#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
+#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
+#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
+#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
+#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
 /*
  * Integer functions.
  */
@@ -163,6 +178,15 @@ NPY_INPLACE npy_longlong npy_gcdll(npy_longlong a, npy_longlong b);
 NPY_INPLACE npy_longlong npy_lcmll(npy_longlong a, npy_longlong b);
 
 /*
+ * avx function has a common API for both sin & cos. This enum is used to
+ * distinguish between the two
+ */
+typedef enum {
+    npy_compute_sin,
+    npy_compute_cos
+} NPY_TRIG_OP;
+
+/*
 * C99 double math funcs
  */
 NPY_INPLACE double npy_sin(double x);
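A note on the three NPY_CODY_WAITE_PI_O_2_* constants added above: they are pi/2 split into high, medium, and low parts, each exactly representable in float32, so that x - y*pi/2 can later be computed in three steps without a single large rounding error. A minimal standalone check (the driver program is ours, only the hex-float constants come from the patch):

    #include <stdio.h>

    /* Hex-float constants copied from the npy_math.h hunk above */
    #define C1 -0x1.921fb0p+00
    #define C2 -0x1.5110b4p-22
    #define C3 -0x1.846988p-48

    int main(void)
    {
        /* The three parts reassemble -pi/2 to well beyond float32
         * precision: prints approximately -1.5707963267948966 */
        printf("%.16f\n", C1 + C2 + C3);
        return 0;
    }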
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 6e3109ab5..a3f7acd6d 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -178,9 +178,10 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
 # gcc 4.8.4 support attributes but not with intrisics
 # tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
 # function name will be converted to HAVE_<upper-case-name> preprocessor macro
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
                                                 'attribute_target_avx2_with_intrinsics',
-                                                '__m256 temp = _mm256_set1_ps(1.0)',
+                                                '__m256 temp = _mm256_set1_ps(1.0); temp = \
+                                                _mm256_fmadd_ps(temp, temp, temp)',
                                                 'immintrin.h'),
                                                 ('__attribute__((target("avx512f")))',
                                                 'attribute_target_avx512f_with_intrinsics',
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 8673f1736..72c6493e8 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -48,6 +48,25 @@ int os_avx512_support(void)
 #endif
 }
 
+static NPY_INLINE
+int cpu_supports_fma(void)
+{
+#ifdef __x86_64__
+    unsigned int feature = 0x01;
+    unsigned int a, b, c, d;
+    __asm__ volatile (
+        "cpuid" "\n\t"
+        : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
+        : "a" (feature));
+    /*
+     * FMA is the 12th bit of ECX
+     */
+    return (c >> 12) & 1;
+#else
+    return 0;
+#endif
+}
+
 /*
  * Primitive cpu feature detect function
  * Currently only supports checking for avx on gcc compatible compilers.
@@ -63,6 +82,9 @@ npy_cpu_supports(const char * feature)
         return 0;
 #endif
     }
+    else if (strcmp(feature, "fma") == 0) {
+        return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
+    }
     else if (strcmp(feature, "avx2") == 0) {
         return __builtin_cpu_supports("avx2") && os_avx_support();
     }
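The inline-asm query above reads CPUID leaf 1 and tests ECX bit 12, the FMA3 feature flag. On GCC and Clang the same check can be written with the <cpuid.h> helper; a sketch of the equivalent (the function name is ours, this is not code from the patch):

    #include <cpuid.h>  /* GCC/Clang header: __get_cpuid() and the bit_FMA constant */

    /* Same test as cpu_supports_fma() above: CPUID.1:ECX.FMA[bit 12] */
    static int cpu_supports_fma_alt(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
            return 0;  /* CPUID leaf 1 not available */
        }
        return (ecx & bit_FMA) != 0;  /* bit_FMA == (1 << 12) */
    }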
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 1a4885133..2028a5712 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1594,8 +1594,8 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
- * #func = exp, log#
- * #scalarf = npy_expf, npy_logf#
+ * #func = sin, cos, exp, log#
+ * #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
  */
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -1610,8 +1610,8 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
 /**end repeat**/
 
 /**begin repeat
- * #isa = avx512f, avx2#
- * #ISA = AVX512F, AVX2#
+ * #isa = avx512f, fma#
+ * #ISA = AVX512F, FMA#
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
  */
@@ -1642,6 +1642,31 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 /**end repeat1**/
 
+
+/**begin repeat1
+ * #func = cos, sin#
+ * #enum = npy_compute_cos, npy_compute_sin#
+ * #scalarf = npy_cosf, npy_sinf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) {
+        UNARY_LOOP {
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@);
+#else
+            const npy_float in1 = *(npy_float *)ip1;
+            *(npy_float *)op1 = @scalarf@(in1);
+#endif
+        }
+    }
+}
+
+/**end repeat1**/
+
+
 /**end repeat**/
 
 /**begin repeat
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 7f05a693a..5070ab38b 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -178,13 +178,13 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
- * #func = exp, log#
+ * #func = sin, cos, exp, log#
  */
 NPY_NO_EXPORT void
 FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
 /**begin repeat1
- * #isa = avx512f, avx2#
+ * #isa = avx512f, fma#
  */
 NPY_NO_EXPORT void
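For readers unfamiliar with NumPy's .src templating: the code generator expands each @token@ from the repeat lists, so the FMA/sin instance of the new loop above roughly becomes the following (a hand-expanded sketch for illustration, not literal generated output; it relies on internal macros such as UNARY_LOOP from the surrounding file):

    NPY_NO_EXPORT NPY_GCC_OPT_3 void
    FLOAT_sin_fma(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
    {
        /* Try the vector routine first; it declines (returns 0) when the
         * strides make blocking unsafe. */
        if (!run_unary_fma_sincos_FLOAT(args, dimensions, steps, npy_compute_sin)) {
            UNARY_LOOP {
    #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
                /* fall back to the vector kernel one element at a time */
                FMA_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], npy_compute_sin);
    #else
                const npy_float in1 = *(npy_float *)ip1;
                *(npy_float *)op1 = npy_sinf(in1);
    #endif
            }
        }
    }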
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 9816a1da4..7aec1ff49 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,8 +130,9 @@ abs_ptrdiff(char *a, char *b)
  */
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512f#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512f#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  * #REGISTER_SIZE = 32, 64#
  */
 
@@ -141,7 +142,7 @@ abs_ptrdiff(char *a, char *b)
  * #func = exp, log#
  */
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
 @ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
 #endif
 
@@ -149,7 +150,7 @@ static NPY_INLINE void
 static NPY_INLINE int
 run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 {
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
     if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
         @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
         return 1;
@@ -162,6 +163,25 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 /**end repeat1**/
 
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
+#endif
+
+static NPY_INLINE int
+run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, NPY_TRIG_OP my_trig_op)
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
 /**end repeat**/
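AVX2 has no mask registers, which is why the FMA helpers further down fake one with float sign bits: a sliding window over an array of -1.0f/1.0f values yields a load mask for any tail length, and _mm256_maskload_ps honors only the high bit of each lane. A self-contained illustration of the trick (our own demo, compile with gcc -mavx2; only the maskint table mirrors the patch):

    #include <immintrin.h>
    #include <stdio.h>

    static const float maskint[16] = {-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,
                                       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};

    int main(void)
    {
        float data[3] = {10.0f, 20.0f, 30.0f};  /* only 3 valid elements remain */
        int num_elem = 3, num_lanes = 8;
        /* window starts at 8-3=5: three -1.0f lanes, then five 1.0f lanes */
        __m256 mask = _mm256_loadu_ps(maskint + num_lanes - num_elem);
        /* cvtps turns -1.0f/1.0f into -1/1; maskload reads only where the MSB is set,
         * so the out-of-bounds lanes past data[2] are never touched */
        __m256 x = _mm256_maskload_ps(data, _mm256_cvtps_epi32(mask));
        float out[8];
        _mm256_storeu_ps(out, x);
        for (int ii = 0; ii < 8; ii++) {
            printf("%g ", out[ii]);  /* prints: 10 20 30 0 0 0 0 0 */
        }
        printf("\n");
        return 0;
    }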
@@ -1123,20 +1143,14 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_fmadd(__m256 a, __m256 b, __m256 c)
-{
-    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_full_load_mask(void)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_full_load_mask(void)
 {
     return _mm256_set1_ps(-1.0);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                          1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
@@ -1144,8 +1158,8 @@ avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
     return _mm256_loadu_ps(addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_gather(__m256 src,
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_gather(__m256 src,
                    npy_float* addr,
                    __m256i vindex,
                    __m256 mask)
@@ -1153,26 +1167,39 @@ avx2_masked_gather(__m256 src,
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_load(__m256 mask, npy_float* addr)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_load(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_blend(__m256 x, __m256 y, __m256 ymask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
     return _mm256_blendv_ps(x, y, ymask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_exponent(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
+{
+    return _mm256_cvtepi32_ps(
+        _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
+{
+    return fma_should_calculate_sine(k, andop, cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_exponent(__m256 x)
 {
     /*
      * Special handling of denormals:
@@ -1198,8 +1225,8 @@ avx2_get_exponent(__m256 x)
     return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_mantissa(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_mantissa(__m256 x)
 {
     /*
      * Special handling of denormals:
@@ -1225,7 +1252,7 @@ avx2_get_mantissa(__m256 x)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_scalef_ps(__m256 poly, __m256 quadrant)
+fma_scalef_ps(__m256 poly, __m256 quadrant)
 {
     /*
      * Handle denormals (which occur when quadrant <= -125):
@@ -1305,6 +1332,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
     return _mm512_mask_mov_ps(x, ymask, y);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
+{
+    return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
+{
+    return avx512_should_calculate_sine(k, andop, cmp);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_get_exponent(__m512 x)
 {
@@ -1325,17 +1364,28 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
 #endif
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
  * #vtype = __m256, __m512#
  * #vsize = 256, 512#
  * #or = or_ps, kor#
 * #vsub = , _mask#
  * #mask = __m256, __mmask16#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 **/
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+#if defined @CHK@
+
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
+ */
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 @isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
 {
@@ -1344,12 +1394,56 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
     reduced_x = @fmadd@(y, c3, reduced_x);
     return reduced_x;
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
+@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin)
+{
+    @mask@ m1 = _mm@vsize@_cmp_ps@vsub@(
+        x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ);
+    @mask@ m2 = _mm@vsize@_cmp_ps@vsub@(
+        x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ);
+    return _mm@vsize@_@or@(m1,m2);
+}
+
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
+             @vtype@ invf2, @vtype@ invf0)
+{
+    @vtype@ cos = @fmadd@(invf8, x2, invf6);
+    cos = @fmadd@(cos, x2, invf4);
+    cos = @fmadd@(cos, x2, invf2);
+    cos = @fmadd@(cos, x2, invf0);
+    return cos;
+}
+
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
+           @vtype@ invf5, @vtype@ invf3,
+           @vtype@ zero)
+{
+    @vtype@ sin = @fmadd@(invf9, x2, invf7);
+    sin = @fmadd@(sin, x2, invf5);
+    sin = @fmadd@(sin, x2, invf3);
+    sin = @fmadd@(sin, x2, zero);
+    sin = @fmadd@(sin, x, x);
+    return sin;
+}
 #endif
 /**end repeat**/
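Putting the kernels together, the reduction and polynomial steps can be followed in scalar form. The sketch below is our own illustrative rewrite (not code from the patch, compile with -lm) using the coefficients defined in npy_math.h; like the vector version it is only accurate below the large-number cutoffs, where the patch falls back to glibc:

    #include <math.h>
    #include <stdio.h>

    /* Scalar walk-through: Cody-Waite reduction, the two minimax
     * polynomials, then quadrant-based selection and negation. */
    static float sinf_sketch(float x)
    {
        /* y = rint(x * 2/pi); the quadrant is k = (int)y mod 4 */
        float y = rintf(x * 0x1.45f306p-1f);
        int k = (int)y;

        /* x* = x - y*pi/2 in three FMA steps (NPY_CODY_WAITE_PI_O_2_*) */
        float r = fmaf(y, -0x1.921fb0p+00f, x);
        r = fmaf(y, -0x1.5110b4p-22f, r);
        r = fmaf(y, -0x1.846988p-48f, r);
        float r2 = r * r;

        /* Horner evaluation on [-pi/4, pi/4] (NPY_COEFF_INVF*_COSINEf/SINEf) */
        float cos = fmaf(0x1.98e616p-16f, r2, -0x1.6c06dcp-10f);
        cos = fmaf(cos, r2, 0x1.55553cp-05f);
        cos = fmaf(cos, r2, -0x1.000000p-01f);
        cos = fmaf(cos, r2, 0x1.000000p+00f);

        float sin = fmaf(0x1.7d3bbcp-19f, r2, -0x1.a06bbap-13f);
        sin = fmaf(sin, r2, 0x1.11119ap-07f);
        sin = fmaf(sin, r2, -0x1.555556p-03f);
        sin = fmaf(sin * r2, r, r);

        /* sin(x) = sin(r), cos(r), -sin(r), -cos(r) for k mod 4 = 0,1,2,3 */
        float result = (k & 1) ? cos : sin;
        return (k & 2) ? -result : result;
    }

    int main(void)
    {
        printf("%f vs %f\n", sinf_sketch(1.0f), sinf(1.0f));  /* both ~0.841471 */
        return 0;
    }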
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
  * #vtype = __m256, __m512#
  * #vsize = 256, 512#
  * #BYTES = 32, 64#
@@ -1358,13 +1452,165 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * #or_masks =_mm256_or_ps, _mm512_kor#
  * #and_masks =_mm256_and_ps, _mm512_kand#
  * #xor_masks =_mm256_xor_ps, _mm512_kxor#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
 * #mask_to_int = _mm256_movemask_ps, #
  * #full_mask= 0xFF, 0xFFFF#
  * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
  * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  */
+
+
+/*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in ZMM/YMM registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction performs poorly
+ * leading to catastrophic cancellation. We compute cosine by calling glibc in
+ * a scalar fashion.
+ * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
+ * 5-7x faster than scalar implementations when magnitude of all elements in
+ * the array < 71476.0625f (117435.992f for sine). Worst case performance is
+ * when all the elements are large leading to about 1-2% reduction in
+ * performance.
+ */
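One step of the loop below deserves a gloss: the add/subtract pair with cvt_magic implements rint(). Adding 1.5 * 2^23 pushes the fraction bits of any |x| < 2^22 out of the float32 mantissa, so subtracting the constant back leaves x rounded to the nearest integer (ties to even). The value of NPY_RINT_CVT_MAGICf is not shown in this diff; 0x1.8p+23f below is our assumption based on the existing exp/log code:

    #include <stdio.h>

    int main(void)
    {
        /* assumed value of NPY_RINT_CVT_MAGICf: 1.5 * 2^23 = 12582912.0f */
        volatile float magic = 0x1.8p+23f;
        float x = 2.5f;
        /* add-then-subtract rounds to nearest; requires round-to-nearest
         * mode and no extended precision (true for SSE/AVX float math) */
        float rounded = (x + magic) - magic;
        printf("%g -> %g\n", x, rounded);  /* 2.5 -> 2 (ties to even) */
        return 0;
    }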
+#if defined @CHK@
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_sincos_FLOAT(npy_float * op,
+                   npy_float * ip,
+                   const npy_intp array_size,
+                   const npy_intp steps,
+                   NPY_TRIG_OP my_trig_op)
+{
+    const npy_intp stride = steps/sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    npy_float large_number = 71476.0625f;
+    if (my_trig_op == npy_compute_sin) {
+        large_number = 117435.992f;
+    }
+
+    /* Load up frequently used constants */
+    @vtype@i zeros = _mm@vsize@_set1_epi32(0);
+    @vtype@i ones = _mm@vsize@_set1_epi32(1);
+    @vtype@i twos = _mm@vsize@_set1_epi32(2);
+    @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
+    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
+    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
+    @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
+    @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
+    @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
+    @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
+    @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
+    @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
+    @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
+    @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
+    @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
+    @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
+    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
+    @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
+    @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
+    @vtype@i iquadrant;
+    @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
+    @mask@ load_mask = @isa@_get_full_load_mask();
+    npy_intp num_remaining_elements = array_size;
+    npy_int indexarr[16];
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
+
+    while (num_remaining_elements > 0) {
+
+        if (num_remaining_elements < num_lanes) {
+            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+                                                    num_lanes);
+        }
+
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load(load_mask, ip);
+        }
+        else {
+            x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+        }
+
+        /*
+         * For elements outside of this range, Cody-Waite's range reduction
+         * becomes inaccurate and we will call glibc to compute cosine for
+         * these numbers
+         */
+
+        glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
+        glibc_mask = @and_masks@(load_mask, glibc_mask);
+        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
+        x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+        npy_int iglibc_mask = @mask_to_int@(glibc_mask);
+
+        if (iglibc_mask != @full_mask@) {
+            quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
+
+            /* round to nearest */
+            quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
+            quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
+
+            /* Cody-Waite's range reduction algorithm */
+            reduced_x = @isa@_range_reduction(x, quadrant,
+                                              codyw_c1, codyw_c2, codyw_c3);
+            reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
+
+            /* compute cosine and sine */
+            cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
+                               cos_invf2, cos_invf0);
+            sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
+                             sin_invf5, sin_invf3, zero_f);
+
+            iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
+            if (my_trig_op == npy_compute_cos) {
+                iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
+            }
+
+            /* blend sin and cos based on the quadrant */
+            sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
+            cos = @isa@_blend(cos, sin, sine_mask);
+
+            /* multiply by -1 for appropriate elements */
+            negate_mask = @isa@_should_negate(iquadrant, twos, twos);
+            cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
+            cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+
+            @masked_store@(op, @cvtps_epi32@(load_mask), cos);
+        }
+
+        /* process elements using glibc for large elements */
+        if (my_trig_op == npy_compute_cos) {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01) {
+                    op[ii] = npy_cosf(ip[ii]);
+                }
+                iglibc_mask = iglibc_mask >> 1;
+            }
+        }
+        else {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01) {
+                    op[ii] = npy_sinf(ip[ii]);
+                }
+                iglibc_mask = iglibc_mask >> 1;
+            }
+        }
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
 
 /*
  * Vectorized implementation of exp using AVX2 and AVX512:
@@ -1381,7 +1627,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * same x = 0xc2781e37)
  */
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 @ISA@_exp_FLOAT(npy_float * op,
                 npy_float * ip,
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 91d403d98..ef48fed05 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -678,20 +678,49 @@ class TestSpecialFloats(object):
         assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
         assert_raises(FloatingPointError, np.log, np.float32(-1.0))
 
-class TestExpLogFloat32(object):
+    def test_sincos_values(self):
+        with np.errstate(all='ignore'):
+            x = [np.nan,  np.nan,  np.nan,  np.nan]
+            y = [np.nan, -np.nan,  np.inf, -np.inf]
+            for dt in ['f', 'd', 'g']:
+                xf = np.array(x, dtype=dt)
+                yf = np.array(y, dtype=dt)
+                assert_equal(np.sin(yf), xf)
+                assert_equal(np.cos(yf), xf)
+
+        with np.errstate(invalid='raise'):
+            assert_raises(FloatingPointError, np.sin, np.float32(-np.inf))
+            assert_raises(FloatingPointError, np.sin, np.float32(np.inf))
+            assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
+            assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
+
+
+class TestSIMDFloat32(object):
     def test_exp_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
         x_f64 = np.float64(x_f32)
-        assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)
+        assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=3)
 
     def test_log_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
         x_f64 = np.float64(x_f32)
-        assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
+        assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=4)
+
+    def test_sincos_float32(self):
+        np.random.seed(42)
+        N = 1000000
+        M = np.int(N/20)
+        index = np.random.randint(low=0, high=N, size=M)
+        x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=N))
+        # test coverage for elements > 117435.992f for which glibc is used
+        x_f32[index] = np.float32(10E+10*np.random.rand(M))
+        x_f64 = np.float64(x_f32)
+        assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
+        assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)
 
-    def test_strided_exp_log_float32(self):
+    def test_strided_float32(self):
         np.random.seed(42)
         strides = np.random.randint(low=-100, high=100, size=100)
         sizes = np.random.randint(low=1, high=2000, size=100)
@@ -699,9 +728,13 @@ class TestExpLogFloat32(object):
             x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
             exp_true = np.exp(x_f32)
             log_true = np.log(x_f32)
+            sin_true = np.sin(x_f32)
+            cos_true = np.cos(x_f32)
             for jj in strides:
                 assert_array_almost_equal_nulp(np.exp(x_f32[::jj]), exp_true[::jj], nulp=2)
                 assert_array_almost_equal_nulp(np.log(x_f32[::jj]), log_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.sin(x_f32[::jj]), sin_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.cos(x_f32[::jj]), cos_true[::jj], nulp=2)
 
 class TestLogAddExp(_FilterInvalids):
     def test_logaddexp_values(self):
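For context on the assertions above: assert_array_max_ulp bounds the error in units in the last place, i.e. how many representable float32 values lie between the result and the correctly rounded reference. The sketch below shows the usual way to count ULPs between two finite, same-sign floats (our illustration of the metric, not NumPy's implementation; compile with -lm):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* ULP distance between two finite floats of the same sign: IEEE-754
     * ordering means reinterpreting the bits as integers and subtracting. */
    static int32_t ulp_dist(float a, float b)
    {
        int32_t ia, ib;
        memcpy(&ia, &a, sizeof(ia));
        memcpy(&ib, &b, sizeof(ib));
        return ia > ib ? ia - ib : ib - ia;
    }

    int main(void)
    {
        float x = 0.84147098f;  /* ~sin(1.0f) */
        printf("%d\n", ulp_dist(x, nextafterf(x, 1.0f)));  /* prints 1 */
        return 0;
    }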