summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2022-09-22 14:06:09 +0200
committerGitHub <noreply@github.com>2022-09-22 14:06:09 +0200
commit4a47d49f451de30d72dbcefe87fe141ee7a6e925 (patch)
tree3ccdc977456dc3f76ec54662b15ba13eddcabef0
parent54f9b4aba9f2b98b59a32eb0d003c6ea6dd8f4c8 (diff)
parentd51785c68ef7a91c16b4a762ac54b0322977f1b2 (diff)
downloadnumpy-4a47d49f451de30d72dbcefe87fe141ee7a6e925.tar.gz
Merge pull request #22164 from Developer-Ecosystem-Engineering/improve_tanh_for_apple_silicon
ENH: Improve tanh for architectures without efficient gather/scatter …
-rw-r--r--numpy/core/src/umath/loops_hyperbolic.dispatch.c.src253
1 files changed, 253 insertions, 0 deletions
diff --git a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
index ce4962ce3..fbe68aefb 100644
--- a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
@@ -74,9 +74,119 @@
*
*/
#if NPY_SIMD_F64
+
+ // For architectures without efficient gather / scatter instructions, it is
+ // better to use a transposed LUT where we can load all coefficients for an
+ // index linearly. In order to keep the same vertical calculation, we
+ // transpose the coef. into lanes. 2 lane transpose is all that's
+ // implemented so we require `npyv_nlanes_f64` == 2.
+ #if npyv_nlanes_f64 == 2
+ #define TANH_TRANSPOSED_LUT
+ #endif // npyv_nlanes_f64 == 2
+
static void
simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
+#if defined(TANH_TRANSPOSED_LUT)
+ static const npy_uint64 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut18x16[] = {
+ // 0
+ 0x0ull, 0x0ull, 0x3ff0000000000000ull, 0xbbf0b3ea3fdfaa19ull, // b, c0, c1, c2
+ 0xbfd5555555555555ull, 0xbce6863ee44ed636ull, 0x3fc1111111112ab5ull, 0xbda1ea19ddddb3b4ull, // c3, c4, c5, c6
+ 0xbfaba1ba1990520bull, 0xbe351ca7f096011full, 0x3f9664f94e6ac14eull, 0xbea8c4c1fd7852feull, // c7, c8, c9, c10
+ 0xbf822404577aa9ddull, 0xbefdd99a221ed573ull, 0x3f6e3be689423841ull, 0xbf2a1306713a4f3aull, // c11, c12, c13, c14
+ 0xbf55d7e76dc56871ull, 0x3f35e67ab76a26e7ull, // c15, c16
+ // 1
+ 0x3fcc000000000000ull, 0x3fcb8fd0416a7c92ull, 0x3fee842ca3f08532ull, 0xbfca48aaeb53bc21ull,
+ 0xbfd183afc292ba11ull, 0x3fc04dcd0476c75eull, 0x3fb5c19efdfc08adull, 0xbfb0b8df995ce4dfull,
+ 0xbf96e37bba52f6fcull, 0x3f9eaaf3320c3851ull, 0xbf94d3343bae39ddull, 0xbfccce16b1046f13ull,
+ 0x403d8b07f7a82aa3ull, 0x4070593a3735bab4ull, 0xc0d263511f5baac1ull, 0xc1045e509116b066ull,
+ 0x41528c38809c90c7ull, 0x41848ee0627d8206ull,
+ // 2
+ 0x3fd4000000000000ull, 0x3fd35f98a0ea650eull, 0x3fed11574af58f1bull, 0xbfd19921f4329916ull,
+ 0xbfcc1a4b039c9bfaull, 0x3fc43d3449a80f08ull, 0x3fa74c98dc34fbacull, 0xbfb2955cf41e8164ull,
+ 0x3ecff7df18455399ull, 0x3f9cf823fe761fc1ull, 0xbf7bc748e60df843ull, 0xbf81a16f224bb7b6ull,
+ 0xbf9f44ab92fbab0aull, 0xbfccab654e44835eull, 0x40169f73b15ebe5cull, 0x4041fab9250984ceull,
+ 0xc076d57fb5190b02ull, 0xc0a216d618b489ecull,
+ // 3
+ 0x3fdc000000000000ull, 0x3fda5729ee488037ull, 0x3fea945b9c24e4f9ull, 0xbfd5e0f09bef8011ull,
+ 0xbfc16e1e6d8d0be6ull, 0x3fc5c26f3699b7e7ull, 0xbf790d6a8eff0a77ull, 0xbfaf9d05c309f7c6ull,
+ 0x3f97362834d33a4eull, 0x3f9022271754ff1full, 0xbf8c89372b43ba85ull, 0xbf62cbf00406bc09ull,
+ 0x3fb2eac604473d6aull, 0x3fd13ed80037dbacull, 0xc025c1dd41cd6cb5ull, 0xc0458d090ec3de95ull,
+ 0x4085f09f888f8adaull, 0x40a5b89107c8af4full,
+ // 4
+ 0x3fe4000000000000ull, 0x3fe1bf47eabb8f95ull, 0x3fe6284c3374f815ull, 0xbfd893b59c35c882ull, // b, c0, c1, c2
+ 0xbf92426c751e48a2ull, 0x3fc1a686f6ab2533ull, 0xbfac3c021789a786ull, 0xbf987d27ccff4291ull, // c3, c4, c5, c6
+ 0x3f9e7f8380184b45ull, 0xbf731fe77c9c60afull, 0xbf8129a092de747aull, 0x3f75b29bb02cf69bull, // c7, c8, c9, c10
+ 0x3f45f87d903aaac8ull, 0xbf6045b9076cc487ull, 0xbf58fd89fe05e0d1ull, 0xbf74949d60113d63ull, // c11, c12, c13, c14
+ 0x3fa246332a2fcba5ull, 0x3fb69d8374520edaull, // c15, c16
+ // 5
+ 0x3fec000000000000ull, 0x3fe686650b8c2015ull, 0x3fe02500a09f8d6eull, 0xbfd6ba7cb7576538ull,
+ 0x3fb4f152b2bad124ull, 0x3faf203c316ce730ull, 0xbfae2196b7326859ull, 0x3f8b2ca62572b098ull,
+ 0x3f869543e7c420d4ull, 0xbf84a6046865ec7dull, 0x3f60c85b4d538746ull, 0x3f607df0f9f90c17ull,
+ 0xbf5e104671036300ull, 0x3f2085ee7e8ac170ull, 0x3f73f7af01d5af7aull, 0x3f7c9fd6200d0adeull,
+ 0xbfb29d851a896fcdull, 0xbfbded519f981716ull,
+ // 6
+ 0x3ff4000000000000ull, 0x3feb2523bb6b2deeull, 0x3fd1f25131e3a8c0ull, 0xbfce7291743d7555ull,
+ 0x3fbbba40cbef72beull, 0xbf89c7a02788557cull, 0xbf93a7a011ff8c2aull, 0x3f8f1cf6c7f5b00aull,
+ 0xbf7326bd4914222aull, 0xbf4ca3f1f2b9192bull, 0x3f5be9392199ec18ull, 0xbf4b852a6e0758d5ull,
+ 0x3f19bc98ddf0f340ull, 0x3f23524622610430ull, 0xbf1e40bdead17e6bull, 0x3f02cd40e0ad0a9full,
+ 0x3ed9065ae369b212ull, 0xbef02d288b5b3371ull,
+ // 7
+ 0x3ffc000000000000ull, 0x3fee1fbf97e33527ull, 0x3fbd22ca1c24a139ull, 0xbfbb6d85a01efb80ull,
+ 0x3fb01ba038be6a3dull, 0xbf98157e26e0d541ull, 0x3f6e4709c7e8430eull, 0x3f60379811e43dd5ull,
+ 0xbf5fc15b0a9d98faull, 0x3f4c77dee0afd227ull, 0xbf2a0c68a4489f10ull, 0xbf0078c63d1b8445ull,
+ 0x3f0d4304bc9246e8ull, 0xbeff12a6626911b4ull, 0x3ee224cd6c4513e5ull, 0xbe858ab8e019f311ull,
+ 0xbeb8e1ba4c98a030ull, 0x3eb290981209c1a6ull,
+ // 8
+ 0x4004000000000000ull, 0x3fef9258260a71c2ull, 0x3f9b3afe1fba5c76ull, 0xbf9addae58c7141aull, // b, c0, c1, c2
+ 0x3f916df44871efc8ull, 0xbf807b55c1c7d278ull, 0x3f67682afa611151ull, 0xbf4793826f78537eull, // c3, c4, c5, c6
+ 0x3f14cffcfa69fbb6ull, 0x3f04055bce68597aull, 0xbf00462601dc2faaull, 0x3eec12eadd55be7aull, // c7, c8, c9, c10
+ 0xbed13c415f7b9d41ull, 0x3eab9008bca408afull, 0xbe24b645e68eeaa3ull, 0xbe792fa6323b7cf8ull, // c11, c12, c13, c14
+ 0x3e6ffd0766ad4016ull, 0xbe567e924bf5ff6eull, // c15, c16
+ // 9
+ 0x400c000000000000ull, 0x3feff112c63a9077ull, 0x3f6dd37d19b22b21ull, 0xbf6dc59376c7aa19ull,
+ 0x3f63c6869dfc8870ull, 0xbf53a18d5843190full, 0x3f3ef2ee77717cbfull, 0xbf2405695e36240full,
+ 0x3f057e48e5b79d10ull, 0xbee2bf0cb4a71647ull, 0x3eb7b6a219dea9f4ull, 0xbe6fa600f593181bull,
+ 0xbe722b8d9720cdb0ull, 0x3e634df71865f620ull, 0xbe4abfebfb72bc83ull, 0x3e2df04d67876402ull,
+ 0xbe0c63c29f505f5bull, 0x3de3f7f7de6b0eb6ull,
+ // 10
+ 0x4014000000000000ull, 0x3fefff419668df11ull, 0x3f27ccec13a9ef96ull, 0xbf27cc5e74677410ull,
+ 0x3f1fb9aef915d828ull, 0xbf0fb6bbc89b1a5bull, 0x3ef95a4482f180b7ull, 0xbee0e08de39ce756ull,
+ 0x3ec33b66d7d77264ull, 0xbea31eaafe73efd5ull, 0x3e80cbcc8d4c5c8aull, 0xbe5a3c935dce3f7dull,
+ 0x3e322666d739bec0ull, 0xbe05bb1bcf83ca73ull, 0x3dd51c38f8695ed3ull, 0xbd95c72be95e4d2cull,
+ 0xbd7fab216b9e0e49ull, 0x3d69ed18bae3ebbcull,
+ // 11
+ 0x401c000000000000ull, 0x3feffffc832750f2ull, 0x3ecbe6c3f33250aeull, 0xbecbe6c0e8b4cc87ull,
+ 0x3ec299d1e27c6e11ull, 0xbeb299c9c684a963ull, 0x3e9dc2c27da3b603ull, 0xbe83d709ba5f714eull,
+ 0x3e66ac4e578b9b10ull, 0xbe46abb02c4368edull, 0x3e2425bb231a5e29ull, 0xbe001c6d95e3ae96ull,
+ 0x3dd76a553d7e7918ull, 0xbdaf2ac143fb6762ull, 0x3d8313ac38c6832bull, 0xbd55a89c30203106ull,
+ 0x3d2826b62056aa27ull, 0xbcf7534c4f3dfa71ull,
+ // 12
+ 0x4024000000000000ull, 0x3feffffffdc96f35ull, 0x3e41b4865394f75full, 0xbe41b486526b0565ull, // b, c0, c1, c2
+ 0x3e379b5ddcca334cull, 0xbe279b5dd4fb3d01ull, 0x3e12e2afd9f7433eull, 0xbdf92e3fc5ee63e0ull, // c3, c4, c5, c6
+ 0x3ddcc74b8d3d5c42ull, 0xbdbcc749ca8079ddull, 0x3d9992a4beac8662ull, 0xbd74755a00ea1fd3ull, // c7, c8, c9, c10
+ 0x3d4de0fa59416a39ull, 0xbd23eae52a3dbf57ull, 0x3cf7787935626685ull, 0xbccad6b3bb9eff65ull, // c11, c12, c13, c14
+ 0x3ca313e31762f523ull, 0xbc730b73f1eaff20ull, // c15, c16
+ // 13
+ 0x402c000000000000ull, 0x3fefffffffffcf58ull, 0x3d8853f01bda5f28ull, 0xbd8853f01bef63a4ull,
+ 0x3d8037f57bc62c9aull, 0xbd7037f57ae72aa6ull, 0x3d59f320348679baull, 0xbd414cc030f2110eull,
+ 0x3d23c589137f92b4ull, 0xbd03c5883836b9d2ull, 0x3ce191ba5ed3fb67ull, 0xbcbc1c6c063bb7acull,
+ 0x3c948716cf3681b4ull, 0xbc6b5e3e9ca0955eull, 0x3c401ffc49c6bc29ull, 0xbc12705ccd3dd884ull,
+ 0x3bea37aa21895319ull, 0xbbba2cff8135d462ull,
+ // 14
+ 0x4034000000000000ull, 0x3ff0000000000000ull, 0x3c73953c0197ef58ull, 0xbc73955be519be31ull,
+ 0x3c6a2d4b50a2cff7ull, 0xbc5a2ca2bba78e86ull, 0x3c44b61d9bbcc940ull, 0xbc2ba022e8d82a87ull,
+ 0x3c107f8e2c8707a1ull, 0xbbf07a5416264aecull, 0x3bc892450bad44c4ull, 0xbba3be9a4460fe00ull,
+ 0x3b873f9f2d2fda99ull, 0xbb5eca68e2c1ba2eull, 0xbabf0b21acfa52abull, 0xba8e0a4c47ae75f5ull,
+ 0x3ae5c7f1fd871496ull, 0xbab5a71b5f7d9035ull,
+ // 15
+ 0x0ull, 0x3ff0000000000000ull, 0x0ull, 0x0ull,
+ 0x0ull, 0x0ull, 0x0ull, 0x0ull,
+ 0x0ull, 0x0ull, 0x0ull, 0x0ull,
+ 0x0ull, 0x0ull, 0x0ull, 0x0ull,
+ 0x0ull, 0x0ull,
+ };
+#else
static const npy_uint64 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut16x18[] = {
// 0
0x0ull, 0x3fcc000000000000ull, 0x3fd4000000000000ull, 0x3fdc000000000000ull,
@@ -169,6 +279,8 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
0xbe567e924bf5ff6eull, 0x3de3f7f7de6b0eb6ull, 0x3d69ed18bae3ebbcull, 0xbcf7534c4f3dfa71ull,
0xbc730b73f1eaff20ull, 0xbbba2cff8135d462ull, 0xbab5a71b5f7d9035ull, 0x0ull
};
+#endif // defined(TANH_TRANSPOSED_LUT)
+
const int nlanes = npyv_nlanes_f64;
const npyv_f64 qnan = npyv_setall_f64(NPY_NAN);
for (; len > 0; len -= nlanes, src += ssrc*nlanes, dst += sdst*nlanes) {
@@ -189,6 +301,25 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
idxl = npyv_min_s32(idxl, npyv_setall_s32(0x780000));
npyv_u64 idx = npyv_shri_u64(npyv_reinterpret_u64_s32(idxl), 51);
+#if defined(TANH_TRANSPOSED_LUT)
+ npyv_f64 e0e1[npyv_nlanes_f64];
+ npyv_lanetype_u64 index[npyv_nlanes_f64];
+ npyv_store_u64(index, idx);
+
+ /**begin repeat
+ * #off= 0, 2, 4, 6, 8, 10, 12, 14, 16#
+ * #e0 = b, c1, c3, c5, c7, c9, c11, c13, c15#
+ * #e1 = c0,c2, c4, c6, c8, c10,c12, c14, c16#
+ */
+ /**begin repeat1
+ * #lane = 0, 1#
+ */
+ e0e1[@lane@] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[@lane@] * 18 + @off@));
+ /**end repeat1**/
+ npyv_f64 @e0@ = npyv_combinel_f64(e0e1[0], e0e1[1]);
+ npyv_f64 @e1@ = npyv_combineh_f64(e0e1[0], e0e1[1]);
+ /**end repeat**/
+#else
npyv_f64 b = npyv_lut16_f64((const double*)lut16x18 + 16*0, idx);
npyv_f64 c0 = npyv_lut16_f64((const double*)lut16x18 + 1*16, idx);
npyv_f64 c1 = npyv_lut16_f64((const double*)lut16x18 + 2*16, idx);
@@ -207,6 +338,7 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
npyv_f64 c14 = npyv_lut16_f64((const double*)lut16x18 + 15*16, idx);
npyv_f64 c15 = npyv_lut16_f64((const double*)lut16x18 + 16*16, idx);
npyv_f64 c16 = npyv_lut16_f64((const double*)lut16x18 + 17*16, idx);
+#endif // defined(TANH_TRANSPOSED_LUT)
// no need to zerofy nans or avoid FP exceptions by NO_EXC like SVML does
// since we're clearing the FP status anyway.
@@ -240,12 +372,82 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
}
}
}
+
+#undef TANH_TRANSPOSED_LUT
+
#endif // NPY_SIMD_F64
#if NPY_SIMD_F32
+
+
+ // For architectures without efficient gather / scatter instructions, it is
+ // better to use a transposed LUT where we can load all coefficients for an
+ // index linearly. In order to keep the same vertical calculation, we
+ // transpose the coef. into lanes. A 4x4 transpose is all that's
+ // supported so we require `npyv_nlanes_f32` == 4.
+ #if npyv_nlanes_f32 == 4
+ #define TANHF_TRANSPOSED_LUT
+ // Define missing universal intrinsics used below
+ #if !defined(npyv_get_lane_u32)
+ #if defined(NPY_HAVE_ASIMD)
+ #define UNDEF_npyv_get_lane_u32
+ #define npyv_get_lane_u32 vgetq_lane_u32
+ #elif defined(NPY_HAVE_SSE41)
+ #define UNDEF_npyv_get_lane_u32
+ #define npyv_get_lane_u32 _mm_extract_epi32
+ #else
+ #undef TANHF_TRANSPOSED_LUT
+ #endif
+ #endif // !defined(npyv_get_lane_u32)
+ #endif // npyv_nlanes_f32 == 4
+
static void
simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_intp len)
{
+#if defined(TANHF_TRANSPOSED_LUT)
+ static const npy_uint32 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut8x32[] = {
+ // c6 c5 c4 c3 c2 c1 c0 b
+ 0xbc0e2f66, 0x3e0910e9, 0xb76dd6b9, 0xbeaaaaa5, 0xb0343c7b, 0x3f800000, 0x0, 0x0,
+ 0x460bda12, 0x43761143, 0xbe1c276d, 0xbeab0612, 0xbd6ee69d, 0x3f7f1f84, 0x3d6fb9c9, 0x3d700000,
+ 0x43d638ef, 0x4165ecdc, 0x3c1dcf2f, 0xbea7f01f, 0xbd8f0da7, 0x3f7ebd11, 0x3d8fc35f, 0x3d900000,
+ 0xc3e11c3e, 0xc190f756, 0x3dc1a78d, 0xbea4e120, 0xbdae477d, 0x3f7e1e5f, 0x3daf9169, 0x3db00000,
+ // 4
+ 0xc2baa4e9, 0xc08c097d, 0x3d96f985, 0xbea387b7, 0xbdcd2a1f, 0x3f7d609f, 0x3dcf49ab, 0x3dd00000,
+ 0xc249da2d, 0xc02ba813, 0x3da2b61b, 0xbea15962, 0xbdeba80d, 0x3f7c842d, 0x3deee849, 0x3df00000,
+ 0xc1859b82, 0xbf7f6bda, 0x3dc13397, 0xbe9d57f7, 0xbe0c443b, 0x3f7b00e5, 0x3e0f0ee8, 0x3e100000,
+ 0x40dd5b57, 0x3f2b1dc0, 0x3dd2f670, 0xbe976b5a, 0xbe293cf3, 0x3f789580, 0x3e2e4984, 0x3e300000,
+ // 8
+ 0x40494640, 0x3ece105d, 0x3df48a0a, 0xbe90230d, 0xbe44f282, 0x3f75b8ad, 0x3e4d2f8e, 0x3e500000,
+ 0x40c730a8, 0x3f426a94, 0x3e06c5a8, 0xbe880dff, 0xbe5f3651, 0x3f726fd9, 0x3e6bb32e, 0x3e700000,
+ 0xbf0f160e, 0xbadb0dc4, 0x3e1a3aba, 0xbe7479b3, 0xbe81c7c0, 0x3f6cc59b, 0x3e8c51cd, 0x3e900000,
+ 0x3e30e76f, 0x3da43b17, 0x3e27c405, 0xbe4c3d88, 0xbe96d7ca, 0x3f63fb92, 0x3ea96163, 0x3eb00000,
+ // 12
+ 0xbea81387, 0xbd51ab88, 0x3e2e78d0, 0xbe212482, 0xbea7fb8e, 0x3f59ff97, 0x3ec543f1, 0x3ed00000,
+ 0xbdb26a1c, 0xbcaea23d, 0x3e2c3e44, 0xbdeb8cba, 0xbeb50e9e, 0x3f4f11d7, 0x3edfd735, 0x3ef00000,
+ 0xbd351e57, 0xbd3b6d8d, 0x3e1d3097, 0xbd5e78ad, 0xbec12efe, 0x3f3d7573, 0x3f028438, 0x3f100000,
+ 0xbb4c01a0, 0xbd6caaad, 0x3df4a8f4, 0x3c6b5e6e, 0xbec4be92, 0x3f24f360, 0x3f18abf0, 0x3f300000,
+ // 16
+ 0x3c1d7bfb, 0xbd795bed, 0x3da38508, 0x3d839143, 0xbebce070, 0x3f0cbfe7, 0x3f2bc480, 0x3f500000,
+ 0x3c722cd1, 0xbd5fddda, 0x3d31416a, 0x3dc21ee1, 0xbead510e, 0x3eec1a69, 0x3f3bec1c, 0x3f700000,
+ 0x3c973f1c, 0xbd038f3b, 0x3b562657, 0x3de347af, 0xbe8ef7d6, 0x3eb0a801, 0x3f4f2e5b, 0x3f900000,
+ 0x3c33a31b, 0xbc1cad63, 0xbcaeeac9, 0x3dcbec96, 0xbe4b8704, 0x3e6753a2, 0x3f613c53, 0x3fb00000,
+ // 20
+ 0x3b862ef4, 0x3abb4766, 0xbcce9419, 0x3d99ef2d, 0xbe083237, 0x3e132f1a, 0x3f6ce37d, 0x3fd00000,
+ 0x3a27b3d0, 0x3b95f10b, 0xbcaaeac4, 0x3d542ea1, 0xbdaf7449, 0x3db7e7d3, 0x3f743c4f, 0x3ff00000,
+ 0xba3b5907, 0x3b825873, 0xbc49e7d0, 0x3cdde701, 0xbd2e1ec4, 0x3d320845, 0x3f7a5feb, 0x40100000,
+ 0xba0efc22, 0x3afaea66, 0xbba71ddd, 0x3c2cca67, 0xbc83bf06, 0x3c84d3d4, 0x3f7dea85, 0x40300000,
+ // 24
+ 0xb97f9f0f, 0x3a49f878, 0xbb003b0e, 0x3b81cb27, 0xbbc3e0b5, 0x3bc477b7, 0x3f7f3b3d, 0x40500000,
+ 0xb8c8af50, 0x39996bf3, 0xba3f9a05, 0x3ac073a1, 0xbb10aadc, 0x3b10d3da, 0x3f7fb78c, 0x40700000,
+ 0xb7bdddfb, 0x388f3e6c, 0xb92c08a7, 0x39ac3032, 0xba0157db, 0x3a01601e, 0x3f7fefd4, 0x40900000,
+ 0xb64f2950, 0x371bb0e3, 0xb7ba9232, 0x383a94d9, 0xb88c18f2, 0x388c1a3b, 0x3f7ffdd0, 0x40b00000,
+ // 28
+ 0xb4e085b1, 0x35a8a5e6, 0xb64a0b0f, 0x36ca081d, 0xb717b096, 0x3717b0da, 0x3f7fffb4, 0x40d00000,
+ 0xb3731dfa, 0x34369b17, 0xb4dac169, 0x355abd4c, 0xb5a43bae, 0x35a43bce, 0x3f7ffff6, 0x40f00000,
+ 0xb15a1f04, 0x322487b0, 0xb2ab78ac, 0x332b3cb6, 0xb383012c, 0x338306c6, 0x3f7fffff, 0x41100000,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3f800000, 0x0,
+ };
+#else
static const npy_uint32 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut32x8[] = {
// 0
0x0, 0x3d700000, 0x3d900000, 0x3db00000, 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
@@ -288,6 +490,7 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b, 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950, 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x0
};
+#endif // defined(TANHF_TRANSPOSED_LUT)
const int nlanes = npyv_nlanes_f32;
const npyv_f32 qnan = npyv_setall_f32(NPY_NANF);
@@ -307,6 +510,48 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
idxs = npyv_min_s32(idxs, npyv_setall_s32(0x3e00000));
npyv_u32 idx = npyv_shri_u32(npyv_reinterpret_u32_s32(idxs), 21);
+#if defined(TANHF_TRANSPOSED_LUT)
+ npyv_f32 c6543[npyv_nlanes_f32];
+ npyv_f32 c210b[npyv_nlanes_f32];
+ npyv_lanetype_u32 index[npyv_nlanes_f32];
+
+ /**begin repeat
+ * #lane = 0, 1, 2, 3#
+ */
+ index[@lane@] = npyv_get_lane_u32(idx, @lane@);
+ c6543[@lane@] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[@lane@] * 8));
+ c210b[@lane@] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[@lane@] * 8 + 4));
+ /**end repeat**/
+
+ // lane0: {c6, c5, c4, c3}, {c2, c1, c0, b}
+ // lane1: {c6, c5, c4, c3}, {c2, c1, c0, b}
+ // lane2: {c6, c5, c4, c3}, {c2, c1, c0, b}
+ // lane3: {c6, c5, c4, c3}, {c2, c1, c0, b}
+ //
+ // transposed:
+ // c6: {lane0, lane1, lane2, lane3}
+ // c5: {lane0, lane1, lane2, lane3}
+ // c4: {lane0, lane1, lane2, lane3}
+ // c3: {lane0, lane1, lane2, lane3}
+ // c2: {lane0, lane1, lane2, lane3}
+ // c1: {lane0, lane1, lane2, lane3}
+ // c0: {lane0, lane1, lane2, lane3}
+ // b : {lane0, lane1, lane2, lane3}
+
+ npyv_f32x2 c6543_l01 = npyv_zip_f32(c6543[0], c6543[1]);
+ npyv_f32x2 c6543_l23 = npyv_zip_f32(c6543[2], c6543[3]);
+ npyv_f32 c6 = npyv_combinel_f32(c6543_l01.val[0], c6543_l23.val[0]);
+ npyv_f32 c5 = npyv_combineh_f32(c6543_l01.val[0], c6543_l23.val[0]);
+ npyv_f32 c4 = npyv_combinel_f32(c6543_l01.val[1], c6543_l23.val[1]);
+ npyv_f32 c3 = npyv_combineh_f32(c6543_l01.val[1], c6543_l23.val[1]);
+
+ npyv_f32x2 c210b_l01 = npyv_zip_f32(c210b[0], c210b[1]);
+ npyv_f32x2 c210b_l23 = npyv_zip_f32(c210b[2], c210b[3]);
+ npyv_f32 c2 = npyv_combinel_f32(c210b_l01.val[0], c210b_l23.val[0]);
+ npyv_f32 c1 = npyv_combineh_f32(c210b_l01.val[0], c210b_l23.val[0]);
+ npyv_f32 c0 = npyv_combinel_f32(c210b_l01.val[1], c210b_l23.val[1]);
+ npyv_f32 b = npyv_combineh_f32(c210b_l01.val[1], c210b_l23.val[1]);
+#else
npyv_f32 b = npyv_lut32_f32((const float*)lut32x8 + 32*0, idx);
npyv_f32 c0 = npyv_lut32_f32((const float*)lut32x8 + 32*1, idx);
npyv_f32 c1 = npyv_lut32_f32((const float*)lut32x8 + 32*2, idx);
@@ -315,6 +560,7 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
npyv_f32 c4 = npyv_lut32_f32((const float*)lut32x8 + 32*5, idx);
npyv_f32 c5 = npyv_lut32_f32((const float*)lut32x8 + 32*6, idx);
npyv_f32 c6 = npyv_lut32_f32((const float*)lut32x8 + 32*7, idx);
+#endif // defined(TANHF_TRANSPOSED_LUT)
// no need to zerofy nans or avoid FP exceptions by NO_EXC like SVML does
// since we're clearing the FP status anyway.
@@ -338,6 +584,13 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
}
}
}
+
+#undef TANHF_TRANSPOSED_LUT
+#if defined(UNDEF_npyv_get_lane_u32)
+#undef UNDEF_npyv_get_lane_u32
+#undef npyv_get_lane_u32
+#endif
+
#endif // NPY_SIMD_F32
#endif // NPY_SIMD_FMA3