Use little-endian mask during BLAKE2 loads

author: Jeffrey Walton <noloader@gmail.com> 2020-06-28 02:34:52 -0400
committer: Jeffrey Walton <noloader@gmail.com> 2020-06-28 02:34:52 -0400
commit: 25cdab6d325f4806dd72debb5574ba6f08c6028c (patch)
tree: d7a6fcf0282a1a9f8fc6ca5e4f37263ddf7e4882 /blake2s_simd.cpp
parent: dc2b336acee0a341316bf4050f42c782da58924e (diff)
download: cryptopp-git-25cdab6d325f4806dd72debb5574ba6f08c6028c.tar.gz
1 files changed, 20 insertions, 17 deletions
diff --git a/blake2s_simd.cpp b/blake2s_simd.cpp
index 4125c99b..b50bd988 100644
--- a/blake2s_simd.cpp
+++ b/blake2s_simd.cpp
@@ -706,13 +706,13 @@ inline uint32x4_p VecLoad32(const T* p)
 }
 
 template <class T>
-inline uint32x4_p VecLoad32LE(const T* p)
+inline uint32x4_p VecLoad32LE(const T* p, const uint8x16_p le_mask)
 {
-#if __BIG_ENDIAN__
-    const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+#if defined(CRYPTOPP_BIG_ENDIAN)
     const uint32x4_p v = VecLoad(p);
-    return VecPermute(v, v, m);
+    return VecPermute(v, v, le_mask);
 #else
+    CRYPTOPP_UNUSED(le_mask);
     return VecLoad(p);
 #endif
 }
@@ -724,12 +724,13 @@ inline void VecStore32(T* p, const uint32x4_p x)
 }
 
 template <class T>
-inline void VecStore32LE(T* p, const uint32x4_p x)
+inline void VecStore32LE(T* p, const uint32x4_p x, const uint8x16_p le_mask)
 {
-#if __BIG_ENDIAN__
-    const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-    VecStore(VecPermute(x, x, m), p);
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    const uint32x4_p v = VecPermute(x, x, le_mask);
+    VecStore(v, p);
 #else
+    CRYPTOPP_UNUSED(le_mask);
     VecStore(x, p);
 #endif
 }
@@ -991,17 +992,19 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
       BLAKE2S_G2(row1,row2,row3,row4,buf4); \
       BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);
 
+    const uint8x16_p le_mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+
+    const uint32x4_p  m0 = VecLoad32LE(input +  0, le_mask);
+    const uint32x4_p  m4 = VecLoad32LE(input + 16, le_mask);
+    const uint32x4_p  m8 = VecLoad32LE(input + 32, le_mask);
+    const uint32x4_p m12 = VecLoad32LE(input + 48, le_mask);
+
     uint32x4_p row1, row2, row3, row4;
     uint32x4_p buf1, buf2, buf3, buf4;
     uint32x4_p  ff0,  ff1;
 
-    const uint32x4_p  m0 = VecLoad32LE(input +  0);
-    const uint32x4_p  m4 = VecLoad32LE(input + 16);
-    const uint32x4_p  m8 = VecLoad32LE(input + 32);
-    const uint32x4_p m12 = VecLoad32LE(input + 48);
-
-    row1 = ff0 = VecLoad32LE(state.h()+0);
-    row2 = ff1 = VecLoad32LE(state.h()+4);
+    row1 = ff0 = VecLoad32LE(state.h()+0, le_mask);
+    row2 = ff1 = VecLoad32LE(state.h()+4, le_mask);
     row3 = VecLoad32(BLAKE2S_IV+0);
     row4 = VecXor(VecLoad32(BLAKE2S_IV+4), VecLoad32(state.t()+0));
 
@@ -1016,8 +1019,8 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
     BLAKE2S_ROUND(8);
     BLAKE2S_ROUND(9);
 
-    VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
-    VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
+    VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)), le_mask);
+    VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)), le_mask);
 }
 #endif  // CRYPTOPP_ALTIVEC_AVAILABLE
author	Jeffrey Walton <noloader@gmail.com>	2020-06-28 02:34:52 -0400
committer	Jeffrey Walton <noloader@gmail.com>	2020-06-28 02:34:52 -0400
commit	25cdab6d325f4806dd72debb5574ba6f08c6028c (patch)
tree	d7a6fcf0282a1a9f8fc6ca5e4f37263ddf7e4882 /blake2s_simd.cpp
parent	dc2b336acee0a341316bf4050f42c782da58924e (diff)
download	cryptopp-git-25cdab6d325f4806dd72debb5574ba6f08c6028c.tar.gz