author    | Sage Weil <sage@inktank.com> | 2013-09-19 10:09:49 -0700
committer | Sage Weil <sage@inktank.com> | 2013-09-19 10:09:49 -0700
commit    | d7d85292a4903269d5d80e0c58fce83b9832961d (patch)
tree      | 5234b05b056e328d5dc482db249ba590a1545df9
parent    | 5421d6da2ca34d122c1de8faec13522801fe77fb (diff)
download  | ceph-d7d85292a4903269d5d80e0c58fce83b9832961d.tar.gz
common: add NEON crc32c implementation
Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r-- | src/common/Makefile.am   |  16
-rw-r--r-- | src/common/crc32c.cc     |   6
-rw-r--r-- | src/common/crc32c_neon.c | 158
-rw-r--r-- | src/common/crc32c_neon.h |  14

4 files changed, 192 insertions, 2 deletions
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 4c027909b4d..bef23f9f894 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -83,19 +83,31 @@ libcommon_crc_la_SOURCES = \
 	common/sctp_crc32.c \
 	common/crc32c.cc \
 	common/crc32c_intel_baseline.c \
-	common/crc32c_intel_fast.c
+	common/crc32c_intel_fast.c \
+	common/crc32c_neon.c
+
+libcommon_crc_la_CFLAGS = ${AM_CFLAGS}
+libcommon_crc_la_CXXFLAGS = ${AM_CXXFLAGS}
 
 if WITH_GOOD_YASM_ELF64
 libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S
 libcommon_crc_la_LIBTOOLFLAGS = --tag=CC
 endif
+
+if ENABLE_FPU_NEON
+libcommon_crc_la_CFLAGS += -mfpu=neon
+libcommon_crc_la_CXXFLAGS += -mfpu=neon
+endif
+
 LIBCOMMON_DEPS += libcommon_crc.la
 noinst_LTLIBRARIES += libcommon_crc.la
+
 noinst_HEADERS += \
 	common/sctp_crc32.h \
 	common/crc32c_intel_baseline.h \
-	common/crc32c_intel_fast.h
+	common/crc32c_intel_fast.h \
+	common/crc32c_neon.h
 
 # important; libmsg before libauth!
diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
index e2e81a42f45..137a436727e 100644
--- a/src/common/crc32c.cc
+++ b/src/common/crc32c.cc
@@ -5,9 +5,11 @@
 
 #include "arch/probe.h"
 #include "arch/intel.h"
+#include "arch/neon.h"
 #include "common/sctp_crc32.h"
 #include "common/crc32c_intel_baseline.h"
 #include "common/crc32c_intel_fast.h"
+#include "common/crc32c_neon.h"
 
 /*
  * choose best implementation based on the CPU architecture.
@@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void)
     return ceph_crc32c_intel_fast;
   }
 
+  if (ceph_arch_neon) {
+    return ceph_crc32c_neon;
+  }
+
   // default
   return ceph_crc32c_sctp;
 }
diff --git a/src/common/crc32c_neon.c b/src/common/crc32c_neon.c
new file mode 100644
index 00000000000..e27ec1b604c
--- /dev/null
+++ b/src/common/crc32c_neon.c
@@ -0,0 +1,158 @@
+/*
+ * This code is lifted from a pending patch to Hadoop, found here:
+ *
+ * http://lists.linaro.org/pipermail/linaro-toolchain/2013-April/003282.html
+ *
+ * to the bulk_crc32.c file, which is licensed:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * The author is:
+ *
+ * Steve Capper <steve.capper at linaro.org>
+ *
+ */
+
+#include <inttypes.h>
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+#include "include/crc32c.h"
+#include "common/sctp_crc32.h"
+
+#define crc32c_fallback ceph_crc32c_sctp
+
+/*
+ * Functions to reduce the size of the input buffer (fold) on ARM
+ * NEON. The smaller buffer has the same CRC32c checksum as the
+ * original.
+ *
+ * Most of the NEON buffer folding work takes place in the function
+ * below. We do the following:
+ * 1) 4 sets of vmull.p8's
+ * 2) Combine these to give a "vmull.p32" (lf3)
+ * 3) Shift left 1 bit to account for the endianess of multiplication.
+ *
+ * The folding and multiplication logic can be found documented at:
+ * https://wiki.linaro.org/LEG/Engineering/CRC
+ */
+static inline uint64x1_t crc32c_neon_proc_part(poly8x8_t lhs, poly8x8_t rhs1,
+	poly8x8_t rhs2, poly8x8_t rhs3, poly8x8_t rhs4)
+{
+	poly16x8_t lm1, lm2, lm3, lm4;
+	poly16x4x2_t lz1, lz2;
+	uint16x4_t le1, le2;
+	uint32x2_t le3;
+	uint32x4_t ls1, ls2, lf1, lf2;
+	uint64x2_t ls3, le4;
+	uint64x1_t lf3, lf4;
+
+	lm1 = vmull_p8(lhs, rhs1);
+	lm2 = vmull_p8(lhs, rhs2);
+	lz1 = vuzp_p16(vget_low_p16(lm2), vget_high_p16(lm2));
+	le1 = veor_u16(vreinterpret_u16_p16(lz1.val[0]),
+			vreinterpret_u16_p16(lz1.val[1]));
+	ls1 = vshll_n_u16(le1, 8);
+	lf1 = veorq_u32(ls1, vreinterpretq_u32_p16(lm1));
+
+	lm3 = vmull_p8(lhs, rhs3);
+	lm4 = vmull_p8(lhs, rhs4);
+	lz2 = vuzp_p16(vget_low_p16(lm4), vget_high_p16(lm4));
+	le2 = veor_u16(vreinterpret_u16_p16(lz2.val[0]),
+			vreinterpret_u16_p16(lz2.val[1]));
+	ls2 = vshll_n_u16(le2, 8);
+	lf2 = veorq_u32(ls2, vreinterpretq_u32_p16(lm3));
+
+	le3 = veor_u32(vget_low_u32(lf2), vget_high_u32(lf2));
+	ls3 = vshll_n_u32(le3, 16);
+	le4 = veorq_u64(ls3, vreinterpretq_u64_u32(lf1));
+	lf3 = vreinterpret_u64_u32(veor_u32(vget_low_u32(vreinterpretq_u32_u64(le4)),
+			vget_high_u32(vreinterpretq_u32_u64(le4))));
+	lf4 = vshl_n_u64(lf3, 1);
+	return lf4;
+}
+
+uint32_t ceph_crc32c_neon(uint32_t crc, const uint8_t *buf, size_t length)
+{
+	poly8x8_t xor_constant, lhs1, lhs2, lhs3, lhs4, rhs1, rhs2, rhs3, rhs4;
+	poly8x16_t lhl1, lhl2;
+
+	uint64_t residues[4];
+	uint32_t loop;
+
+	if (length % 32)
+		return crc32c_fallback(crc, buf, length);
+
+	/*
+	 * because crc32c has an initial crc value of 0xffffffff, we need to
+	 * pre-fold the buffer before folding begins proper.
+	 * The following constant is computed by:
+	 * 1) finding a 8x32 bit value that gives a 0xffffffff crc (with initial value 0)
+	 *    (this will be 7x32 bit 0s and 1x32 bit constant)
+	 * 2) run a buffer fold (with 0 xor_constant) on this 8x32 bit value to get the
+	 *    xor_constant.
+	 */
+	xor_constant = vcreate_p8(0x3E43E474A2870290);
+
+	if (crc != 0xffffffff)
+		return crc32c_fallback(crc, buf, length);
+
+	/* k1 = x^288 mod P(x) - bit reversed */
+	/* k2 = x^256 mod P(x) - bit reversed */
+
+	rhs1 = vcreate_p8(0x510AC59A9C25531D);	/* k2:k1 */
+	rhs2 = vcreate_p8(0x0A519AC5259C1D53);	/* byte swap */
+	rhs3 = vcreate_p8(0xC59A510A531D9C25);	/* half word swap */
+	rhs4 = vcreate_p8(0x9AC50A511D53259C);	/* byte swap of half word swap */
+
+	lhl1 = vld1q_p8((const poly8_t *) buf);
+	lhl2 = vld1q_p8((const poly8_t *) buf + 16);
+
+	lhs1 = vget_low_p8(lhl1);
+	lhs2 = vget_high_p8(lhl1);
+	lhs3 = vget_low_p8(lhl2);
+	lhs4 = vget_high_p8(lhl2);
+
+	/* pre-fold lhs4 */
+	lhs4 = vreinterpret_p8_u16(veor_u16(vreinterpret_u16_p8(lhs4),
+			vreinterpret_u16_p8(xor_constant)));
+
+	for(loop = 0; loop < (length - 32)/32; ++loop) {
+		uint64x1_t l1f4, l2f4, l3f4, l4f4;
+
+		l1f4 = crc32c_neon_proc_part(lhs1, rhs1, rhs2, rhs3, rhs4);
+		l2f4 = crc32c_neon_proc_part(lhs2, rhs1, rhs2, rhs3, rhs4);
+		l3f4 = crc32c_neon_proc_part(lhs3, rhs1, rhs2, rhs3, rhs4);
+		l4f4 = crc32c_neon_proc_part(lhs4, rhs1, rhs2, rhs3, rhs4);
+
+		lhl1 = vld1q_p8((const poly8_t *) (buf + 32 * (loop + 1)));
+		lhl2 = vld1q_p8((const poly8_t *) (buf + 32 * (loop + 1) + 16));
+
+		__builtin_prefetch(buf + 32 * (loop + 2));
+
+		lhs1 = vget_low_p8(lhl1);
+		lhs2 = vget_high_p8(lhl1);
+		lhs3 = vget_low_p8(lhl2);
+		lhs4 = vget_high_p8(lhl2);
+
+		lhs1 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs1), l1f4));
+		lhs2 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs2), l2f4));
+		lhs3 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs3), l3f4));
+		lhs4 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs4), l4f4));
+	}
+
+	vst1q_p8((poly8_t *) &residues[0], vcombine_p8(lhs1, lhs2));
+	vst1q_p8((poly8_t *) &residues[2], vcombine_p8(lhs3, lhs4));
+
+	return crc32c_fallback(0, (const uint8_t *)residues, 32);
+}
+
+#else /* __arm__ */
+
+uint32_t ceph_crc32c_neon(uint32_t crc, unsigned char const *data, unsigned length)
+{
+	return 0;
+}
+
+#endif /* __arm__ */
diff --git a/src/common/crc32c_neon.h b/src/common/crc32c_neon.h
new file mode 100644
index 00000000000..ea55462bd82
--- /dev/null
+++ b/src/common/crc32c_neon.h
@@ -0,0 +1,14 @@
+#ifndef CEPH_COMMON_CRC32C_NEON_H
+#define CEPH_COMMON_CRC32C_NEON_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t ceph_crc32c_neon(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
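A note on the arithmetic in crc32c_neon_proc_part, for readers new to carry-less multiplication: vmull.p8 multiplies eight pairs of 8-bit polynomials over GF(2), and the kernel combines four such multiplies (against byte- and halfword-swapped copies of the fold constants) to synthesize the wide polynomial product that ARMv7 NEON has no single instruction for. The scalar model below is a hypothetical helper, not part of the commit; it shows what one lane computes.

```c
#include <stdint.h>
#include <stdio.h>

/* Scalar model of one vmull.p8 lane: an 8x8 -> 16 bit carry-less
 * (polynomial, GF(2)) multiply. Each set bit of a selects a shifted
 * copy of b, and partial products combine with XOR rather than
 * addition, so no carries propagate between bit positions. */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
	uint16_t acc = 0;
	for (int i = 0; i < 8; i++)
		if (a & (1u << i))
			acc ^= (uint16_t)b << i;
	return acc;
}

int main(void)
{
	/* (x + 1) * (x + 1) = x^2 + 1 over GF(2): 0x03 * 0x03 = 0x05 */
	printf("0x03 clmul 0x03 = 0x%02x\n", clmul8(0x03, 0x03));
	return 0;
}
```

The final vshl_n_u64 in the kernel matches step 3 of the comment: CRC32C works in a bit-reflected representation, where the product of two reflected values lands one bit position low and needs a single left shift to line up.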
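The two early returns in ceph_crc32c_neon define its contract: the folded path runs only for lengths that are a multiple of 32 with the standard 0xffffffff seed, and every other input is delegated to ceph_crc32c_sctp. That contract makes the NEON path easy to sanity-check against the portable implementation; a minimal test sketch (not part of the commit) under those assumptions:

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include "common/crc32c_neon.h"
#include "common/sctp_crc32.h"

/* Self-check sketch: on ARM the folded path must agree with the
 * software fallback for every 32-byte-multiple length. On non-ARM
 * builds ceph_crc32c_neon is a stub that returns 0, hence the guard. */
int main(void)
{
#ifdef __arm__
	uint8_t buf[128];
	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] = (uint8_t)(i * 37 + 11);	/* arbitrary pattern */

	for (unsigned len = 32; len <= sizeof(buf); len += 32)
		assert(ceph_crc32c_neon(0xffffffffu, buf, len) ==
		       ceph_crc32c_sctp(0xffffffffu, buf, len));
#endif
	return 0;
}
```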
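The header in the last hunk is the implementation's only public surface. Note that it uses uint32_t without including <inttypes.h> itself, so a caller must provide the fixed-width types first. A hypothetical standalone caller (not in the tree) might look like the sketch below; inside Ceph the symbol is reached instead through the function pointer that ceph_choose_crc32() returns once the arch probe sets ceph_arch_neon.

```c
#include <stdint.h>	/* must precede the header, which uses uint32_t */
#include <stdio.h>
#include "common/crc32c_neon.h"

int main(void)
{
	/* 32 bytes (a multiple of 32) with the standard 0xffffffff seed:
	 * on ARM this takes the NEON fold path; elsewhere the stub in
	 * this commit returns 0. */
	unsigned char buf[32] = "0123456789abcdef0123456789abcde";
	uint32_t crc = ceph_crc32c_neon(0xffffffffu, buf, sizeof(buf));
	printf("crc32c = 0x%08x\n", (unsigned) crc);
	return 0;
}
```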