/*
 * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL
 *
 * Copyright 2022 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This file is a "template" for instantiating helper functions for CRC folding
 * with pmull instructions.  It accepts the following parameters:
 *
 * SUFFIX:
 *	Name suffix to append to all instantiated functions.
 * ATTRIBUTES:
 *	Target function attributes to use.
 * ENABLE_EOR3:
 *	Use the eor3 instruction (from the sha3 extension).
 */

#include <arm_neon.h>

/* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */
#undef u32_to_bytevec
static forceinline ATTRIBUTES uint8x16_t
ADD_SUFFIX(u32_to_bytevec)(u32 a)
{
	return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0));
}
#define u32_to_bytevec	ADD_SUFFIX(u32_to_bytevec)

/* Load two 64-bit values into a vector. */
#undef load_multipliers
static forceinline ATTRIBUTES poly64x2_t
ADD_SUFFIX(load_multipliers)(const u64 p[2])
{
	return vreinterpretq_p64_u64(vld1q_u64(p));
}
#define load_multipliers	ADD_SUFFIX(load_multipliers)

/* Do carryless multiplication of the low halves of two vectors. */
#undef clmul_low
static forceinline ATTRIBUTES uint8x16_t
ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b)
{
	return vreinterpretq_u8_p128(
		compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
				 vgetq_lane_p64(b, 0)));
}
#define clmul_low	ADD_SUFFIX(clmul_low)

/* Do carryless multiplication of the high halves of two vectors. */
#undef clmul_high
static forceinline ATTRIBUTES uint8x16_t
ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b)
{
#if defined(__clang__) && defined(ARCH_ARM64)
	/*
	 * Use inline asm to ensure that pmull2 is really used.  This works
	 * around clang bug https://github.com/llvm/llvm-project/issues/52868.
	 */
	uint8x16_t res;

	__asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b));
	return res;
#else
	return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b));
#endif
}
#define clmul_high	ADD_SUFFIX(clmul_high)

#undef eor3
static forceinline ATTRIBUTES uint8x16_t
ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
#if ENABLE_EOR3
#if HAVE_SHA3_INTRIN
	return veor3q_u8(a, b, c);
#else
	uint8x16_t res;

	__asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b"
		: "=w" (res) : "w" (a), "w" (b), "w" (c));
	return res;
#endif
#else /* ENABLE_EOR3 */
	return veorq_u8(veorq_u8(a, b), c);
#endif /* !ENABLE_EOR3 */
}
#define eor3	ADD_SUFFIX(eor3)

#undef fold_vec
static forceinline ATTRIBUTES uint8x16_t
ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers)
{
	uint8x16_t a = clmul_low(src, multipliers);
	uint8x16_t b = clmul_high(src, multipliers);

	return eor3(a, b, dst);
}
#define fold_vec	ADD_SUFFIX(fold_vec)

#undef vtbl
static forceinline ATTRIBUTES uint8x16_t
ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices)
{
#ifdef ARCH_ARM64
	return vqtbl1q_u8(table, indices);
#else
	uint8x8x2_t tab2;

	tab2.val[0] = vget_low_u8(table);
	tab2.val[1] = vget_high_u8(table);

	return vcombine_u8(vtbl2_u8(tab2, vget_low_u8(indices)),
			   vtbl2_u8(tab2, vget_high_u8(indices)));
#endif
}
#define vtbl	ADD_SUFFIX(vtbl)

/*
 * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
 * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
 * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
 * respectively.  Then fold x0 into x1 and return the result.  Assumes that
 * 'p + len - 16' is in-bounds.
 */
#undef fold_partial_vec
static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t
ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len,
			     poly64x2_t multipliers_1)
{
	/*
	 * vtbl(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
	 * vtbl(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
	 */
	static const u8 shift_tab[48] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	};
	const uint8x16_t lshift = vld1q_u8(&shift_tab[len]);
	const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]);
	uint8x16_t x0, x1, bsl_mask;

	/* x0 = v left-shifted by '16 - len' bytes */
	x0 = vtbl(v, lshift);

	/* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */
	bsl_mask = vreinterpretq_u8_s8(
		vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7));

	/*
	 * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
	 * bytes) followed by the remaining data.
	 */
	x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */,
		      vld1q_u8(p + len - 16), vtbl(v, rshift));

	return fold_vec(x0, x1, multipliers_1);
}
#define fold_partial_vec	ADD_SUFFIX(fold_partial_vec)
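/*
 * Usage sketch (illustrative only, not part of this template): a translation
 * unit is expected to define the three template parameters described at the
 * top of this file and then #include it.  The suffix, target-attribute string,
 * and ENABLE_EOR3 value below are hypothetical example values, not the ones
 * used by any particular caller:
 *
 *	#define SUFFIX		_pmull
 *	#define ATTRIBUTES	__attribute__((target("+crypto")))
 *	#define ENABLE_EOR3	0
 *	#include "crc32_pmull_helpers.h"
 *
 * Assuming ADD_SUFFIX() pastes SUFFIX onto its argument, this instantiates
 * helpers such as fold_vec_pmull and fold_partial_vec_pmull with the given
 * target attributes, while the #define wrappers above let the including code
 * keep referring to them by their unsuffixed names.
 */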