/*
 * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version)
 *
 * Copyright 2022 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This file is a "template" for instantiating PMULL-based crc32_arm functions.
 * The "parameters" are:
 *
 * SUFFIX:
 *	Name suffix to append to all instantiated functions.
 * ATTRIBUTES:
 *	Target function attributes to use.
 * ENABLE_EOR3:
 *	Use the eor3 instruction (from the sha3 extension).
 *
 * This is the extra-wide version; it uses an unusually large stride length of
 * 12, and it assumes that crc32 instructions are available too. It's intended
 * for powerful CPUs that support both pmull and crc32 instructions, but where
 * throughput of pmull and xor (given enough instructions issued in parallel) is
 * significantly higher than that of crc32, thus making the crc32 instructions
 * (counterintuitively) not actually the fastest way to compute the CRC-32. The
 * Apple M1 processor is an example of such a CPU.
 */
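
/*
 * Example instantiation (a minimal sketch; the actual parameter values are
 * chosen by the file that includes this template, and the values shown here
 * are illustrative assumptions only):
 *
 *	#define SUFFIX		_pmullx12_crc
 *	#define ATTRIBUTES	_target_attribute("aes,crc")
 *	#define ENABLE_EOR3	0
 *	#include "arm/crc32_pmull_wide.h"
 *
 * With these values, ADD_SUFFIX(crc32_arm) expands to a function named
 * crc32_arm_pmullx12_crc, compiled with the given target attributes.
 */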

#include "crc32_pmull_helpers.h"

static ATTRIBUTES u32
ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
{
	uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;

	if (len < 3 * 192) {
		static const u64 _aligned_attribute(16) mults[3][2] = {
			{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
			{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
			{ CRC32_X159_MODG, CRC32_X95_MODG },  /* 1 vecs */
		};
		poly64x2_t multipliers_4, multipliers_2, multipliers_1;

		if (len < 64)
			goto tail;
		multipliers_4 = load_multipliers(mults[0]);
		multipliers_2 = load_multipliers(mults[1]);
		multipliers_1 = load_multipliers(mults[2]);
		/*
		 * Short length; don't bother aligning the pointer, and fold
		 * 64 bytes (4 vectors) at a time, at most.
		 */
		v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc));
		v1 = vld1q_u8(p + 16);
		v2 = vld1q_u8(p + 32);
		v3 = vld1q_u8(p + 48);
		p += 64;
		len -= 64;
		while (len >= 64) {
			v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4);
			v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4);
			v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4);
			v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4);
			p += 64;
			len -= 64;
		}
		v0 = fold_vec(v0, v2, multipliers_2);
		v1 = fold_vec(v1, v3, multipliers_2);
		if (len >= 32) {
			v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2);
			v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2);
			p += 32;
			len -= 32;
		}
		v0 = fold_vec(v0, v1, multipliers_1);
	} else {
		static const u64 _aligned_attribute(16) mults[4][2] = {
			{ CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */
			{ CRC32_X799_MODG, CRC32_X735_MODG },   /* 6 vecs */
			{ CRC32_X415_MODG, CRC32_X351_MODG },   /* 3 vecs */
			{ CRC32_X159_MODG, CRC32_X95_MODG },    /* 1 vecs */
		};
		const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
		const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
		const poly64x2_t multipliers_3 = load_multipliers(mults[2]);
		const poly64x2_t multipliers_1 = load_multipliers(mults[3]);
		const size_t align = -(uintptr_t)p & 15;
		const uint8x16_t *vp;

		/* Align p to the next 16-byte boundary. */
		if (align) {
			if (align & 1)
				crc = __crc32b(crc, *p++);
			if (align & 2) {
				crc = __crc32h(crc, le16_bswap(*(u16 *)p));
				p += 2;
			}
			if (align & 4) {
				crc = __crc32w(crc, le32_bswap(*(u32 *)p));
				p += 4;
			}
			if (align & 8) {
				crc = __crc32d(crc, le64_bswap(*(u64 *)p));
				p += 8;
			}
			len -= align;
		}
		vp = (const uint8x16_t *)p;
		v0 = veorq_u8(*vp++, u32_to_bytevec(crc));
		v1 = *vp++;
		v2 = *vp++;
		v3 = *vp++;
		v4 = *vp++;
		v5 = *vp++;
		v6 = *vp++;
		v7 = *vp++;
		v8 = *vp++;
		v9 = *vp++;
		v10 = *vp++;
		v11 = *vp++;
		len -= 192;
		/* Fold 192 bytes (12 vectors) at a time. */
		do {
			v0 = fold_vec(v0, *vp++, multipliers_12);
			v1 = fold_vec(v1, *vp++, multipliers_12);
			v2 = fold_vec(v2, *vp++, multipliers_12);
			v3 = fold_vec(v3, *vp++, multipliers_12);
			v4 = fold_vec(v4, *vp++, multipliers_12);
			v5 = fold_vec(v5, *vp++, multipliers_12);
			v6 = fold_vec(v6, *vp++, multipliers_12);
			v7 = fold_vec(v7, *vp++, multipliers_12);
			v8 = fold_vec(v8, *vp++, multipliers_12);
			v9 = fold_vec(v9, *vp++, multipliers_12);
			v10 = fold_vec(v10, *vp++, multipliers_12);
			v11 = fold_vec(v11, *vp++, multipliers_12);
			len -= 192;
		} while (len >= 192);

		/*
		 * Fewer than 192 bytes left. Fold v0-v11 down to just v0,
		 * while processing up to 144 more bytes.
		 */
		v0 = fold_vec(v0, v6, multipliers_6);
		v1 = fold_vec(v1, v7, multipliers_6);
		v2 = fold_vec(v2, v8, multipliers_6);
		v3 = fold_vec(v3, v9, multipliers_6);
		v4 = fold_vec(v4, v10, multipliers_6);
		v5 = fold_vec(v5, v11, multipliers_6);
		if (len >= 96) {
			v0 = fold_vec(v0, *vp++, multipliers_6);
			v1 = fold_vec(v1, *vp++, multipliers_6);
			v2 = fold_vec(v2, *vp++, multipliers_6);
			v3 = fold_vec(v3, *vp++, multipliers_6);
			v4 = fold_vec(v4, *vp++, multipliers_6);
			v5 = fold_vec(v5, *vp++, multipliers_6);
			len -= 96;
		}
		v0 = fold_vec(v0, v3, multipliers_3);
		v1 = fold_vec(v1, v4, multipliers_3);
		v2 = fold_vec(v2, v5, multipliers_3);
		if (len >= 48) {
			v0 = fold_vec(v0, *vp++, multipliers_3);
			v1 = fold_vec(v1, *vp++, multipliers_3);
			v2 = fold_vec(v2, *vp++, multipliers_3);
			len -= 48;
		}
		v0 = fold_vec(v0, v1, multipliers_1);
		v0 = fold_vec(v0, v2, multipliers_1);
		p = (const u8 *)vp;
	}
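
	/*
	 * At this point v0 holds a 128-bit value that is, in effect, congruent
	 * modulo the CRC-32 generator polynomial to all of the data folded so
	 * far (up to the shift applied by the folding constants). Since the
	 * arm crc32 instructions implement the same generator polynomial as
	 * gzip's CRC-32, the remaining 128 => 32 bit reduction can be done by
	 * CRC-ing the two 64-bit halves of v0 starting from a CRC of 0, which
	 * is what the next two instructions do.
	 */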
	/* Reduce 128 to 32 bits using crc32 instructions. */
	crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0));
	crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1));
tail:
	/* Finish up the remainder using crc32 instructions. */
	if (len & 32) {
		crc = __crc32d(crc, get_unaligned_le64(p + 0));
		crc = __crc32d(crc, get_unaligned_le64(p + 8));
		crc = __crc32d(crc, get_unaligned_le64(p + 16));
		crc = __crc32d(crc, get_unaligned_le64(p + 24));
		p += 32;
	}
	if (len & 16) {
		crc = __crc32d(crc, get_unaligned_le64(p + 0));
		crc = __crc32d(crc, get_unaligned_le64(p + 8));
		p += 16;
	}
	if (len & 8) {
		crc = __crc32d(crc, get_unaligned_le64(p));
		p += 8;
	}
	if (len & 4) {
		crc = __crc32w(crc, get_unaligned_le32(p));
		p += 4;
	}
	if (len & 2) {
		crc = __crc32h(crc, get_unaligned_le16(p));
		p += 2;
	}
	if (len & 1)
		crc = __crc32b(crc, *p);
	return crc;
}

#undef SUFFIX
#undef ATTRIBUTES
#undef ENABLE_EOR3
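
/*
 * Usage sketch (illustrative only; the names below are hypothetical
 * placeholders, not this library's actual dispatch code). The including
 * translation unit typically instantiates this template one or more times and
 * then picks an instantiation at runtime based on detected CPU features:
 *
 *	crc32_func_t impl = crc32_generic;	// hypothetical fallback
 *	if (cpu_has_pmull() && cpu_has_crc32())	// hypothetical feature checks
 *		impl = crc32_arm_pmullx12_crc;	// instantiated from this file
 *	crc = impl(crc, buf, nbytes);
 */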