/*
 * x86/crc32_pclmul_template.h - gzip CRC-32 with PCLMULQDQ instructions
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This file is a "template" for instantiating PCLMULQDQ-based crc32_x86
 * functions.  The "parameters" are:
 *
 * SUFFIX:
 *	Name suffix to append to all instantiated functions.
 * ATTRIBUTES:
 *	Target function attributes to use.  Must satisfy the dependencies of the
 *	other parameters as follows:
 *	   VL=16 && USE_AVX512=0: at least pclmul,sse4.1
 *	   VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
 *	   VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
 *	   VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
 *	   (Other combinations are not useful and have not been tested.)
 * VL:
 *	Vector length in bytes.  Must be 16, 32, or 64.
 * USE_AVX512:
 *	If 1, take advantage of AVX-512 features such as masking and the
 *	vpternlog instruction.  This doesn't enable the use of 512-bit vectors;
 *	the vector length is controlled by VL.  If 0, assume that the CPU might
 *	not support AVX-512.
 *
 * The overall algorithm used is CRC folding with carryless multiplication
 * instructions.  Note that the x86 crc32 instruction cannot be used, as it is
 * for a different polynomial, not the gzip one.  For an explanation of CRC
 * folding with carryless multiplication instructions, see
 * scripts/gen-crc32-consts.py and the following blog posts and papers:
 *
 * "An alternative exposition of crc32_4k_pclmulqdq"
 * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 *
 * The original pclmulqdq instruction does one 64x64 to 128-bit carryless
 * multiplication.  The VPCLMULQDQ feature added instructions that do two
 * parallel 64x64 to 128-bit carryless multiplications in combination with AVX
 * or AVX512VL, or four in combination with AVX512F.
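 *
 * As a rough illustration of how the parameters above are meant to be used
 * (this example is not part of the template: the suffix shown is made up, and
 * _target_attribute() is assumed to be a helper that expands to
 * __attribute__((target(...)))), a VL=64 instantiation could look like:
 *
 *	#define SUFFIX		_example_avx512_vl64
 *	#define ATTRIBUTES	_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 *	#define VL		64
 *	#define USE_AVX512	1
 *	#include "crc32_pclmul_template.h"
 *
 * Assuming ADD_SUFFIX() simply pastes SUFFIX onto its argument, this would
 * define a function named crc32_x86_example_avx512_vl64.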
 */

#if VL == 16
#  define vec_t			__m128i
#  define fold_vec		fold_vec128
#  define VLOADU(p)		_mm_loadu_si128((const void *)(p))
#  define VXOR(a, b)		_mm_xor_si128((a), (b))
#  define M128I_TO_VEC(a)	a
#  define MULTS_8V		_mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG)
#  define MULTS_4V		_mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG)
#  define MULTS_2V		_mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG)
#  define MULTS_1V		_mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG)
#elif VL == 32
#  define vec_t			__m256i
#  define fold_vec		fold_vec256
#  define VLOADU(p)		_mm256_loadu_si256((const void *)(p))
#  define VXOR(a, b)		_mm256_xor_si256((a), (b))
#  define M128I_TO_VEC(a)	_mm256_zextsi128_si256(a)
#  define MULTS(a, b)		_mm256_set_epi64x(a, b, a, b)
#  define MULTS_8V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
#  define MULTS_4V		MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
#  define MULTS_2V		MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
#  define MULTS_1V		MULTS(CRC32_X223_MODG, CRC32_X287_MODG)
#elif VL == 64
#  define vec_t			__m512i
#  define fold_vec		fold_vec512
#  define VLOADU(p)		_mm512_loadu_si512((const void *)(p))
#  define VXOR(a, b)		_mm512_xor_si512((a), (b))
#  define M128I_TO_VEC(a)	_mm512_zextsi128_si512(a)
#  define MULTS(a, b)		_mm512_set_epi64(a, b, a, b, a, b, a, b)
#  define MULTS_8V		MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
#  define MULTS_4V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
#  define MULTS_2V		MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
#  define MULTS_1V		MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
#else
#  error "unsupported vector length"
#endif

#undef fold_vec128
static forceinline ATTRIBUTES __m128i
ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i /* __v2du */ mults)
{
	dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x00));
	dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x11));
	return dst;
}
#define fold_vec128	ADD_SUFFIX(fold_vec128)

#if VL >= 32
#undef fold_vec256
static forceinline ATTRIBUTES __m256i
ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i /* __v4du */ mults)
{
#if USE_AVX512
	/* vpternlog with immediate 0x96 is a three-argument XOR. */
	return _mm256_ternarylogic_epi32(
			_mm256_clmulepi64_epi128(src, mults, 0x00),
			_mm256_clmulepi64_epi128(src, mults, 0x11),
			dst, 0x96);
#else
	return _mm256_xor_si256(
			_mm256_xor_si256(dst,
					 _mm256_clmulepi64_epi128(src, mults, 0x00)),
			_mm256_clmulepi64_epi128(src, mults, 0x11));
#endif
}
#define fold_vec256	ADD_SUFFIX(fold_vec256)
#endif /* VL >= 32 */

#if VL >= 64
#undef fold_vec512
static forceinline ATTRIBUTES __m512i
ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
{
	/* vpternlog with immediate 0x96 is a three-argument XOR. */
	return _mm512_ternarylogic_epi32(
			_mm512_clmulepi64_epi128(src, mults, 0x00),
			_mm512_clmulepi64_epi128(src, mults, 0x11),
			dst, 0x96);
}
#define fold_vec512	ADD_SUFFIX(fold_vec512)
#endif /* VL >= 64 */

/*
 * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
 * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
 * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
 * respectively.  Then fold x0 into x1 and return the result.
 * Assumes that 'p + len - 16' is in-bounds.
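 *
 * Purely as an illustration: with len = 3, x0 is 'x' left-shifted by 13 bytes,
 * so only the first 3 bytes of 'x' remain, now in the last 3 byte positions;
 * x1 is the last 13 bytes of 'x' followed by the 3 data bytes.  Those data
 * bytes are picked out of an unaligned 16-byte load at 'p + len - 16', which
 * is why that address must be in-bounds.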
 */
#undef fold_lessthan16bytes
static forceinline ATTRIBUTES __m128i
ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
				 __m128i /* __v2du */ mults_128b)
{
	__m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]);
	__m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]);
	__m128i x0, x1;

	/* x0 = x left-shifted by '16 - len' bytes */
	x0 = _mm_shuffle_epi8(x, lshift);

	/*
	 * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len'
	 * bytes) followed by the remaining data.
	 */
	x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift),
			     _mm_loadu_si128((const void *)(p + len - 16)),
			     /* msb 0/1 of each byte selects byte from arg1/2 */
			     rshift);

	return fold_vec128(x0, x1, mults_128b);
}
#define fold_lessthan16bytes	ADD_SUFFIX(fold_lessthan16bytes)

static ATTRIBUTES u32
ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
{
	/*
	 * mults_{N}v are the vectors of multipliers for folding across N vec_t
	 * vectors, i.e. N*VL*8 bits.  mults_128b are the two multipliers for
	 * folding across 128 bits.  mults_128b differs from mults_1v when
	 * VL != 16.  All multipliers are 64-bit, to match what pclmulqdq needs,
	 * but since this is for CRC-32 only their low 32 bits are nonzero.
	 * For more details, see scripts/gen-crc32-consts.py.
	 */
	const vec_t mults_8v = MULTS_8V;
	const vec_t mults_4v = MULTS_4V;
	const vec_t mults_2v = MULTS_2V;
	const vec_t mults_1v = MULTS_1V;
	const __m128i mults_128b =
		_mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
	const __m128i barrett_reduction_constants =
		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2,
			       CRC32_BARRETT_CONSTANT_1);
	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
	__m128i x0 = _mm_cvtsi32_si128(crc);
	__m128i x1;

	if (len < 8*VL) {
		if (len < VL) {
			STATIC_ASSERT(VL == 16 || VL == 32 || VL == 64);
			if (len < 16) {
#if USE_AVX512
				if (len < 4)
					return crc32_slice1(crc, p, len);
				/*
				 * Handle 4 <= len <= 15 bytes by doing a masked
				 * load, XOR'ing the current CRC with the first
				 * 4 bytes, left-shifting by '16 - len' bytes to
				 * align the result to the end of x0 (so that it
				 * becomes the low-order coefficients of a
				 * 128-bit polynomial), and then doing the usual
				 * reduction from 128 bits to 32 bits.
				 */
				x0 = _mm_xor_si128(
					x0, _mm_maskz_loadu_epi8((1 << len) - 1, p));
				x0 = _mm_shuffle_epi8(
					x0, _mm_loadu_si128((const void *)&shift_tab[len]));
				goto reduce_x0;
#else
				return crc32_slice1(crc, p, len);
#endif
			}
			/*
			 * Handle 16 <= len < VL bytes where VL is 32 or 64.
			 * Use 128-bit instructions so that these lengths aren't
			 * slower with VL > 16 than with VL=16.
			 */
			x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
			if (len >= 32) {
				x0 = fold_vec128(x0,
						 _mm_loadu_si128((const void *)(p + 16)),
						 mults_128b);
				if (len >= 48)
					x0 = fold_vec128(x0,
							 _mm_loadu_si128((const void *)(p + 32)),
							 mults_128b);
			}
			p += len & ~15;
			goto less_than_16_remaining;
		}
		v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
		if (len < 2*VL) {
			p += VL;
			goto less_than_vl_remaining;
		}
		v1 = VLOADU(p + 1*VL);
		if (len < 4*VL) {
			p += 2*VL;
			goto less_than_2vl_remaining;
		}
		v2 = VLOADU(p + 2*VL);
		v3 = VLOADU(p + 3*VL);
		p += 4*VL;
	} else {
		/*
		 * If the length is large and the pointer is misaligned, align
		 * it.  For smaller lengths, just take the misaligned load
		 * penalty.  Note that on recent x86 CPUs, vmovdqu with an
		 * aligned address is just as fast as vmovdqa, so there's no
		 * need to use vmovdqa in the main loop.
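		 *
		 * For instance (illustrative numbers only): with VL == 32 and
		 * 'p' sitting 5 bytes past a 32-byte boundary, 'align' below is
		 * 27, and those 27 bytes are handled with 128-bit operations (a
		 * 16-byte XOR-load plus an 11-byte fold_lessthan16bytes), after
		 * which the remaining vector loads are VL-aligned.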
		 */
		if (len > 65536 && ((uintptr_t)p & (VL-1))) {
			size_t align = -(uintptr_t)p & (VL-1);

			len -= align;
			x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
			p += 16;
			if (align & 15) {
				x0 = fold_lessthan16bytes(x0, p, align & 15,
							  mults_128b);
				p += align & 15;
				align &= ~15;
			}
			while (align) {
				x0 = fold_vec128(x0, *(const __m128i *)p,
						 mults_128b);
				p += 16;
				align -= 16;
			}
			v0 = M128I_TO_VEC(x0);
#  if VL == 32
			v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1);
#  elif VL == 64
			v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1);
			v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
#  endif
			p -= 16;
		} else {
			v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
		}
		v1 = VLOADU(p + 1*VL);
		v2 = VLOADU(p + 2*VL);
		v3 = VLOADU(p + 3*VL);
		v4 = VLOADU(p + 4*VL);
		v5 = VLOADU(p + 5*VL);
		v6 = VLOADU(p + 6*VL);
		v7 = VLOADU(p + 7*VL);
		p += 8*VL;

		/*
		 * This is the main loop, processing 8*VL bytes per iteration.
		 * 4*VL is usually enough and would result in smaller code, but
		 * Skylake and Cascade Lake need 8*VL to get full performance.
		 */
		while (len >= 16*VL) {
			v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_8v);
			v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_8v);
			v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_8v);
			v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_8v);
			v4 = fold_vec(v4, VLOADU(p + 4*VL), mults_8v);
			v5 = fold_vec(v5, VLOADU(p + 5*VL), mults_8v);
			v6 = fold_vec(v6, VLOADU(p + 6*VL), mults_8v);
			v7 = fold_vec(v7, VLOADU(p + 7*VL), mults_8v);
			p += 8*VL;
			len -= 8*VL;
		}

		/* Fewer than 8*VL bytes remain. */
		v0 = fold_vec(v0, v4, mults_4v);
		v1 = fold_vec(v1, v5, mults_4v);
		v2 = fold_vec(v2, v6, mults_4v);
		v3 = fold_vec(v3, v7, mults_4v);
		if (len & (4*VL)) {
			v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v);
			v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v);
			v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v);
			v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v);
			p += 4*VL;
		}
	}
	/* Fewer than 4*VL bytes remain. */
	v0 = fold_vec(v0, v2, mults_2v);
	v1 = fold_vec(v1, v3, mults_2v);
	if (len & (2*VL)) {
		v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v);
		v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v);
		p += 2*VL;
	}
less_than_2vl_remaining:
	/* Fewer than 2*VL bytes remain. */
	v0 = fold_vec(v0, v1, mults_1v);
	if (len & VL) {
		v0 = fold_vec(v0, VLOADU(p), mults_1v);
		p += VL;
	}
less_than_vl_remaining:
	/*
	 * Fewer than VL bytes remain.  Reduce v0 (length VL bytes) to x0
	 * (length 16 bytes) and fold in any 16-byte data segments that remain.
	 */
#if VL == 16
	x0 = v0;
#else
	{
#  if VL == 32
		__m256i y0 = v0;
#  else
		const __m256i mults_256b =
			_mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG,
					  CRC32_X223_MODG, CRC32_X287_MODG);
		__m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0),
					 _mm512_extracti64x4_epi64(v0, 1),
					 mults_256b);

		if (len & 32) {
			y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p),
					 mults_256b);
			p += 32;
		}
#  endif
		x0 = fold_vec128(_mm256_extracti128_si256(y0, 0),
				 _mm256_extracti128_si256(y0, 1), mults_128b);
	}
	if (len & 16) {
		x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p),
				 mults_128b);
		p += 16;
	}
#endif
less_than_16_remaining:
	len &= 15;

	/* Handle any remainder of 1 to 15 bytes. */
	if (len)
		x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
#if USE_AVX512
reduce_x0:
#endif
	/*
	 * Multiply the remaining 128-bit message polynomial 'x0' by x^32, then
	 * reduce it modulo the generator polynomial G.  This gives the CRC.
	 *
	 * This implementation matches that used in crc-pclmul-template.S from
	 * https://lore.kernel.org/r/20250210174540.161705-4-ebiggers@kernel.org/
	 * with the parameters n=32 and LSB_CRC=1 (what the gzip CRC uses).
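	 *
	 * (The final 32-bit CRC lands in bits 95:64 of x0, hence the extraction
	 * of dword index 2 at the end.)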
	 * See there for a detailed explanation of the math used here.
	 */
	x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, mults_128b, 0x10),
			   _mm_bsrli_si128(x0, 8));
	x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
	x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
	x0 = _mm_xor_si128(x0, x1);
	return _mm_extract_epi32(x0, 2);
}

#undef vec_t
#undef fold_vec
#undef VLOADU
#undef VXOR
#undef M128I_TO_VEC
#undef MULTS
#undef MULTS_8V
#undef MULTS_4V
#undef MULTS_2V
#undef MULTS_1V

#undef SUFFIX
#undef ATTRIBUTES
#undef VL
#undef USE_AVX512