/* * x86/crc32_impl.h - x86 implementations of the gzip CRC-32 algorithm * * Copyright 2016 Eric Biggers * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #ifndef LIB_X86_CRC32_IMPL_H #define LIB_X86_CRC32_IMPL_H #include "cpu_features.h" /* * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. */ static const u8 MAYBE_UNUSED shift_tab[48] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }; #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) /* * PCLMULQDQ implementation. This targets PCLMULQDQ+SSE4.1, since in practice * all CPUs that support PCLMULQDQ also support SSE4.1. */ # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq # define SUFFIX _pclmulqdq # define ATTRIBUTES _target_attribute("pclmul,sse4.1") # define VL 16 # define USE_AVX512 0 # include "crc32_pclmul_template.h" /* * PCLMULQDQ/AVX implementation. Same as above, but this is compiled with AVX * enabled so that the compiler can generate VEX-coded instructions which can be * slightly more efficient. It still uses 128-bit vectors. */ # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx # define SUFFIX _pclmulqdq_avx # define ATTRIBUTES _target_attribute("pclmul,avx") # define VL 16 # define USE_AVX512 0 # include "crc32_pclmul_template.h" #endif /* * VPCLMULQDQ/AVX2 implementation. This is used on CPUs that have AVX2 and * VPCLMULQDQ but don't have AVX-512, for example Intel Alder Lake. * * Currently this can't be enabled with MSVC because MSVC has a bug where it * incorrectly assumes that VPCLMULQDQ implies AVX-512: * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785 * * gcc 8.1 and 8.2 had a similar bug where they assumed that * _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3. * * _mm256_zextsi128_si256() requires gcc 10. */ #if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \ !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") # define VL 32 # define USE_AVX512 0 # include "crc32_pclmul_template.h" #endif #if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \ !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) /* * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog * instruction and more registers. This is used on certain older Intel CPUs, * specifically Ice Lake and Tiger Lake, which support VPCLMULQDQ and AVX512 but * downclock a bit too eagerly when ZMM registers are used. * * _mm256_zextsi128_si256() requires gcc 10. */ # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl") # define VL 32 # define USE_AVX512 1 # include "crc32_pclmul_template.h" /* * VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs * that have a good AVX-512 implementation including VPCLMULQDQ. * * _mm512_zextsi128_si512() requires gcc 10. */ # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl") # define VL 64 # define USE_AVX512 1 # include "crc32_pclmul_template.h" #endif static inline crc32_func_t arch_select_crc32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); #ifdef crc32_x86_vpclmulqdq_avx512_vl512 if ((features & X86_CPU_FEATURE_ZMM) && HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && HAVE_AVX512BW(features) && HAVE_AVX512VL(features)) return crc32_x86_vpclmulqdq_avx512_vl512; #endif #ifdef crc32_x86_vpclmulqdq_avx512_vl256 if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && HAVE_AVX512BW(features) && HAVE_AVX512VL(features)) return crc32_x86_vpclmulqdq_avx512_vl256; #endif #ifdef crc32_x86_vpclmulqdq_avx2 if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && HAVE_AVX2(features)) return crc32_x86_vpclmulqdq_avx2; #endif #ifdef crc32_x86_pclmulqdq_avx if (HAVE_PCLMULQDQ(features) && HAVE_AVX(features)) return crc32_x86_pclmulqdq_avx; #endif #ifdef crc32_x86_pclmulqdq if (HAVE_PCLMULQDQ(features)) return crc32_x86_pclmulqdq; #endif return NULL; } #define arch_select_crc32_func arch_select_crc32_func #endif /* LIB_X86_CRC32_IMPL_H */