/*
 * x86/adler32_template.h - template for vectorized Adler-32 implementations
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This file is a "template" for instantiating Adler-32 functions for x86.
 * The "parameters" are:
 *
 * SUFFIX:
 *	Name suffix to append to all instantiated functions.
 * ATTRIBUTES:
 *	Target function attributes to use.  Must satisfy the dependencies of
 *	the other parameters as follows:
 *	   VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
 *	   VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
 *	   VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
 *	   VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
 *	   VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
 *	  (Other combinations are not useful and have not been tested.)
 * VL:
 *	Vector length in bytes.  Must be 16, 32, or 64.
 * USE_VNNI:
 *	If 1, use the VNNI dot product based algorithm.
 *	If 0, use the legacy SSE2 and AVX2 compatible algorithm.
 * USE_AVX512:
 *	If 1, take advantage of AVX-512 features such as masking.  This doesn't
 *	enable the use of 512-bit vectors; the vector length is controlled by
 *	VL.  If 0, assume that the CPU might not support AVX-512.
 */
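/*
 * Illustration only (hypothetical values; the real instantiations live in the
 * architecture-specific Adler-32 code): an includer defines the parameters
 * above and then includes this file once per instantiation, e.g.:
 *
 *	#define SUFFIX		_avx2
 *	#define ATTRIBUTES	_target_attribute("avx2")
 *	#define VL		32
 *	#define USE_VNNI	0
 *	#define USE_AVX512	0
 *	#include "adler32_template.h"
 *
 * With SUFFIX defined as _avx2, the function defined below would be named
 * adler32_x86_avx2().
 */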
#if VL == 16
#  define vec_t			__m128i
#  define mask_t		u16
#  define LOG2_VL		4
#  define VADD8(a, b)		_mm_add_epi8((a), (b))
#  define VADD16(a, b)		_mm_add_epi16((a), (b))
#  define VADD32(a, b)		_mm_add_epi32((a), (b))
#  if USE_AVX512
#    define VDPBUSD(a, b, c)	_mm_dpbusd_epi32((a), (b), (c))
#  else
#    define VDPBUSD(a, b, c)	_mm_dpbusd_avx_epi32((a), (b), (c))
#  endif
#  define VLOAD(p)		_mm_load_si128((const void *)(p))
#  define VLOADU(p)		_mm_loadu_si128((const void *)(p))
#  define VMADD16(a, b)		_mm_madd_epi16((a), (b))
#  define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
#  define VMULLO32(a, b)	_mm_mullo_epi32((a), (b))
#  define VSAD8(a, b)		_mm_sad_epu8((a), (b))
#  define VSET1_8(a)		_mm_set1_epi8(a)
#  define VSET1_32(a)		_mm_set1_epi32(a)
#  define VSETZERO()		_mm_setzero_si128()
#  define VSLL32(a, b)		_mm_slli_epi32((a), (b))
#  define VUNPACKLO8(a, b)	_mm_unpacklo_epi8((a), (b))
#  define VUNPACKHI8(a, b)	_mm_unpackhi_epi8((a), (b))
#elif VL == 32
#  define vec_t			__m256i
#  define mask_t		u32
#  define LOG2_VL		5
#  define VADD8(a, b)		_mm256_add_epi8((a), (b))
#  define VADD16(a, b)		_mm256_add_epi16((a), (b))
#  define VADD32(a, b)		_mm256_add_epi32((a), (b))
#  if USE_AVX512
#    define VDPBUSD(a, b, c)	_mm256_dpbusd_epi32((a), (b), (c))
#  else
#    define VDPBUSD(a, b, c)	_mm256_dpbusd_avx_epi32((a), (b), (c))
#  endif
#  define VLOAD(p)		_mm256_load_si256((const void *)(p))
#  define VLOADU(p)		_mm256_loadu_si256((const void *)(p))
#  define VMADD16(a, b)		_mm256_madd_epi16((a), (b))
#  define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
#  define VMULLO32(a, b)	_mm256_mullo_epi32((a), (b))
#  define VSAD8(a, b)		_mm256_sad_epu8((a), (b))
#  define VSET1_8(a)		_mm256_set1_epi8(a)
#  define VSET1_32(a)		_mm256_set1_epi32(a)
#  define VSETZERO()		_mm256_setzero_si256()
#  define VSLL32(a, b)		_mm256_slli_epi32((a), (b))
#  define VUNPACKLO8(a, b)	_mm256_unpacklo_epi8((a), (b))
#  define VUNPACKHI8(a, b)	_mm256_unpackhi_epi8((a), (b))
#elif VL == 64
#  define vec_t			__m512i
#  define mask_t		u64
#  define LOG2_VL		6
#  define VADD8(a, b)		_mm512_add_epi8((a), (b))
#  define VADD16(a, b)		_mm512_add_epi16((a), (b))
#  define VADD32(a, b)		_mm512_add_epi32((a), (b))
#  define VDPBUSD(a, b, c)	_mm512_dpbusd_epi32((a), (b), (c))
#  define VLOAD(p)		_mm512_load_si512((const void *)(p))
#  define VLOADU(p)		_mm512_loadu_si512((const void *)(p))
#  define VMADD16(a, b)		_mm512_madd_epi16((a), (b))
#  define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
#  define VMULLO32(a, b)	_mm512_mullo_epi32((a), (b))
#  define VSAD8(a, b)		_mm512_sad_epu8((a), (b))
#  define VSET1_8(a)		_mm512_set1_epi8(a)
#  define VSET1_32(a)		_mm512_set1_epi32(a)
#  define VSETZERO()		_mm512_setzero_si512()
#  define VSLL32(a, b)		_mm512_slli_epi32((a), (b))
#  define VUNPACKLO8(a, b)	_mm512_unpacklo_epi8((a), (b))
#  define VUNPACKHI8(a, b)	_mm512_unpackhi_epi8((a), (b))
#else
#  error "unsupported vector length"
#endif

#define VADD32_3X(a, b, c)	VADD32(VADD32((a), (b)), (c))
#define VADD32_4X(a, b, c, d)	VADD32(VADD32((a), (b)), VADD32((c), (d)))
#define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e)))
#define VADD32_7X(a, b, c, d, e, f, g)	\
	VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g)))
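/*
 * Worked illustration (shuffle semantics assumed from the Intel intrinsics
 * documentation; not extra code): reduce_to_32bits() below sums the four
 * 32-bit lanes of a 128-bit vector v = [a, b, c, d] with two shuffle+add
 * steps:
 *
 *	v + shuffle(v, 0x31)  ->  lane 0 = a+b, lane 2 = c+d
 *	  + shuffle(., 0x02)  ->  lane 0 = (a+b) + (c+d)
 *
 * so lane 0 ends up holding a+b+c+d, which _mm_cvtsi128_si32() extracts.
 */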
/* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */
#undef reduce_to_32bits
static forceinline ATTRIBUTES void
ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
{
	__m128i v_s1_128, v_s2_128;
#if VL == 16
	{
		v_s1_128 = v_s1;
		v_s2_128 = v_s2;
	}
#else
	{
		__m256i v_s1_256, v_s2_256;
#if VL == 32
		v_s1_256 = v_s1;
		v_s2_256 = v_s2;
#else
		/* Reduce 512 bits to 256 bits. */
		v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0),
					    _mm512_extracti64x4_epi64(v_s1, 1));
		v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0),
					    _mm512_extracti64x4_epi64(v_s2, 1));
#endif
		/* Reduce 256 bits to 128 bits. */
		v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0),
					 _mm256_extracti128_si256(v_s1_256, 1));
		v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0),
					 _mm256_extracti128_si256(v_s2_256, 1));
	}
#endif

	/*
	 * Reduce 128 bits to 32 bits.
	 *
	 * If the bytes were summed into v_s1 using psadbw + paddd, then ignore
	 * the odd-indexed elements of v_s1_128 since they are zero.
	 */
#if USE_VNNI
	v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31));
#endif
	v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31));
	v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02));
	v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02));

	*s1_p += (u32)_mm_cvtsi128_si32(v_s1_128);
	*s2_p += (u32)_mm_cvtsi128_si32(v_s2_128);
}
#define reduce_to_32bits	ADD_SUFFIX(reduce_to_32bits)

static ATTRIBUTES u32
ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
{
#if USE_VNNI
	/* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
	static const u8 _aligned_attribute(VL) raw_mults[VL] = {
#if VL == 64
		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
#endif
#if VL >= 32
		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
#endif
		16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
	};
	const vec_t ones = VSET1_8(1);
#else
	/*
	 * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1].
	 * For VL==32 the ordering is weird because it has to match the way
	 * that vpunpcklbw and vpunpckhbw work on 128-bit lanes separately.
	 */
	static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = {
#if VL == 16
		{ 32, 31, 30, 29, 28, 27, 26, 25 },
		{ 24, 23, 22, 21, 20, 19, 18, 17 },
		{ 16, 15, 14, 13, 12, 11, 10, 9 },
		{ 8,  7,  6,  5,  4,  3,  2,  1 },
#elif VL == 32
		{ 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
		{ 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
		{ 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 },
		{ 24, 23, 22, 21, 20, 19, 18, 17, 8,  7,  6,  5,  4,  3,  2,  1 },
#else
#  error "unsupported parameters"
#endif
	};
	const vec_t mults_a = VLOAD(raw_mults[0]);
	const vec_t mults_b = VLOAD(raw_mults[1]);
	const vec_t mults_c = VLOAD(raw_mults[2]);
	const vec_t mults_d = VLOAD(raw_mults[3]);
#endif
	const vec_t zeroes = VSETZERO();
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;

	/*
	 * If the length is large and the pointer is misaligned, align it.
	 * For smaller lengths, just take the misaligned load penalty.
	 */
	if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) {
		do {
			s1 += *p++;
			s2 += s1;
			len--;
		} while ((uintptr_t)p & (VL-1));
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

#if USE_VNNI
	/*
	 * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
	 * AVX-VNNI.  vpdpbusd multiplies the unsigned bytes of one vector by
	 * the signed bytes of another vector and adds the sums in groups of 4
	 * to the 32-bit elements of a third vector.  We use it in two ways:
	 * multiplying the data bytes by a sequence like 64,63,62,...,1 for
	 * calculating part of s2, and multiplying the data bytes by an
	 * all-ones sequence 1,1,1,...,1 for calculating s1 and part of s2.
	 * The all-ones trick seems to be faster than the alternative of
	 * vpsadbw + vpaddd.
	 */
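	/*
	 * Hedged illustration (vpdpbusd semantics as assumed above, for one
	 * 32-bit lane j):
	 *
	 *	acc32[j] += a8[4*j+0]*b8[4*j+0] + a8[4*j+1]*b8[4*j+1] +
	 *		    a8[4*j+2]*b8[4*j+2] + a8[4*j+3]*b8[4*j+3]
	 *
	 * with the bytes of 'a' treated as unsigned and those of 'b' as
	 * signed.  E.g. with a = [10, 20, 30, 40, ...] and b = ones, lane 0
	 * of the accumulator gains 10+20+30+40 = 100.
	 */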
	while (len) {
		/*
		 * Calculate the length of the next data chunk such that s1 and
		 * s2 are guaranteed to not exceed UINT32_MAX.
		 */
		size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1));
		vec_t mults = VLOAD(raw_mults);
		vec_t v_s1 = zeroes;
		vec_t v_s2 = zeroes;

		s2 += s1 * n;
		len -= n;

		if (n >= 4*VL) {
			vec_t v_s1_b = zeroes;
			vec_t v_s1_c = zeroes;
			vec_t v_s1_d = zeroes;
			vec_t v_s2_b = zeroes;
			vec_t v_s2_c = zeroes;
			vec_t v_s2_d = zeroes;
			vec_t v_s1_sums = zeroes;
			vec_t v_s1_sums_b = zeroes;
			vec_t v_s1_sums_c = zeroes;
			vec_t v_s1_sums_d = zeroes;
			vec_t tmp0, tmp1;

			do {
				vec_t data_a = VLOADU(p + 0*VL);
				vec_t data_b = VLOADU(p + 1*VL);
				vec_t data_c = VLOADU(p + 2*VL);
				vec_t data_d = VLOADU(p + 3*VL);

				/*
				 * Workaround for gcc bug where it generates
				 * unnecessary move instructions
				 * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
				 */
#if GCC_PREREQ(1, 0)
				__asm__("" : "+v" (data_a), "+v" (data_b),
					     "+v" (data_c), "+v" (data_d));
#endif
				v_s2 = VDPBUSD(v_s2, data_a, mults);
				v_s2_b = VDPBUSD(v_s2_b, data_b, mults);
				v_s2_c = VDPBUSD(v_s2_c, data_c, mults);
				v_s2_d = VDPBUSD(v_s2_d, data_d, mults);
				v_s1_sums = VADD32(v_s1_sums, v_s1);
				v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b);
				v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c);
				v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d);
				v_s1 = VDPBUSD(v_s1, data_a, ones);
				v_s1_b = VDPBUSD(v_s1_b, data_b, ones);
				v_s1_c = VDPBUSD(v_s1_c, data_c, ones);
				v_s1_d = VDPBUSD(v_s1_d, data_d, ones);

				/* Same gcc bug workaround.  See above */
#if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32)
				__asm__("" : "+v" (v_s2), "+v" (v_s2_b),
					     "+v" (v_s2_c), "+v" (v_s2_d),
					     "+v" (v_s1_sums),
					     "+v" (v_s1_sums_b),
					     "+v" (v_s1_sums_c),
					     "+v" (v_s1_sums_d),
					     "+v" (v_s1), "+v" (v_s1_b),
					     "+v" (v_s1_c), "+v" (v_s1_d));
#endif
				p += 4*VL;
				n -= 4*VL;
			} while (n >= 4*VL);

			/*
			 * Reduce into v_s1 and v_s2 as follows:
			 *
			 * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d +
			 *	  (4*VL)*(v_s1_sums + v_s1_sums_b +
			 *		  v_s1_sums_c + v_s1_sums_d) +
			 *	  (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c
			 * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d
			 */
			tmp0 = VADD32(v_s1, v_s1_b);
			tmp1 = VADD32(v_s1, v_s1_c);
			v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b,
					      v_s1_sums_c, v_s1_sums_d);
			v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d);
			v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2),
					 VSLL32(tmp0, LOG2_VL + 1),
					 VSLL32(tmp1, LOG2_VL),
					 v_s2, v_s2_b, v_s2_c, v_s2_d);
		}

		/* Process the last 0 <= n < 4*VL bytes of the chunk. */
		if (n >= 2*VL) {
			const vec_t data_a = VLOADU(p + 0*VL);
			const vec_t data_b = VLOADU(p + 1*VL);

			v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
			v_s1 = VDPBUSD(v_s1, data_a, ones);
			v_s1 = VDPBUSD(v_s1, data_b, ones);
			v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
			v_s2 = VDPBUSD(v_s2, data_a, mults);
			v_s2 = VDPBUSD(v_s2, data_b, mults);
			p += 2*VL;
			n -= 2*VL;
		}
		if (n) {
			/* Process the last 0 < n < 2*VL bytes of the chunk. */
			vec_t data;

			v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
			mults = VADD8(mults, VSET1_8((int)n - VL));
			if (n > VL) {
				data = VLOADU(p);
				v_s1 = VDPBUSD(v_s1, data, ones);
				v_s2 = VDPBUSD(v_s2, data, mults);
				p += VL;
				n -= VL;
				mults = VADD8(mults, VSET1_8(-VL));
			}
			/*
			 * Process the last 0 < n <= VL bytes of the chunk.
			 * Utilize a masked load if it's available.
			 */
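			/*
			 * Hedged illustration (example values only): with
			 * VL == 32 and n == 5, (mask_t)-1 >> (VL - n) is
			 * 0xffffffff >> 27 == 0x1f, so only the low 5 bytes
			 * are loaded and the rest of the vector is zeroed.
			 */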
#if USE_AVX512
			data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
#else
			data = zeroes;
			memcpy(&data, p, n);
#endif
			v_s1 = VDPBUSD(v_s1, data, ones);
			v_s2 = VDPBUSD(v_s2, data, mults);
			p += n;
		}

		reduce_to_32bits(v_s1, v_s2, &s1, &s2);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}
#else /* USE_VNNI */
	/*
	 * This is Adler-32 for SSE2 and AVX2.
	 *
	 * To horizontally sum bytes, use psadbw + paddd, where one of the
	 * arguments to psadbw is all-zeroes.
	 *
	 * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
	 * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
	 * to sum, for each i across iterations, byte i into a corresponding
	 * 16-bit counter in v_byte_sums_*.  After the inner loop, use pmaddwd
	 * to multiply each counter by (2*VL - i), then add the products to s2.
	 *
	 * An alternative implementation would use pmaddubsw and pmaddwd in the
	 * inner loop to do (2*VL - i)*data[i] directly and add the products in
	 * groups of 4 to 32-bit counters.  However, on average that approach
	 * seems to be slower than the current approach which delays the
	 * multiplications.  Also, pmaddubsw requires SSSE3; the current
	 * approach keeps the implementation aligned between SSE2 and AVX2.
	 *
	 * The inner loop processes 2*VL bytes per iteration.  Increasing this
	 * to 4*VL doesn't seem to be helpful here.
	 */
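	/*
	 * Hedged illustration of the two key operations (semantics assumed
	 * from the Intel intrinsics documentation, one lane shown):
	 *
	 *	VSAD8(data, zeroes): each 64-bit lane gets the plain sum of its
	 *		8 data bytes, e.g. [1,2,3,4,5,6,7,8] -> 36.
	 *	VMADD16(counts, mults): each 32-bit lane j gets
	 *		counts16[2*j]*mults16[2*j] + counts16[2*j+1]*mults16[2*j+1],
	 *		which is how the delayed (2*VL - i) multiplications are
	 *		applied to the per-position byte counters after the loop.
	 */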
	while (len) {
		/*
		 * Calculate the length of the next data chunk such that s1 and
		 * s2 are guaranteed to not exceed UINT32_MAX, and every
		 * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
		 * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
		 * used with pmaddwd which does signed multiplication.  In the
		 * SSE2 case this limits chunks to 4096 bytes instead of 5536.
		 */
		size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
					MAX_CHUNK_LEN) & ~(2*VL - 1));

		len -= n;

		if (n >= 2*VL) {
			vec_t v_s1 = zeroes;
			vec_t v_s1_sums = zeroes;
			vec_t v_byte_sums_a = zeroes;
			vec_t v_byte_sums_b = zeroes;
			vec_t v_byte_sums_c = zeroes;
			vec_t v_byte_sums_d = zeroes;
			vec_t v_s2;

			s2 += s1 * (n & ~(2*VL - 1));

			do {
				vec_t data_a = VLOADU(p + 0*VL);
				vec_t data_b = VLOADU(p + 1*VL);

				v_s1_sums = VADD32(v_s1_sums, v_s1);
				v_byte_sums_a = VADD16(v_byte_sums_a,
						       VUNPACKLO8(data_a, zeroes));
				v_byte_sums_b = VADD16(v_byte_sums_b,
						       VUNPACKHI8(data_a, zeroes));
				v_byte_sums_c = VADD16(v_byte_sums_c,
						       VUNPACKLO8(data_b, zeroes));
				v_byte_sums_d = VADD16(v_byte_sums_d,
						       VUNPACKHI8(data_b, zeroes));
				v_s1 = VADD32(v_s1,
					      VADD32(VSAD8(data_a, zeroes),
						     VSAD8(data_b, zeroes)));

				/*
				 * Workaround for gcc bug where it generates
				 * unnecessary move instructions
				 * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
				 */
#if GCC_PREREQ(1, 0)
				__asm__("" : "+x" (v_s1), "+x" (v_s1_sums),
					     "+x" (v_byte_sums_a),
					     "+x" (v_byte_sums_b),
					     "+x" (v_byte_sums_c),
					     "+x" (v_byte_sums_d));
#endif
				p += 2*VL;
				n -= 2*VL;
			} while (n >= 2*VL);

			/*
			 * Calculate v_s2 as (2*VL)*v_s1_sums +
			 * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums.
			 * Then update s1 and s2 from v_s1 and v_s2.
			 */
			v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1),
					 VMADD16(v_byte_sums_a, mults_a),
					 VMADD16(v_byte_sums_b, mults_b),
					 VMADD16(v_byte_sums_c, mults_c),
					 VMADD16(v_byte_sums_d, mults_d));
			reduce_to_32bits(v_s1, v_s2, &s1, &s2);
		}
		/*
		 * Process the last 0 <= n < 2*VL bytes of the chunk using
		 * scalar instructions and reduce s1 and s2 mod DIVISOR.
		 */
		ADLER32_CHUNK(s1, s2, p, n);
	}
#endif /* !USE_VNNI */

	return (s2 << 16) | s1;
}

#undef vec_t
#undef mask_t
#undef LOG2_VL
#undef VADD8
#undef VADD16
#undef VADD32
#undef VDPBUSD
#undef VLOAD
#undef VLOADU
#undef VMADD16
#undef VMASKZ_LOADU
#undef VMULLO32
#undef VSAD8
#undef VSET1_8
#undef VSET1_32
#undef VSETZERO
#undef VSLL32
#undef VUNPACKLO8
#undef VUNPACKHI8

#undef SUFFIX
#undef ATTRIBUTES
#undef VL
#undef USE_VNNI
#undef USE_AVX512