/*
 * x86/cpu_features.c - feature detection for x86 CPUs
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "../cpu_features_common.h" /* must be included first */
#include "cpu_features.h"

#ifdef X86_CPU_FEATURES_KNOWN
/* Runtime x86 CPU feature detection is supported. */

/* Execute the CPUID instruction. */
static inline void
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
{
#ifdef _MSC_VER
	int result[4];

	__cpuidex(result, leaf, subleaf);
	*a = result[0];
	*b = result[1];
	*c = result[2];
	*d = result[3];
#else
	__asm__ volatile("cpuid" : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
			 : "a" (leaf), "c" (subleaf));
#endif
}

/* Read an extended control register. */
static inline u64
read_xcr(u32 index)
{
#ifdef _MSC_VER
	return _xgetbv(index);
#else
	u32 d, a;

	/*
	 * Execute the "xgetbv" instruction.  Old versions of binutils do not
	 * recognize this instruction, so list the raw bytes instead.
	 *
	 * This must be 'volatile' to prevent this code from being moved out
	 * from under the check for OSXSAVE.
	 */
	__asm__ volatile(".byte 0x0f, 0x01, 0xd0" :
			 "=d" (d), "=a" (a) : "c" (index));

	return ((u64)d << 32) | a;
#endif
}

static const struct cpu_feature x86_cpu_feature_table[] = {
	{X86_CPU_FEATURE_SSE2,		"sse2"},
	{X86_CPU_FEATURE_PCLMULQDQ,	"pclmulqdq"},
	{X86_CPU_FEATURE_AVX,		"avx"},
	{X86_CPU_FEATURE_AVX2,		"avx2"},
	{X86_CPU_FEATURE_BMI2,		"bmi2"},
	{X86_CPU_FEATURE_ZMM,		"zmm"},
	{X86_CPU_FEATURE_AVX512BW,	"avx512bw"},
	{X86_CPU_FEATURE_AVX512VL,	"avx512vl"},
	{X86_CPU_FEATURE_VPCLMULQDQ,	"vpclmulqdq"},
	{X86_CPU_FEATURE_AVX512VNNI,	"avx512_vnni"},
	{X86_CPU_FEATURE_AVXVNNI,	"avx_vnni"},
};

volatile u32 libdeflate_x86_cpu_features = 0;

static inline bool
os_supports_avx512(u64 xcr0)
{
#ifdef __APPLE__
	/*
	 * The Darwin kernel had a bug where it could corrupt the opmask
	 * registers.  See
	 * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
	 * Darwin also does not initially set the XCR0 bits for AVX512, but
	 * they are set if the thread tries to use AVX512 anyway.  Thus, to
	 * safely and consistently use AVX512 on macOS we'd need to check the
	 * kernel version as well as detect AVX512 support using a
	 * macOS-specific method.  We don't bother with this, especially given
	 * Apple's transition to arm64.
	 */
	return false;
#else
	return (xcr0 & 0xe6) == 0xe6;
#endif
}

/*
 * Don't use 512-bit vectors (ZMM registers) on Intel CPUs before Rocket Lake
 * and Sapphire Rapids, due to the overly-eager downclocking which can reduce
 * the performance of workloads that use ZMM registers only occasionally.
 */
static inline bool
allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model)
{
#ifdef TEST_SUPPORT__DO_NOT_USE
	return true;
#endif
	if (memcmp(manufacturer, "GenuineIntel", 12) != 0)
		return true;
	if (family != 6)
		return true;
	switch (model) {
	case 85:  /* Skylake (Server), Cascade Lake, Cooper Lake */
	case 106: /* Ice Lake (Server) */
	case 108: /* Ice Lake (Server) */
	case 126: /* Ice Lake (Client) */
	case 140: /* Tiger Lake */
	case 141: /* Tiger Lake */
		return false;
	}
	return true;
}

/* Initialize libdeflate_x86_cpu_features. */
void libdeflate_init_x86_cpu_features(void)
{
	u32 max_leaf;
	u32 manufacturer[3];
	u32 family, model;
	u32 a, b, c, d;
	u64 xcr0 = 0;
	u32 features = 0;

	/* EAX=0: Highest Function Parameter and Manufacturer ID */
	cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2],
	      &manufacturer[1]);
	if (max_leaf < 1)
		goto out;

	/* EAX=1: Processor Info and Feature Bits */
	cpuid(1, 0, &a, &b, &c, &d);
	family = (a >> 8) & 0xf;
	model = (a >> 4) & 0xf;
	if (family == 6 || family == 0xf)
		model += (a >> 12) & 0xf0;
	if (family == 0xf)
		family += (a >> 20) & 0xff;
	if (d & (1 << 26))
		features |= X86_CPU_FEATURE_SSE2;
	/*
	 * No known CPUs have pclmulqdq without sse4.1, so in practice code
	 * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
	 * explicitly check for both the pclmulqdq and sse4.1 bits.
	 */
	if ((c & (1 << 1)) && (c & (1 << 19)))
		features |= X86_CPU_FEATURE_PCLMULQDQ;
	if (c & (1 << 27))
		xcr0 = read_xcr(0);
	if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6))
		features |= X86_CPU_FEATURE_AVX;

	if (max_leaf < 7)
		goto out;

	/* EAX=7, ECX=0: Extended Features */
	cpuid(7, 0, &a, &b, &c, &d);
	if (b & (1 << 8))
		features |= X86_CPU_FEATURE_BMI2;
	if ((xcr0 & 0x6) == 0x6) {
		if (b & (1 << 5))
			features |= X86_CPU_FEATURE_AVX2;
		if (c & (1 << 10))
			features |= X86_CPU_FEATURE_VPCLMULQDQ;
	}
	if (os_supports_avx512(xcr0)) {
		if (allow_512bit_vectors(manufacturer, family, model))
			features |= X86_CPU_FEATURE_ZMM;
		if (b & (1 << 30))
			features |= X86_CPU_FEATURE_AVX512BW;
		if (b & (1U << 31))
			features |= X86_CPU_FEATURE_AVX512VL;
		if (c & (1 << 11))
			features |= X86_CPU_FEATURE_AVX512VNNI;
	}

	/* EAX=7, ECX=1: Extended Features */
	cpuid(7, 1, &a, &b, &c, &d);
	if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6))
		features |= X86_CPU_FEATURE_AVXVNNI;

out:
	disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
					 ARRAY_LEN(x86_cpu_feature_table));

	libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN;
}

#endif /* X86_CPU_FEATURES_KNOWN */
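/*
 * Usage sketch (illustrative, not part of the original file): OR'ing
 * X86_CPU_FEATURES_KNOWN into the cached bitmask above lets callers tell
 * "not yet initialized" (value 0) apart from "initialized, but no features
 * detected".  A caller might lazily initialize and then branch on a feature
 * flag roughly as follows; the helper name is hypothetical, not an actual
 * libdeflate symbol:
 *
 *	static u32 get_x86_cpu_features_example(void)
 *	{
 *		if (libdeflate_x86_cpu_features == 0)
 *			libdeflate_init_x86_cpu_features();
 *		return libdeflate_x86_cpu_features;
 *	}
 *
 *	...
 *	if (get_x86_cpu_features_example() & X86_CPU_FEATURE_AVX2)
 *		// dispatch to an AVX2 code path
 *	else
 *		// fall back to a generic or SSE2 code path
 *	...
 */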