$NetBSD$

* Part of patchset to build electron on NetBSD
* Based on OpenBSD's chromium patches, and FreeBSD's electron patches

--- third_party/lzma_sdk/C/CpuArch.c.orig	2025-02-24 19:59:26.000000000 +0000
+++ third_party/lzma_sdk/C/CpuArch.c
@@ -1,187 +1,357 @@
 /* CpuArch.c -- CPU specific code
-2021-07-13 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
+// #include <stdio.h>
+
 #include "CpuArch.h"
 
 #ifdef MY_CPU_X86_OR_AMD64
 
-#if (defined(_MSC_VER) && !defined(MY_CPU_AMD64)) || defined(__GNUC__)
-#define USE_ASM
+#undef NEED_CHECK_FOR_CPUID
+#if !defined(MY_CPU_AMD64)
+#define NEED_CHECK_FOR_CPUID
 #endif
 
-#if !defined(USE_ASM) && _MSC_VER >= 1500
-#include <intrin.h>
-#endif
+/*
+  The cpuid instruction supports a (subFunction) parameter in ECX,
+  which is used only with some specific (function) parameter values.
+  Most functions use only (subFunction==0).
+*/
+/*
+  __cpuid(): MSVC and GCC/CLANG use the same function/macro name,
+             but the parameters are different.
+  We use the MSVC __cpuid() parameter style for our z7_x86_cpuid() function.
+*/
+
+#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \
+    || defined(__clang__) /* && (__clang_major__ >= 10) */
+
+/* there were some CLANG/GCC compilers that had issues with
+   rbx(ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined).
+   The compiler's <cpuid.h> contains the macro __cpuid() that is similar to our code.
+   The history of __cpuid() changes in CLANG/GCC:
+   GCC:
+     2007: it preserved ebx for (__PIC__ && __i386__)
+     2013: it preserved rbx and ebx for __PIC__
+     2014: it doesn't preserve rbx and ebx anymore
+     we suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem.
+   CLANG:
+     2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check.
+   Why does CLANG care about 64-bit mode only, and not about ebx (in 32-bit)?
+   Do we need a __PIC__ test for CLANG, or must we care about rbx even if
+   __PIC__ is not defined?
+*/
+
+#define ASM_LN "\n"
+
+#if defined(MY_CPU_AMD64) && defined(__PIC__) \
+    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
+
+  /* "=&r" selects a free register. It can select even rbx, if that register is free.
+     "=&D" for (RDI) also works, but the code can be larger with "=&D".
+     "2"(subFun) : 2 is the (zero-based) index in the output constraint list "=c" (ECX).
+  */
+
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
+  __asm__ __volatile__ ( \
+    ASM_LN   "mov     %%rbx, %q1"  \
+    ASM_LN   "cpuid"               \
+    ASM_LN   "xchg    %%rbx, %q1"  \
+    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
+
+#elif defined(MY_CPU_X86) && defined(__PIC__) \
+    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
+
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
+  __asm__ __volatile__ ( \
+    ASM_LN   "mov     %%ebx, %k1"  \
+    ASM_LN   "cpuid"               \
+    ASM_LN   "xchg    %%ebx, %k1"  \
+    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
 
-#if defined(USE_ASM) && !defined(MY_CPU_AMD64)
-static UInt32 CheckFlag(UInt32 flag)
-{
-  #ifdef _MSC_VER
-  __asm pushfd;
-  __asm pop EAX;
-  __asm mov EDX, EAX;
-  __asm xor EAX, flag;
-  __asm push EAX;
-  __asm popfd;
-  __asm pushfd;
-  __asm pop EAX;
-  __asm xor EAX, EDX;
-  __asm push EDX;
-  __asm popfd;
-  __asm and flag, EAX;
-  #else
-  __asm__ __volatile__ (
-    "pushf\n\t"
-    "pop %%EAX\n\t"
-    "movl %%EAX,%%EDX\n\t"
-    "xorl %0,%%EAX\n\t"
-    "push %%EAX\n\t"
-    "popf\n\t"
-    "pushf\n\t"
-    "pop %%EAX\n\t"
-    "xorl %%EDX,%%EAX\n\t"
-    "push %%EDX\n\t"
-    "popf\n\t"
-    "andl %%EAX, %0\n\t":
-    "=c" (flag) : "c" (flag) :
-    "%eax", "%edx");
-  #endif
-  return flag;
-}
-#define CHECK_CPUID_IS_SUPPORTED if (CheckFlag(1 << 18) == 0 || CheckFlag(1 << 21) == 0) return False;
 #else
-#define CHECK_CPUID_IS_SUPPORTED
+
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
+  __asm__ __volatile__ ( \
+    ASM_LN   "cpuid"               \
+    : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
+
 #endif
 
-#ifndef USE_ASM
-  #ifdef _MSC_VER
-  #if _MSC_VER >= 1600
-    #define MY__cpuidex  __cpuidex
-  #else
+#define x86_cpuid_MACRO(p, func)  x86_cpuid_MACRO_2(p, func, 0)
 
-/*
- __cpuid (function == 4) requires subfunction number in ECX.
-  MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.
-   __cpuid() in new MSVC clears ECX.
-   __cpuid() in old MSVC (14.00) doesn't clear ECX
- We still can use __cpuid for low (function) values that don't require ECX,
- but __cpuid() in old MSVC will be incorrect for some function values: (function == 4).
- So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,
- where ECX value is first parameter for FAST_CALL / NO_INLINE function,
- So the caller of MY__cpuidex_HACK() sets ECX as subFunction, and
-   old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.
-
-DON'T remove MY_NO_INLINE and MY_FAST_CALL for MY__cpuidex_HACK() !!!
-*/
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+  x86_cpuid_MACRO(p, func)
+}
 
 static
-MY_NO_INLINE
-void MY_FAST_CALL MY__cpuidex_HACK(UInt32 subFunction, int *CPUInfo, UInt32 function)
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
 {
-  UNUSED_VAR(subFunction);
-  __cpuid(CPUInfo, function);
+  x86_cpuid_MACRO_2(p, func, subFunc)
 }
 
-  #define MY__cpuidex(info, func, func2)  MY__cpuidex_HACK(func2, info, func)
-  #pragma message("======== MY__cpuidex_HACK WAS USED ========")
-  #endif
-  #else
-  #define MY__cpuidex(info, func, func2)  __cpuid(info, func)
-  #pragma message("======== (INCORRECT ?) cpuid WAS USED ========")
-  #endif
-#endif
 
+Z7_NO_INLINE
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+  #if defined(NEED_CHECK_FOR_CPUID)
+  #define EFALGS_CPUID_BIT 21
+  UInt32 a;
+  __asm__ __volatile__ (
+    ASM_LN   "pushf"
+    ASM_LN   "pushf"
+    ASM_LN   "pop     %0"
+    // ASM_LN   "movl    %0, %1"
+    // ASM_LN   "xorl    $0x200000, %0"
+    ASM_LN   "btc     %1, %0"
+    ASM_LN   "push    %0"
+    ASM_LN   "popf"
+    ASM_LN   "pushf"
+    ASM_LN   "pop     %0"
+    ASM_LN   "xorl    (%%esp), %0"
+
+    ASM_LN   "popf"
+    ASM_LN
+    : "=&r" (a) // "=a"
+    : "i" (EFALGS_CPUID_BIT)
+    );
+  if ((a & (1 << EFALGS_CPUID_BIT)) == 0)
+    return 0;
+  #endif
+  {
+    UInt32 p[4];
+    x86_cpuid_MACRO(p, 0)
+    return p[0];
+  }
+}
+
+#undef ASM_LN
 
+#elif !defined(_MSC_VER)
 
-void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d)
+/*
+// for gcc/clang and other: we can try to use the __cpuid macro:
+#include <cpuid.h>
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
 {
-  #ifdef USE_ASM
-
-  #ifdef _MSC_VER
+  __cpuid(func, p[0], p[1], p[2], p[3]);
+}
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+  return (UInt32)__get_cpuid_max(0, NULL);
+}
+*/
+// for unsupported cpuid:
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+  UNUSED_VAR(func)
+  p[0] = p[1] = p[2] = p[3] = 0;
+}
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+  return 0;
+}
 
-  UInt32 a2, b2, c2, d2;
-  __asm xor EBX, EBX;
-  __asm xor ECX, ECX;
-  __asm xor EDX, EDX;
-  __asm mov EAX, function;
-  __asm cpuid;
-  __asm mov a2, EAX;
-  __asm mov b2, EBX;
-  __asm mov c2, ECX;
-  __asm mov d2, EDX;
-
-  *a = a2;
-  *b = b2;
-  *c = c2;
-  *d = d2;
+#else // _MSC_VER
 
-  #else
+#if !defined(MY_CPU_AMD64)
 
-  __asm__ __volatile__ (
-  #if defined(MY_CPU_AMD64) && defined(__PIC__)
-    "mov %%rbx, %%rdi;"
-    "cpuid;"
-    "xchg %%rbx, %%rdi;"
-    : "=a" (*a) ,
-      "=D" (*b) ,
-  #elif defined(MY_CPU_X86) && defined(__PIC__)
-    "mov %%ebx, %%edi;"
-    "cpuid;"
-    "xchgl %%ebx, %%edi;"
-    : "=a" (*a) ,
-      "=D" (*b) ,
-  #else
-    "cpuid"
-    : "=a" (*a) ,
-      "=b" (*b) ,
+UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+  #if defined(NEED_CHECK_FOR_CPUID)
+  #define EFALGS_CPUID_BIT 21
+  __asm   pushfd
+  __asm   pushfd
+  /*
+  __asm   pop     eax
+  // __asm   mov     edx, eax
+  __asm   btc     eax, EFALGS_CPUID_BIT
+  __asm   push    eax
+  */
+  __asm   btc     dword ptr [esp], EFALGS_CPUID_BIT
+  __asm   popfd
+  __asm   pushfd
+  __asm   pop     eax
+  // __asm   xor     eax, edx
+  __asm   xor     eax, [esp]
+  // __asm   push    edx
+  __asm   popfd
+  __asm   and     eax, (1 shl EFALGS_CPUID_BIT)
+  __asm   jz      end_func
   #endif
-    "=c" (*c) ,
-    "=d" (*d)
-  : "0" (function), "c"(0) ) ;
-
+  __asm   push    ebx
+  __asm   xor     eax, eax    // func
+  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
+  __asm   cpuid
+  __asm   pop     ebx
+  #if defined(NEED_CHECK_FOR_CPUID)
+  end_func:
   #endif
-
-  #else
+  __asm   ret 0
+}
+
+void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+  UNUSED_VAR(p)
+  UNUSED_VAR(func)
+  __asm   push    ebx
+  __asm   push    edi
+  __asm   mov     edi, ecx    // p
+  __asm   mov     eax, edx    // func
+  __asm   xor     ecx, ecx    // subfunction (optional) for (func == 0)
+  __asm   cpuid
+  __asm   mov     [edi     ], eax
+  __asm   mov     [edi +  4], ebx
+  __asm   mov     [edi +  8], ecx
+  __asm   mov     [edi + 12], edx
+  __asm   pop     edi
+  __asm   pop     ebx
+  __asm   ret     0
+}
 
-  int CPUInfo[4];
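
* Editor's note (illustrative sketch, not part of the upstream patch): the
* naked functions above rely on the x86 __fastcall convention, where the
* first two integer arguments arrive in ECX and EDX. So a call like
*
*     UInt32 p[4];
*     z7_x86_cpuid(p, 1);    // at entry: ECX = p, EDX = 1 (func)
*
* lets the wrapper store the cpuid results EAX/EBX/ECX/EDX into p[0..3].
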
+static
+void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  UNUSED_VAR(p)
+  UNUSED_VAR(func)
+  UNUSED_VAR(subFunc)
+  __asm   push    ebx
+  __asm   push    edi
+  __asm   mov     edi, ecx    // p
+  __asm   mov     eax, edx    // func
+  __asm   mov     ecx, [esp + 12]  // subFunc
+  __asm   cpuid
+  __asm   mov     [edi     ], eax
+  __asm   mov     [edi +  4], ebx
+  __asm   mov     [edi +  8], ecx
+  __asm   mov     [edi + 12], edx
+  __asm   pop     edi
+  __asm   pop     ebx
+  __asm   ret     4
+}
 
-  MY__cpuidex(CPUInfo, (int)function, 0);
+#else // MY_CPU_AMD64
 
-  *a = (UInt32)CPUInfo[0];
-  *b = (UInt32)CPUInfo[1];
-  *c = (UInt32)CPUInfo[2];
-  *d = (UInt32)CPUInfo[3];
+    #if _MSC_VER >= 1600
+      #include <intrin.h>
+      #define MY_cpuidex  __cpuidex
 
-  #endif
+static
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  __cpuidex((int *)p, func, subFunc);
+}
 
+    #else
+/*
+  __cpuid (func == (0 or 7)) requires the subfunction number in ECX.
+  MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.
+  __cpuid() in new MSVC clears ECX.
+  __cpuid() in old MSVC (14.00) x64 doesn't clear ECX.
+  We still can use __cpuid for low (func) values that don't require ECX,
+  but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
+  So here we use the hack for old MSVC to send (subFunction) in the ECX register to the cpuid instruction,
+  where the ECX value is the first parameter of a FASTCALL / NO_INLINE func.
+  So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and
+  old MSVC for __cpuid() doesn't change ECX, so the cpuid instruction gets the (subFunction) value.
+
+DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!!
+*/
+static
+Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo)
+{
+  UNUSED_VAR(subFunction)
+  __cpuid(CPUInfo, func);
 }
+    #define MY_cpuidex(info, func, func2)  MY_cpuidex_HACK(func2, func, info)
+    #pragma message("======== MY_cpuidex_HACK WAS USED ========")
+static
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
+}
+    #endif // _MSC_VER >= 1600
 
-BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p)
+#if !defined(MY_CPU_AMD64)
+/* inlining for __cpuid() in MSVC x86 (32-bit) produces big ineffective code,
+   so we disable inlining here */
+Z7_NO_INLINE
+#endif
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+  MY_cpuidex((Int32 *)p, (Int32)func, 0);
+}
+
+Z7_NO_INLINE
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+  Int32 a[4];
+  MY_cpuidex(a, 0, 0);
+  return a[0];
+}
+
+#endif // MY_CPU_AMD64
+#endif // _MSC_VER
+
+#if defined(NEED_CHECK_FOR_CPUID)
+#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; }
+#else
+#define CHECK_CPUID_IS_SUPPORTED
+#endif
+#undef NEED_CHECK_FOR_CPUID
+
+
+static
+BoolInt x86cpuid_Func_1(UInt32 *p)
 {
   CHECK_CPUID_IS_SUPPORTED
-  MyCPUID(0, &p->maxFunc, &p->vendor[0], &p->vendor[2], &p->vendor[1]);
-  MyCPUID(1, &p->ver, &p->b, &p->c, &p->d);
+  z7_x86_cpuid(p, 1);
   return True;
 }
 
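
* Editor's note (illustrative sketch, not part of the upstream patch; assumes
* <string.h> for memcpy): cpuid leaf 0 returns the max leaf in EAX and the
* vendor string in EBX,EDX,ECX order, so after z7_x86_cpuid(p, 0) the pieces
* are p[1], p[3], p[2]:
*
*     UInt32 p[4];
*     char vendor[13];
*     z7_x86_cpuid(p, 0);            // p[0] = max supported leaf
*     memcpy(vendor + 0, &p[1], 4);  // EBX, e.g. "Genu"
*     memcpy(vendor + 4, &p[3], 4);  // EDX, e.g. "ineI"
*     memcpy(vendor + 8, &p[2], 4);  // ECX, e.g. "ntel"
*     vendor[12] = 0;
*
* This is also why the commented-out kVendors table below compares only
* p->vendor[0], the first 4 bytes ("Genu", "Auth", "Cent").
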
-static const UInt32 kVendors[][3] =
+/*
+static const UInt32 kVendors[][1] =
 {
-  { 0x756E6547, 0x49656E69, 0x6C65746E},
-  { 0x68747541, 0x69746E65, 0x444D4163},
-  { 0x746E6543, 0x48727561, 0x736C7561}
+  { 0x756E6547 }, // , 0x49656E69, 0x6C65746E },
+  { 0x68747541 }, // , 0x69746E65, 0x444D4163 },
+  { 0x746E6543 }  // , 0x48727561, 0x736C7561 }
 };
+*/
+
+/*
+typedef struct
+{
+  UInt32 maxFunc;
+  UInt32 vendor[3];
+  UInt32 ver;
+  UInt32 b;
+  UInt32 c;
+  UInt32 d;
+} Cx86cpuid;
+
+enum
+{
+  CPU_FIRM_INTEL,
+  CPU_FIRM_AMD,
+  CPU_FIRM_VIA
+};
+int x86cpuid_GetFirm(const Cx86cpuid *p);
+#define x86cpuid_ver_GetFamily(ver)   (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))
+#define x86cpuid_ver_GetModel(ver)    (((ver >> 12) &  0xf0) | ((ver >> 4) & 0xf))
+#define x86cpuid_ver_GetStepping(ver) (ver & 0xf)
 
 int x86cpuid_GetFirm(const Cx86cpuid *p)
 {
   unsigned i;
-  for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[i]); i++)
+  for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++)
   {
     const UInt32 *v = kVendors[i];
-    if (v[0] == p->vendor[0] &&
-        v[1] == p->vendor[1] &&
-        v[2] == p->vendor[2])
+    if (v[0] == p->vendor[0]
+        // && v[1] == p->vendor[1]
+        // && v[2] == p->vendor[2]
+        )
       return (int)i;
   }
   return -1;
 }
 
@@ -190,41 +360,55 @@ int x86cpuid_GetFirm(const Cx86cpuid *p)
 
 BoolInt CPU_Is_InOrder()
 {
   Cx86cpuid p;
-  int firm;
   UInt32 family, model;
   if (!x86cpuid_CheckAndRead(&p))
     return True;
 
-  family = x86cpuid_GetFamily(p.ver);
-  model = x86cpuid_GetModel(p.ver);
-
-  firm = x86cpuid_GetFirm(&p);
+  family = x86cpuid_ver_GetFamily(p.ver);
+  model = x86cpuid_ver_GetModel(p.ver);
 
-  switch (firm)
+  switch (x86cpuid_GetFirm(&p))
   {
     case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && (
-        /* In-Order Atom CPU */
-           model == 0x1C  /* 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330 */
-        || model == 0x26  /* 45 nm, Z6xx */
-        || model == 0x27  /* 32 nm, Z2460 */
-        || model == 0x35  /* 32 nm, Z2760 */
-        || model == 0x36  /* 32 nm, N2xxx, D2xxx */
+        // In-Order Atom CPU
+           model == 0x1C  // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330
+        || model == 0x26  // 45 nm, Z6xx
+        || model == 0x27  // 32 nm, Z2460
+        || model == 0x35  // 32 nm, Z2760
+        || model == 0x36  // 32 nm, N2xxx, D2xxx
        )));
     case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA)));
     case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF));
   }
-  return True;
+  return False; // v23 : unknown processors are not In-Order
 }
+*/
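
* Editor's note (worked example, not part of the upstream patch): with the
* commented-out macros above and the CPUID.1 EAX value ver = 0x000906EA
* (a Coffee Lake part):
*
*     family   = ((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf) = 0x000 | 0x6 = 6
*     model    = ((ver >> 12) &  0xf0) | ((ver >> 4) & 0xf) = 0x90  | 0xE = 0x9E
*     stepping = ver & 0xf                                  = 0xA
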
+
+#ifdef _WIN32
+#include "7zWindows.h"
+#endif
 
 #if !defined(MY_CPU_AMD64) && defined(_WIN32)
-#include <Windows.h>
-static BoolInt CPU_Sys_Is_SSE_Supported()
+
+/* for legacy SSE ia32: there is no user-space cpu instruction to check
+   that the OS supports SSE register storing/restoring on context switches.
+   So we need some OS-specific function to check that it's safe to use SSE registers.
+*/
+
+Z7_FORCE_INLINE
+static BoolInt CPU_Sys_Is_SSE_Supported(void)
 {
-  OSVERSIONINFO vi;
-  vi.dwOSVersionInfoSize = sizeof(vi);
-  if (!GetVersionEx(&vi))
-    return False;
-  return (vi.dwMajorVersion >= 5);
+#ifdef _MSC_VER
+  #pragma warning(push)
+  #pragma warning(disable : 4996) // `GetVersion': was declared deprecated
+#endif
+  /* the low byte is the major version of Windows.
+     We suppose that any Windows version since
+     Windows2000 (major == 5) supports SSE registers */
+  return (Byte)GetVersion() >= 5;
+#if defined(_MSC_VER)
+  #pragma warning(pop)
+#endif
 }
 #define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False;
 #else
@@ -232,117 +416,364 @@ static BoolInt CPU_Sys_Is_SSE_Supported(
 #endif
 
 
-static UInt32 X86_CPUID_ECX_Get_Flags()
+#if !defined(MY_CPU_AMD64)
+
+BoolInt CPU_IsSupported_CMOV(void)
 {
-  Cx86cpuid p;
+  UInt32 a[4];
+  if (!x86cpuid_Func_1(&a[0]))
+    return 0;
+  return (BoolInt)(a[3] >> 15) & 1;
+}
+
+BoolInt CPU_IsSupported_SSE(void)
+{
+  UInt32 a[4];
   CHECK_SYS_SSE_SUPPORT
-  if (!x86cpuid_CheckAndRead(&p))
+  if (!x86cpuid_Func_1(&a[0]))
+    return 0;
+  return (BoolInt)(a[3] >> 25) & 1;
+}
+
+BoolInt CPU_IsSupported_SSE2(void)
+{
+  UInt32 a[4];
+  CHECK_SYS_SSE_SUPPORT
+  if (!x86cpuid_Func_1(&a[0]))
     return 0;
-  return p.c;
+  return (BoolInt)(a[3] >> 26) & 1;
 }
 
-BoolInt CPU_IsSupported_AES()
+#endif
+
+
+static UInt32 x86cpuid_Func_1_ECX(void)
 {
-  return (X86_CPUID_ECX_Get_Flags() >> 25) & 1;
+  UInt32 a[4];
+  CHECK_SYS_SSE_SUPPORT
+  if (!x86cpuid_Func_1(&a[0]))
+    return 0;
+  return a[2];
 }
 
-BoolInt CPU_IsSupported_SSSE3()
+BoolInt CPU_IsSupported_AES(void)
 {
-  return (X86_CPUID_ECX_Get_Flags() >> 9) & 1;
+  return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1;
}
 
-BoolInt CPU_IsSupported_SSE41()
+BoolInt CPU_IsSupported_SSSE3(void)
 {
-  return (X86_CPUID_ECX_Get_Flags() >> 19) & 1;
+  return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1;
 }
 
-BoolInt CPU_IsSupported_SHA()
+BoolInt CPU_IsSupported_SSE41(void)
+{
+  return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1;
+}
+
+BoolInt CPU_IsSupported_SHA(void)
 {
-  Cx86cpuid p;
   CHECK_SYS_SSE_SUPPORT
-  if (!x86cpuid_CheckAndRead(&p))
+
+  if (z7_x86_cpuid_GetMaxFunc() < 7)
     return False;
+  {
+    UInt32 d[4];
+    z7_x86_cpuid(d, 7);
+    return (BoolInt)(d[1] >> 29) & 1;
+  }
+}
+
+
+BoolInt CPU_IsSupported_SHA512(void)
+{
+  if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here
 
-  if (p.maxFunc < 7)
+  if (z7_x86_cpuid_GetMaxFunc() < 7)
     return False;
   {
-    UInt32 d[4] = { 0 };
-    MyCPUID(7, &d[0], &d[1], &d[2], &d[3]);
-    return (d[1] >> 29) & 1;
+    UInt32 d[4];
+    z7_x86_cpuid_subFunc(d, 7, 0);
+    if (d[0] < 1) // d[0] - is max supported subleaf value
+      return False;
+    z7_x86_cpuid_subFunc(d, 7, 1);
+    return (BoolInt)(d[0]) & 1;
   }
 }
 
-// #include <stdio.h>
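
* Editor's note (illustrative sketch, not part of the upstream patch): cpuid
* leaf 7 is the structured extended feature leaf and takes an explicit
* subleaf in ECX; subleaf 0 reports the maximum supported subleaf in EAX,
* which is why CPU_IsSupported_SHA512() above probes subleaf 0 before
* reading subleaf 1:
*
*     UInt32 d[4];
*     z7_x86_cpuid_subFunc(d, 7, 0);    // d[0] = max subleaf for leaf 7
*     if (d[0] >= 1)
*       z7_x86_cpuid_subFunc(d, 7, 1);  // EAX bit 0 = SHA512 support
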
+/*
+MSVC: _xgetbv() intrinsic is available since VS2010SP1.
+  MSVC also defines the (_XCR_XFEATURE_ENABLED_MASK) macro in
+  <immintrin.h> that we can use or check.
+  For any 32-bit x86 we can use asm code in MSVC,
+  but MSVC asm code is huge after compilation.
+  So _xgetbv() is better.
+
+ICC: _xgetbv() intrinsic is available (in what version of ICC?)
+  ICC defines (__GNUC__) and it supports gnu assembler.
+  Also ICC supports MASM style code with the -use-msasm switch.
+  But ICC doesn't support __attribute__((__target__)).
+
+GCC/CLANG 9:
+  _xgetbv() is a macro that works via __builtin_ia32_xgetbv()
+  and we need __attribute__((__target__("xsave"))).
+  But with __target__("xsave") the function will not be
+  inlined into a function that has no __target__("xsave") attribute.
+  If we want _xgetbv() call inlining, then we should use the asm version
+  instead of calling _xgetbv().
+  Note: the intrinsic is broken before GCC 8.2:
+    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684
+*/
 
-#ifdef _WIN32
-#include <Windows.h>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \
+    || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \
+    || defined(__GNUC__) && (__GNUC__ >= 9) \
+    || defined(__clang__) && (__clang_major__ >= 9)
+// we define ATTRIB_XGETBV, if we want to use predefined _xgetbv() from compiler
+#if defined(__INTEL_COMPILER)
+#define ATTRIB_XGETBV
+#elif defined(__GNUC__) || defined(__clang__)
+// we don't define ATTRIB_XGETBV here, because asm version is better for inlining.
+// #define ATTRIB_XGETBV __attribute__((__target__("xsave")))
+#else
+#define ATTRIB_XGETBV
+#endif
 #endif
 
+#if defined(ATTRIB_XGETBV)
+#include <immintrin.h>
+#endif
+
+
+// XFEATURE_ENABLED_MASK/XCR0
+#define MY_XCR_XFEATURE_ENABLED_MASK 0
+
+#if defined(ATTRIB_XGETBV)
+ATTRIB_XGETBV
+#endif
+static UInt64 x86_xgetbv_0(UInt32 num)
-BoolInt CPU_IsSupported_AVX2()
 {
-  Cx86cpuid p;
-  CHECK_SYS_SSE_SUPPORT
+#if defined(ATTRIB_XGETBV)
+  {
+    return
+      #if (defined(_MSC_VER))
+        _xgetbv(num);
+      #else
+        __builtin_ia32_xgetbv(
+          #if !defined(__clang__)
+            (int)
+          #endif
+            num);
+      #endif
+  }
+
+#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)
+
+  UInt32 a, d;
+ #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+  __asm__
+  (
+    "xgetbv"
+    : "=a"(a), "=d"(d) : "c"(num) : "cc"
+  );
+ #else // is old gcc
+  __asm__
+  (
+    ".byte 0x0f, 0x01, 0xd0" "\n\t"
+    : "=a"(a), "=d"(d) : "c"(num) : "cc"
+  );
+ #endif
+  return ((UInt64)d << 32) | a;
+  // return a;
+
+#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)
+
+  UInt32 a, d;
+  __asm {
+    push eax
+    push edx
+    push ecx
+    mov ecx, num;
+    // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK
+    _emit 0x0f
+    _emit 0x01
+    _emit 0xd0
+    mov a, eax
+    mov d, edx
+    pop ecx
+    pop edx
+    pop eax
+  }
+  return ((UInt64)d << 32) | a;
+  // return a;
+
+#else // it's unknown compiler
+  // #error "Need xgetbv function"
+  UNUSED_VAR(num)
+  // for MSVC-X64 we could call an external function from an external file.
+  /* Actually we had checked OSXSAVE/AVX in cpuid before.
+     So it's expected that the OS supports at least AVX and below. */
+  // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0
+  return
+    // (1 << 0) |  // x87
+       (1 << 1)    // SSE
+     | (1 << 2);   // AVX
+
+#endif
+}
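
* Editor's note (illustrative sketch, not part of the upstream patch):
* decoding the XCR0 mask that x86_xgetbv_0() returns for
* MY_XCR_XFEATURE_ENABLED_MASK:
*
*     const UInt64 xcr0 = x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
*     const BoolInt os_saves_sse = (BoolInt)(xcr0 >> 1) & 1;  // XMM state
*     const BoolInt os_saves_avx = (BoolInt)(xcr0 >> 2) & 1;  // YMM upper halves
*
* Both bits must be set before AVX code can be executed safely; that is the
* check performed by CPU_IsSupported_AVX() below.
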
+#ifdef _WIN32
+/*
+  Windows versions do not know about new ISA extensions that
+  can be introduced. But we still can use new extensions,
+  even if Windows doesn't report that it supports them.
+  We can use new extensions only if Windows knows about the new ISA extensions
+  that change the number or size of registers: SSE, AVX/XSAVE, AVX512.
+  So it's enough to check
+    MY_PF_AVX_INSTRUCTIONS_AVAILABLE
+      instead of
+    MY_PF_AVX2_INSTRUCTIONS_AVAILABLE
+*/
+#define MY_PF_XSAVE_ENABLED 17
+// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE   36
+// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE  37
+// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE  38
+// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE     39
+// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE    40
+// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41
+#endif
+
+BoolInt CPU_IsSupported_AVX(void)
+{
   #ifdef _WIN32
-  #define MY__PF_XSAVE_ENABLED 17
-  if (!IsProcessorFeaturePresent(MY__PF_XSAVE_ENABLED))
+  if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))
+    return False;
+  /* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from
+     some latest Win10 revisions. But we need AVX in older Windows also.
+     So we don't use the following check: */
+  /*
+  if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))
     return False;
+  */
   #endif
 
-  if (!x86cpuid_CheckAndRead(&p))
+  /*
+    The OS must use the new special XSAVE/XRSTOR instructions to save
+    AVX registers when that is required for context switching.
+    At OS startup:
+    the OS sets the CR4.OSXSAVE flag to signal the processor that the OS supports the XSAVE extensions.
+    Also the OS sets a bitmask in the XCR0 register that defines what
+    registers will be processed by the XSAVE instruction:
+      XCR0.x87[bit 0] - x87 registers and state
+      XCR0.SSE[bit 1] - SSE registers and state
+      XCR0.AVX[bit 2] - AVX registers and state
+    CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27].
+    So we can read that bit in user-space.
+    XCR0 is available for reading in user-space by the new XGETBV instruction.
+  */
+  {
+    const UInt32 c = x86cpuid_Func_1_ECX();
+    if (0 == (1
+        & (c >> 28)   // AVX instructions are supported by hardware
+        & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS.
     return False;
+  }
+
+  /* also we can check
+     CPUID.1:ECX.XSAVE [bit 26] : that shows that
+     XSAVE, XRESTOR, XSETBV, XGETBV instructions are supported by hardware.
+     But that check is redundant, because if the OSXSAVE bit is set, then XSAVE is also set */
+
+  /* If the OS has enabled the XSAVE extension instructions (OSXSAVE == 1),
+     in most cases we expect that the OS also supports storing/restoring
+     for AVX and SSE states at least.
+     But to be sure of that, we call the user-space instruction
+     XGETBV(0) to get the XCR0 value that contains a bitmask that defines
+     what exact states (registers) the OS has enabled for storing/restoring.
+  */
+
+  {
+    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
+    // printf("\n=== XGetBV=0x%x\n", bm);
+    return 1
+        & (BoolInt)(bm >> 1)  // SSE state is supported (set by OS) for storing/restoring
+        & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring
+  }
+  // since Win7SP1: we can use GetEnabledXStateFeatures();
+}
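
* Editor's note (usage sketch, not part of the upstream patch; Crc_Avx2 and
* Crc_Scalar are hypothetical names): callers typically query once and cache
* the result before choosing a code path:
*
*     static int g_avx2_state = -1;  // -1: unknown, 0: no, 1: yes
*     if (g_avx2_state < 0)
*       g_avx2_state = (int)CPU_IsSupported_AVX2();
*     if (g_avx2_state)
*       Crc_Avx2(buf, size);
*     else
*       Crc_Scalar(buf, size);
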
+
+
+BoolInt CPU_IsSupported_AVX2(void)
+{
+  if (!CPU_IsSupported_AVX())
     return False;
-  if (p.maxFunc < 7)
+  if (z7_x86_cpuid_GetMaxFunc() < 7)
     return False;
   {
-    UInt32 d[4] = { 0 };
-    MyCPUID(7, &d[0], &d[1], &d[2], &d[3]);
+    UInt32 d[4];
+    z7_x86_cpuid(d, 7);
     // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
     return 1
-      & (d[1] >> 5); // avx2
+      & (BoolInt)(d[1] >> 5); // avx2
   }
 }
 
-BoolInt CPU_IsSupported_VAES_AVX2()
+
+#if 0
+BoolInt CPU_IsSupported_AVX512F_AVX512VL(void)
 {
-  Cx86cpuid p;
-  CHECK_SYS_SSE_SUPPORT
-
-  #ifdef _WIN32
-  #define MY__PF_XSAVE_ENABLED 17
-  if (!IsProcessorFeaturePresent(MY__PF_XSAVE_ENABLED))
+  if (!CPU_IsSupported_AVX())
     return False;
-  #endif
+  if (z7_x86_cpuid_GetMaxFunc() < 7)
+    return False;
+  {
+    UInt32 d[4];
+    BoolInt v;
+    z7_x86_cpuid(d, 7);
+    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
+    v = 1
+      & (BoolInt)(d[1] >> 16)  // avx512f
+      & (BoolInt)(d[1] >> 31); // avx512vl
+    if (!v)
+      return False;
+  }
+  {
+    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
+    // printf("\n=== XGetBV=0x%x\n", bm);
+    return 1
+      & (BoolInt)(bm >> 5)  // OPMASK
+      & (BoolInt)(bm >> 6)  // ZMM upper 256-bit
+      & (BoolInt)(bm >> 7); // ZMM16 ... ZMM31
+  }
+}
+#endif
 
-  if (!x86cpuid_CheckAndRead(&p))
+BoolInt CPU_IsSupported_VAES_AVX2(void)
+{
+  if (!CPU_IsSupported_AVX())
     return False;
-  if (p.maxFunc < 7)
+  if (z7_x86_cpuid_GetMaxFunc() < 7)
     return False;
   {
-    UInt32 d[4] = { 0 };
-    MyCPUID(7, &d[0], &d[1], &d[2], &d[3]);
+    UInt32 d[4];
+    z7_x86_cpuid(d, 7);
     // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
     return 1
-      & (d[1] >> 5) // avx2
+      & (BoolInt)(d[1] >> 5) // avx2
       // & (d[1] >> 31) // avx512vl
-      & (d[2] >> 9); // vaes // VEX-256/EVEX
+      & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX
   }
 }
 
-BoolInt CPU_IsSupported_PageGB()
+BoolInt CPU_IsSupported_PageGB(void)
 {
-  Cx86cpuid cpuid;
-  if (!x86cpuid_CheckAndRead(&cpuid))
-    return False;
+  CHECK_CPUID_IS_SUPPORTED
   {
-    UInt32 d[4] = { 0 };
-    MyCPUID(0x80000000, &d[0], &d[1], &d[2], &d[3]);
+    UInt32 d[4];
+    z7_x86_cpuid(d, 0x80000000);
     if (d[0] < 0x80000001)
       return False;
-  }
-  {
-    UInt32 d[4] = { 0 };
-    MyCPUID(0x80000001, &d[0], &d[1], &d[2], &d[3]);
-    return (d[3] >> 26) & 1;
+    z7_x86_cpuid(d, 0x80000001);
+    return (BoolInt)(d[3] >> 26) & 1;
   }
 }
 
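
* Editor's note (illustrative sketch, not part of the upstream patch; assumes
* <string.h> for memcpy): CPU_IsSupported_PageGB() shows the general pattern
* for extended leaves: 0x80000000.EAX reports the maximum extended leaf, which
* must be validated before any higher leaf is read. The same pattern fetches
* the processor brand string from leaves 0x80000002..0x80000004:
*
*     UInt32 w[12], d[4];
*     char brand[49];
*     z7_x86_cpuid(d, 0x80000000);
*     if (d[0] >= 0x80000004)
*     {
*       unsigned i;
*       for (i = 0; i < 3; i++)
*         z7_x86_cpuid(w + 4 * i, 0x80000002 + i);
*       memcpy(brand, w, 48);
*       brand[48] = 0;  // NUL-terminated brand string
*     }
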
@@ -351,11 +782,11 @@ BoolInt CPU_IsSupported_PageGB()
 
 #ifdef _WIN32
 
-#include <Windows.h>
+#include "7zWindows.h"
 
-BoolInt CPU_IsSupported_CRC32() { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
-BoolInt CPU_IsSupported_CRYPTO() { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
-BoolInt CPU_IsSupported_NEON() { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
+BoolInt CPU_IsSupported_CRC32(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
+BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
+BoolInt CPU_IsSupported_NEON(void) { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
 
 #else
 
@@ -378,29 +809,40 @@ static void Print_sysctlbyname(const cha
     }
   }
 }
 */
+/*
+  Print_sysctlbyname("hw.pagesize");
+  Print_sysctlbyname("machdep.cpu.brand_string");
+*/
 
-static BoolInt My_sysctlbyname_Get_BoolInt(const char *name)
+static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name)
 {
   UInt32 val = 0;
-  if (My_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
+  if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
    return 1;
   return 0;
 }
 
-  /*
-  Print_sysctlbyname("hw.pagesize");
-  Print_sysctlbyname("machdep.cpu.brand_string");
-  */
-
 BoolInt CPU_IsSupported_CRC32(void)
 {
-  return My_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
+  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
 }
 
 BoolInt CPU_IsSupported_NEON(void)
 {
-  return My_sysctlbyname_Get_BoolInt("hw.optional.neon");
+  return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
+}
+
+BoolInt CPU_IsSupported_SHA512(void)
+{
+  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
+}
+
+/*
+BoolInt CPU_IsSupported_SHA3(void)
+{
+  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
 }
+*/
 
 #ifdef MY_CPU_ARM64
 #define APPLE_CRYPTO_SUPPORT_VAL 1
@@ -412,38 +854,101 @@ BoolInt CPU_IsSupported_SHA1(void) { ret
 BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
 BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }
 
+#elif defined(__OpenBSD__)
+
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#include <machine/armreg.h>
+
+BoolInt CPU_IsSupported_NEON() { return 1; }
+
+#define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
+  BoolInt CPU_IsSupported_ ## name1() { \
+    int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; \
+    size_t len = sizeof(uint64_t); \
+    uint64_t cpu_id = 0; \
+    if (sysctl(isar0_mib, 2, &cpu_id, &len, NULL, 0) < 0) \
+      return 0; \
+    if (ID_AA64ISAR0_ ## name2(cpu_id) >= ID_AA64ISAR0_## name2 ##_BASE) \
+      return 1; \
+    return 0; \
+  }
+
+#define MY_HWCAP_CHECK_FUNC(name) \
+  MY_HWCAP_CHECK_FUNC_2(name, name)
+
+MY_HWCAP_CHECK_FUNC (CRC32)
+MY_HWCAP_CHECK_FUNC (SHA1)
+MY_HWCAP_CHECK_FUNC (SHA2)
+MY_HWCAP_CHECK_FUNC (AES)
 
 #else // __APPLE__
 
-#include <sys/auxv.h>
+#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216)
  #define Z7_GETAUXV_AVAILABLE
+#else
+// #pragma message("=== is not NEW GLIBC === ")
+  #if defined __has_include
+  #if __has_include (<sys/auxv.h>)
+// #pragma message("=== sys/auxv.h is avail=== ")
+    #define Z7_GETAUXV_AVAILABLE
+  #endif
+  #endif
+#endif
 
-#if !defined(ARMV8_OS_FUCHSIA)
+#ifdef Z7_GETAUXV_AVAILABLE
+// #pragma message("=== Z7_GETAUXV_AVAILABLE === ")
+#include <sys/auxv.h>
 #define USE_HWCAP
-#endif // !defined(ARMV8_OS_FUCHSIA)
+#endif
 
 #ifdef USE_HWCAP
 
+#if defined(__FreeBSD__)
+static unsigned long MY_getauxval(int aux)
+{
+  unsigned long val;
+  if (elf_aux_info(aux, &val, sizeof(val)))
+    return 0;
+  return val;
+}
+#else
+#define MY_getauxval  getauxval
+  #if defined __has_include
+  #if __has_include (<asm/hwcap.h>)
 #include <asm/hwcap.h>
+  #endif
+  #endif
+#endif
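
* Editor's note (illustrative, not part of the upstream patch): on arm64
* Linux, the MY_HWCAP_CHECK_FUNC(CRC32) invocation further below expands,
* via the macro defined next, to roughly:
*
*     BoolInt CPU_IsSupported_CRC32(void)
*     {
*       return (MY_getauxval(AT_HWCAP) & (HWCAP_CRC32));
*     }
*
* Note that the result is truthy (the raw HWCAP bit) rather than a strict
* 0/1; callers treat BoolInt as a flag, so that is sufficient.
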
 
 #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
-  BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name2)) ? 1 : 0; }
+  BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); }
 
 #ifdef MY_CPU_ARM64
 #define MY_HWCAP_CHECK_FUNC(name) \
   MY_HWCAP_CHECK_FUNC_2(name, name)
+#if 1 || defined(__ARM_NEON)
+  BoolInt CPU_IsSupported_NEON(void) { return True; }
+#else
 MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)
+#endif
 // MY_HWCAP_CHECK_FUNC (ASIMD)
 #elif defined(MY_CPU_ARM)
 #define MY_HWCAP_CHECK_FUNC(name) \
-  BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; }
+  BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); }
 MY_HWCAP_CHECK_FUNC_2(NEON, NEON)
 #endif
 
 #else // USE_HWCAP
 
 #define MY_HWCAP_CHECK_FUNC(name) \
-  BoolInt CPU_IsSupported_ ## name() { return 0; }
+  BoolInt CPU_IsSupported_ ## name(void) { return 0; }
+#if defined(__ARM_NEON)
+  BoolInt CPU_IsSupported_NEON(void) { return True; }
+#else
 MY_HWCAP_CHECK_FUNC(NEON)
+#endif
 
 #endif // USE_HWCAP
 
@@ -451,6 +956,19 @@ MY_HWCAP_CHECK_FUNC (CRC32)
 MY_HWCAP_CHECK_FUNC (SHA1)
 MY_HWCAP_CHECK_FUNC (SHA2)
 MY_HWCAP_CHECK_FUNC (AES)
 
+#ifdef MY_CPU_ARM64
+// <asm/hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.
+// we define them here, if they are not defined
+#ifndef HWCAP_SHA3
+// #define HWCAP_SHA3 (1 << 17)
+#endif
+#ifndef HWCAP_SHA512
+// #pragma message("=== HWCAP_SHA512 define === ")
+#define HWCAP_SHA512 (1 << 21)
+#endif
+MY_HWCAP_CHECK_FUNC (SHA512)
+// MY_HWCAP_CHECK_FUNC (SHA3)
+#endif
 
 #endif // __APPLE__
 #endif // _WIN32
 
@@ -463,15 +981,15 @@ MY_HWCAP_CHECK_FUNC (AES)
 
 #include <sys/sysctl.h>
 
-int My_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
+int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
 {
   return sysctlbyname(name, buf, bufSize, NULL, 0);
 }
 
-int My_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
+int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
 {
   size_t bufSize = sizeof(*val);
-  int res = My_sysctlbyname_Get(name, val, &bufSize);
+  const int res = z7_sysctlbyname_Get(name, val, &bufSize);
   if (res == 0 && bufSize != sizeof(*val))
     return EFAULT;
   return res;
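
* Editor's note (usage sketch, not part of the upstream patch): reading an
* integer sysctl on Apple platforms with the helper above:
*
*     UInt32 ncpu = 0;
*     if (z7_sysctlbyname_Get_UInt32("hw.ncpu", &ncpu) == 0)
*     {
*       // ncpu now holds the logical CPU count
*     }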