/*
 * x86/crc32_impl.h - x86 implementations of the gzip CRC-32 algorithm
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef LIB_X86_CRC32_IMPL_H
#define LIB_X86_CRC32_IMPL_H

#include "cpu_features.h"

/*
 * PCLMUL implementation.  The function body is not written here; it comes
 * from including crc32_pclmul_template.h after defining the parameter macros
 * SUFFIX, ATTRIBUTES, and FOLD_PARTIAL_VECS.
 *
 * NOTE(review): the same parameter macros are redefined further down without
 * an intervening #undef in this file, so the template header presumably
 * #undefs them itself -- confirm against crc32_pclmul_template.h.
 */
#if HAVE_PCLMUL_INTRIN
/*
 * Define the function name as a macro expanding to itself, so that the
 * dispatch code later in this header can test for this implementation with
 * #ifdef.
 */
#  define crc32_x86_pclmul	crc32_x86_pclmul
/* Presumably appended by the template to form the generated function name. */
#  define SUFFIX			 _pclmul
/*
 * When PCLMUL is already enabled for the whole compilation, no attribute is
 * requested; otherwise a "pclmul" target attribute is passed to the template.
 */
#  if HAVE_PCLMUL_NATIVE
#    define ATTRIBUTES
#  else
#    define ATTRIBUTES		_target_attribute("pclmul")
#  endif
/*
 * 0 here, 1 in the AVX variant.  Presumably selects whether the template uses
 * the SSSE3/SSE4.1-based handling of partial blocks -- TODO confirm in the
 * template header.
 */
#  define FOLD_PARTIAL_VECS	0
#  include "crc32_pclmul_template.h"
#endif

/*
 * PCLMUL/AVX implementation.  This implementation has two benefits over the
 * regular PCLMUL one.  First, simply compiling against the AVX target can
 * improve performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake)
 * without actually using any AVX intrinsics, probably due to the availability
 * of non-destructive VEX-encoded instructions.  Second, AVX support implies
 * SSSE3 and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for
 * efficient handling of partial blocks.  (We *could* compile a variant with
 * PCLMUL+SSSE3+SSE4.1 w/o AVX, but for simplicity we don't currently bother.)
 *
 * FIXME: with MSVC, this isn't actually compiled with AVX code generation
 * enabled yet.  That would require that this be moved to its own .c file.
 */
#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN
/*
 * Define the function name as a macro expanding to itself, so that the
 * dispatch code later in this header can test for this implementation with
 * #if defined() / #ifdef.
 */
#  define crc32_x86_pclmul_avx	crc32_x86_pclmul_avx
/* Presumably appended by the template to form the generated function name. */
#  define SUFFIX			 _pclmul_avx
/*
 * Only when both PCLMUL and AVX are already enabled for the whole compilation
 * is the attribute left empty; otherwise a "pclmul,avx" target attribute is
 * passed to the template.
 */
#  if HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE
#    define ATTRIBUTES
#  else
#    define ATTRIBUTES		_target_attribute("pclmul,avx")
#  endif
/*
 * 1 here, 0 in the plain PCLMUL variant.  Presumably turns on the
 * SSSE3/SSE4.1-based handling of partial blocks that AVX support makes
 * available -- TODO confirm in crc32_pclmul_template.h.
 */
#  define FOLD_PARTIAL_VECS	1
#  include "crc32_pclmul_template.h"
#endif

/*
 * If the best implementation is statically available, use it unconditionally.
 * Otherwise choose the best implementation at runtime.
 */
#if defined(crc32_x86_pclmul_avx) && HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE
#define DEFAULT_IMPL	crc32_x86_pclmul_avx
#else
/*
 * Pick a CRC-32 implementation using the CPU features reported at runtime.
 *
 * The candidates compiled into this header are visited from least preferred
 * to most preferred, and each usable candidate overrides the previous choice.
 * Returns NULL if no candidate is usable on the current CPU.
 */
static inline crc32_func_t
arch_select_crc32_func(void)
{
	const u32 cpu_caps MAYBE_UNUSED = get_x86_cpu_features();
	crc32_func_t best = NULL;

#ifdef crc32_x86_pclmul
	/* Baseline: carryless multiplication only. */
	if (HAVE_PCLMUL(cpu_caps))
		best = crc32_x86_pclmul;
#endif
#ifdef crc32_x86_pclmul_avx
	/* Preferred: carryless multiplication together with AVX. */
	if (HAVE_PCLMUL(cpu_caps) && HAVE_AVX(cpu_caps))
		best = crc32_x86_pclmul_avx;
#endif
	return best;
}
#define arch_select_crc32_func	arch_select_crc32_func
#endif

#endif /* LIB_X86_CRC32_IMPL_H */