/*
 * adler32.c - Adler-32 checksum algorithm
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "lib_common.h"

/* The Adler-32 divisor, or "base", value */
#define DIVISOR 65521

/*
 * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility
 * of s2 overflowing when it is represented as an unsigned 32-bit integer.  This
 * value was computed using the following Python script:
 *
 *	divisor = 65521
 *	count = 0
 *	s1 = divisor - 1
 *	s2 = divisor - 1
 *	while True:
 *		s1 += 0xFF
 *		s2 += s1
 *		if s2 > 0xFFFFFFFF:
 *			break
 *		count += 1
 *	print(count)
 *
 * Note that to get the correct worst-case value, we must assume that every byte
 * has value 0xFF and that s1 and s2 started with the highest possible values
 * modulo the divisor.
 */
#define MAX_CHUNK_LEN	5552

/*
 * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
 * update n to 0, and reduce s1 and s2 mod DIVISOR.  It is assumed that neither
 * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
 * already processed after the last reduction must not exceed MAX_CHUNK_LEN.
 *
 * This uses only portable C code.  This is used as a fallback when a vectorized
 * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
 *
 * Some of the vectorized implementations also use this to handle the end of the
 * data when the data isn't evenly divisible by the length the vectorized code
 * works on.  To avoid compiler errors about target-specific option mismatches
 * when this is used in that way, this is a macro rather than a function.
 *
 * Although this is unvectorized, this does include an optimization where the
 * main loop processes four bytes at a time using a strategy similar to that
 * used by vectorized implementations.  This provides increased instruction-
 * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
 */
#define ADLER32_CHUNK(s1, s2, p, n)					\
do {									\
	if (n >= 4) {							\
		u32 s1_sum = 0;						\
		u32 byte_0_sum = 0;					\
		u32 byte_1_sum = 0;					\
		u32 byte_2_sum = 0;					\
		u32 byte_3_sum = 0;					\
									\
		do {							\
			s1_sum += s1;					\
			s1 += p[0] + p[1] + p[2] + p[3];		\
			byte_0_sum += p[0];				\
			byte_1_sum += p[1];				\
			byte_2_sum += p[2];				\
			byte_3_sum += p[3];				\
			p += 4;						\
			n -= 4;						\
		} while (n >= 4);					\
		s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) +	\
		      (2 * byte_2_sum) + byte_3_sum;			\
	}								\
	for (; n; n--, p++) {						\
		s1 += *p;						\
		s2 += s1;						\
	}								\
	s1 %= DIVISOR;							\
	s2 %= DIVISOR;							\
} while (0)

static u32 MAYBE_UNUSED
adler32_generic(u32 adler, const u8 *p, size_t len)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;

	while (len) {
		size_t n = MIN(len, MAX_CHUNK_LEN & ~3);

		len -= n;
		ADLER32_CHUNK(s1, s2, p, n);
	}

	return (s2 << 16) | s1;
}

/* Include architecture-specific implementation(s) if available. */
#undef DEFAULT_IMPL
#undef arch_select_adler32_func
typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
#if defined(ARCH_ARM32) || defined(ARCH_ARM64)
#  include "arm/adler32_impl.h"
#elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
#  include "x86/adler32_impl.h"
#endif

#ifndef DEFAULT_IMPL
#  define DEFAULT_IMPL adler32_generic
#endif

#ifdef arch_select_adler32_func
static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len);

static volatile adler32_func_t adler32_impl = dispatch_adler32;

/* Choose the best implementation at runtime. */
static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
{
	adler32_func_t f = arch_select_adler32_func();

	if (f == NULL)
		f = DEFAULT_IMPL;

	adler32_impl = f;
	return f(adler, p, len);
}
#else
/* The best implementation is statically known, so call it directly. */
#define adler32_impl DEFAULT_IMPL
#endif

LIBDEFLATEAPI u32
libdeflate_adler32(u32 adler, const void *buffer, size_t len)
{
	if (buffer == NULL) /* Return initial value. */
		return 1;
	return adler32_impl(adler, buffer, len);
}