/*
 * adler32_vec_template.h - template for vectorized Adler-32 implementations
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This file contains a template for vectorized Adler-32 implementations.
 *
 * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
 * implementation looks something like this:
 *
 *	do {
 *		s1 += *p;
 *		s2 += s1;
 *	} while (++p != chunk_end);
 *
 * For vectorized calculation of s1, we only need to sum the input bytes. They
 * can be accumulated into multiple counters which are eventually summed
 * together.
 *
 * For vectorized calculation of s2, the basic idea is that for each iteration
 * that processes N bytes, we can perform the following vectorizable
 * calculation:
 *
 *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
 *
 * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
 * separate counters, then do the multiplications by N...1 just once at the
 * end rather than once per iteration.
 *
 * Also, we must account for how previous bytes will affect s2 by doing the
 * following at the beginning of each iteration:
 *
 *	s2 += s1 * N
 *
 * Furthermore, like s1, "s2" can actually be multiple counters which are
 * eventually summed together.
 */
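/*
 * As a concrete illustration of the decomposition described above, the
 * following is a minimal scalar model of one chunk processed with N = 4,
 * starting from running values of s1 and s2. This sketch is hypothetical;
 * a real FUNCNAME_CHUNK keeps these counters in vector registers rather
 * than arrays:
 *
 *	u32 byte_sums[4] = { 0, 0, 0, 0 };
 *
 *	for (p = chunk_start; p != chunk_end; p += 4) {
 *		s2 += s1 * 4;			// previous bytes' effect on s2
 *		for (int i = 0; i < 4; i++) {
 *			s1 += p[i];		// running byte sum
 *			byte_sums[i] += p[i];	// per-lane byte sums
 *		}
 *	}
 *	// Apply the weights N...1 just once, after the loop
 *	s2 += 4*byte_sums[0] + 3*byte_sums[1]
 *	    + 2*byte_sums[2] + 1*byte_sums[3];
 *
 * Note that the template below hoists the contribution of the chunk's
 * initial s1 out of the per-iteration update: it adds 's2 += s1 * chunk_size'
 * once before calling FUNCNAME_CHUNK, so the chunk function itself only
 * needs to account for bytes within the chunk.
 */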
static u32 ATTRIBUTES
FUNCNAME(u32 adler, const u8 *p, size_t size)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;
	const u8 * const end = p + size;
	const u8 *vend;
	const size_t max_chunk_size =
		MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
		(MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
		 IMPL_SEGMENT_SIZE);

	/* Process a byte at a time until the needed alignment is reached */
	if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/*
	 * Process "chunks" of bytes using vector instructions. Chunk sizes
	 * are limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2
	 * never overflow before being reduced modulo DIVISOR. For vector
	 * processing, chunk sizes are also made evenly divisible by
	 * IMPL_SEGMENT_SIZE and may be further limited to
	 * IMPL_MAX_CHUNK_SIZE.
	 */
	STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
	vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
	while (p != vend) {
		size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);

		s2 += s1 * chunk_size;

		FUNCNAME_CHUNK((const void *)p,
			       (const void *)(p + chunk_size),
			       &s1, &s2);

		p += chunk_size;
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/* Process any remaining bytes */
	if (p != end) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	return (s2 << 16) | s1;
}

#undef FUNCNAME
#undef FUNCNAME_CHUNK
#undef ATTRIBUTES
#undef IMPL_ALIGNMENT
#undef IMPL_SEGMENT_SIZE
#undef IMPL_MAX_CHUNK_SIZE
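/*
 * For reference, a file that uses this template is expected to define the
 * macros consumed above and then #include this header. The sketch below is
 * hypothetical: the function name and numeric values are illustrative only,
 * not taken from any real implementation. Only the macro names and the chunk
 * function's signature, which must match the FUNCNAME_CHUNK call above, are
 * fixed by the template:
 *
 *	static void
 *	adler32_simd_chunk(const void *start, const void *end,
 *			   u32 *s1, u32 *s2)
 *	{
 *		// vectorized inner loop as described at the top of this file
 *	}
 *
 *	#define FUNCNAME		adler32_simd
 *	#define FUNCNAME_CHUNK		adler32_simd_chunk
 *	#define ATTRIBUTES		// e.g. a target attribute for the ISA
 *	#define IMPL_ALIGNMENT		16
 *	#define IMPL_SEGMENT_SIZE	32
 *	#define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
 *	#include "adler32_vec_template.h"
 */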