/*
 *  This file is part of the optimized implementation of the Picnic signature scheme.
 *  See the accompanying documentation for complete details.
 *
 *  The code is provided under the MIT license, see LICENSE for
 *  more details.
 *  SPDX-License-Identifier: MIT
 */

/* Inspired by m4ri's mzd implementation, but completely re-written for our use-case. */

#ifndef MZD_ADDITIONAL_H
#define MZD_ADDITIONAL_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "macros.h"
#include "compat.h"

PICNIC_BEGIN_C_DECL

typedef uint64_t word;
#define WORD_C(v) UINT64_C(v)

typedef ATTR_ALIGNED(32) struct {
  word w64[4];
} block_t;

/**
 * Representation of matrices and vectors
 *
 * The basic memory unit is a block of 256 bits. Each row is stored in (possibly multiple) blocks,
 * depending on the number of columns. Matrices with up to 128 columns are the only exception. In
 * this case, a block actually contains two rows: the row with even index is contained in w64[0]
 * and w64[1], the row with odd index is contained in w64[2] and w64[3].
 */
typedef block_t mzd_local_t;

mzd_local_t* mzd_local_init_ex(unsigned int r, unsigned int c, bool clear) ATTR_ASSUME_ALIGNED(32);
#define mzd_local_init(r, c) mzd_local_init_ex(r, c, true)

static inline void mzd_local_free(mzd_local_t* v) {
  picnic_aligned_free(v);
}

void mzd_copy_uint64_128(mzd_local_t* dst, mzd_local_t const* src) ATTR_NONNULL;
void mzd_copy_uint64_192(mzd_local_t* dst, mzd_local_t const* src) ATTR_NONNULL;
void mzd_copy_uint64_256(mzd_local_t* dst, mzd_local_t const* src) ATTR_NONNULL;
void mzd_copy_s128_128(mzd_local_t* dst, mzd_local_t const* src) ATTR_NONNULL;
void mzd_copy_s128_256(mzd_local_t* dst, mzd_local_t const* src) ATTR_NONNULL;
void mzd_copy_s256_128(mzd_local_t* dst, mzd_local_t const* src) ATTR_NONNULL;
void mzd_copy_s256_256(mzd_local_t* dst, mzd_local_t const* src) ATTR_NONNULL;

/**
 * mzd_xor variants
 */
void mzd_xor_uint64_128(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_192(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_256(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_640(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_960(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_1216(mzd_local_t* res, mzd_local_t const* first,
                         mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_128(mzd_local_t* res, mzd_local_t const* first,
                      mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_256(mzd_local_t* res, mzd_local_t const* first,
                      mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_640(mzd_local_t* res, mzd_local_t const* first,
                      mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1024(mzd_local_t* res, mzd_local_t const* first,
                       mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1280(mzd_local_t* res, mzd_local_t const* first,
                       mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s256_128(mzd_local_t* res, mzd_local_t const* first,
                      mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s256_256(mzd_local_t* res, mzd_local_t const* first,
                      mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s256_768(mzd_local_t* res, mzd_local_t const* first,
                      mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s256_1024(mzd_local_t* res, mzd_local_t const* first,
                       mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s256_1280(mzd_local_t* res, mzd_local_t const* first,
                       mzd_local_t const* second) ATTR_NONNULL;
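/*
 * Example (illustrative usage sketch, not part of the API): XOR two 128-bit
 * vectors. With up to 128 columns, a one-row vector occupies w64[0] and
 * w64[1] of a single block_t. The caller is assumed to fill a and b.
 *
 *   mzd_local_t* a = mzd_local_init(1, 128);
 *   mzd_local_t* b = mzd_local_init(1, 128);
 *   mzd_local_t* r = mzd_local_init(1, 128);
 *   // ... fill a and b ...
 *   mzd_xor_uint64_128(r, a, b); // r = a ^ b, i.e. vector addition over GF(2)
 *   mzd_local_free(r);
 *   mzd_local_free(b);
 *   mzd_local_free(a);
 */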
/**
 * mzd_and variants
 */
void mzd_and_uint64_128(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;
void mzd_and_uint64_192(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;
void mzd_and_uint64_256(mzd_local_t* res, mzd_local_t const* first,
                        mzd_local_t const* second) ATTR_NONNULL;

/**
 * shifts and rotations
 */
void mzd_shift_left_uint64_128(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_shift_right_uint64_128(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_shift_left_uint64_192(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_shift_right_uint64_192(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_shift_left_uint64_256(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_shift_right_uint64_256(mzd_local_t* res, const mzd_local_t* val, unsigned int count);

#if defined(PICNIC_STATIC)
/* only needed for tests */
void mzd_rotate_left_uint64_128(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_rotate_right_uint64_128(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_rotate_left_uint64_192(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_rotate_right_uint64_192(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_rotate_left_uint64_256(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
void mzd_rotate_right_uint64_256(mzd_local_t* res, const mzd_local_t* val, unsigned int count);
#endif

/**
 * Compute v * A optimized for v being a vector.
 */
void mzd_mul_v_uint64_128(mzd_local_t* c, mzd_local_t const* v,
                          mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_129(mzd_local_t* c, mzd_local_t const* v,
                          mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192(mzd_local_t* c, mzd_local_t const* v,
                          mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256(mzd_local_t* c, mzd_local_t const* v,
                          mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_128_640(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192_960(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256_1216(mzd_local_t* c, mzd_local_t const* v,
                               mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_s128_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_129(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_128_640(mzd_local_t* c, mzd_local_t const* v,
                            mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_192_1024(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_256_1280(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s256_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s256_129(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s256_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s256_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s256_128_768(mzd_local_t* c, mzd_local_t const* v,
                            mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_s256_192_1024(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_s256_256_1280(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* At) ATTR_NONNULL;
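/*
 * Example (illustrative usage sketch; the 128x128 dimensions are an
 * assumption for illustration): multiply a one-row vector with a matrix
 * using the plain uint64 implementation. The s128/s256 variants behave the
 * same on platforms with SSE2/NEON or AVX2, respectively.
 *
 *   mzd_local_t* A = mzd_local_init(128, 128);
 *   mzd_local_t* v = mzd_local_init(1, 128);
 *   mzd_local_t* c = mzd_local_init(1, 128);
 *   // ... fill A and v ...
 *   mzd_mul_v_uint64_128(c, v, A); // c = v * A over GF(2)
 */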
/**
 * Compute c + v * A optimized for v being a vector, for specific sizes depending on the instance.
 * Only works for specific sizes and the RLL_NEXT algorithm, using uint64 operations.
 */
void mzd_addmul_v_uint64_30_128(mzd_local_t* c, mzd_local_t const* v,
                                mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_30_192(mzd_local_t* c, mzd_local_t const* v,
                                mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_30_256(mzd_local_t* c, mzd_local_t const* v,
                                mzd_local_t const* A) ATTR_NONNULL;
/**
 * Use SSE2 or NEON
 */
void mzd_addmul_v_s128_30_128(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_30_192(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_30_256(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* A) ATTR_NONNULL;
/**
 * Use AVX2
 */
void mzd_addmul_v_s256_30_128(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_30_192(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_30_256(mzd_local_t* c, mzd_local_t const* v,
                              mzd_local_t const* A) ATTR_NONNULL;

/**
 * Compute v * A using the parity-based algorithm.
 */
void mzd_mul_v_parity_uint64_128_30(mzd_local_t* c, mzd_local_t const* v,
                                    mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_192_30(mzd_local_t* c, mzd_local_t const* v,
                                    mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_256_30(mzd_local_t* c, mzd_local_t const* v,
                                    mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_s256_256_30(mzd_local_t* c, mzd_local_t const* v,
                                  mzd_local_t const* A) ATTR_NONNULL;

/**
 * Compute c + v * A optimized for c and v being vectors.
 */
void mzd_addmul_v_uint64_128(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_129(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_192(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_256(mzd_local_t* c, mzd_local_t const* v,
                             mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_128(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_129(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_192(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_256(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_128(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_129(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_192(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_256(mzd_local_t* c, mzd_local_t const* v,
                           mzd_local_t const* A) ATTR_NONNULL;
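/*
 * Example (illustrative usage sketch; assumes c, v, and A were set up as in
 * the mzd_mul_v example above): accumulate a product into an existing vector.
 *
 *   // c = c + (v * A) over GF(2); contrast with mzd_mul_v_*, which overwrites c
 *   mzd_addmul_v_uint64_128(c, v, A);
 */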
/**
 * Shuffle vector x according to info in mask. Needed for OLLE optimizations.
 */
void mzd_shuffle_128_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_192_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_256_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_128_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_192_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_256_30(mzd_local_t* x, const word mask) ATTR_NONNULL;

#define BLOCK(v, b) ((block_t*)ASSUME_ALIGNED(&(v)[(b)], 32))
#define CONST_BLOCK(v, b) ((const block_t*)ASSUME_ALIGNED(&(v)[(b)], 32))

PICNIC_END_C_DECL

#endif
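/*
 * Example (illustrative usage sketch for the block accessors; assumes v was
 * allocated with mzd_local_init and is therefore 32-byte aligned):
 *
 *   mzd_local_t* v = mzd_local_init(1, 256);  // one row spanning one block
 *   block_t* blk   = BLOCK(v, 0);             // aligned pointer to block 0
 *   blk->w64[0]   ^= WORD_C(1);               // flip the first bit of the row
 *   const block_t* cblk = CONST_BLOCK(v, 0);  // read-only access
 */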