#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* pull in the entire thing */
#include "sha512x4.h"
#include "utils.h"

/* Short aliases for the scalar and vector word types used in this file. */
typedef uint64_t u64;
typedef __m256i u256;

/*
 * Reverse the byte order inside each of the four 64-bit words of a 256-bit
 * vector. _mm256_set_epi8 takes its arguments from the most significant
 * byte down, so every row below is the shuffle mask for one 64-bit word.
 */
#define BYTESWAP(x) _mm256_shuffle_epi8(x, _mm256_set_epi8( \
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, \
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, \
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07))

/* Unaligned 256-bit store of src to the address dest. */
#define STORE(dest, src) _mm256_storeu_si256((__m256i *)(dest), src)

// In-place 4x4 transpose of 64-bit words held in four 256-bit vectors.
// Reading each vector left to right from its least significant word,
//     s[0] = A B C D          s[0] = A E I M
//     s[1] = E F G H   ==>    s[1] = B F J N
//     s[2] = I J K L          s[2] = C G K O
//     s[3] = M N O P          s[3] = D H L P
static void transpose(u256 s[4]) {
    // Step 1: interleave the 64-bit words of each row pair, separately in
    // the low and the high 128-bit half.
    const u256 even01 = _mm256_unpacklo_epi64(s[0], s[1]); // A E C G
    const u256 odd01  = _mm256_unpackhi_epi64(s[0], s[1]); // B F D H
    const u256 even23 = _mm256_unpacklo_epi64(s[2], s[3]); // I M K O
    const u256 odd23  = _mm256_unpackhi_epi64(s[2], s[3]); // J N L P

    // Step 2: glue matching 128-bit halves together; selector 0x20 takes
    // the two low halves, 0x31 the two high halves.
    s[0] = _mm256_permute2x128_si256(even01, even23, 0x20);
    s[1] = _mm256_permute2x128_si256(odd01, odd23, 0x20);
    s[2] = _mm256_permute2x128_si256(even01, even23, 0x31);
    s[3] = _mm256_permute2x128_si256(odd01, odd23, 0x31);
}

/*
 * Reset a 4-way context: every lane of the state gets the same eight
 * initial hash words, and the buffered-byte and message-bit counters are
 * cleared.
 */
void sha512_init4x(sha512x4ctx *ctx) {
    static const unsigned long long iv[8] = {
        0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
        0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
        0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
        0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
    };

    for (unsigned int k = 0; k < 8; k++) {
        // Broadcast the word into all four 64-bit lanes.
        ctx->s[k] = _mm256_set1_epi64x((long long)iv[k]);
    }

    ctx->datalen = 0;
    ctx->msglen = 0;
}

/* Short names for the lane-wise 256-bit operations used by the round code. */
#define XOR _mm256_xor_si256
#define OR _mm256_or_si256
#define AND _mm256_and_si256
#define ADD64 _mm256_add_epi64

/*
 * Unaligned 256-bit load from src. The cast targets a const-qualified
 * vector pointer so that loading from `const unsigned char *` inputs does
 * not discard the qualifier.
 */
#define LOAD(src) _mm256_loadu_si256((const __m256i *)(src))

/* Logical shifts of each 64-bit lane by y bits. */
#define SHIFTR64(x, y) _mm256_srli_epi64(x, y)
#define SHIFTL64(x, y) _mm256_slli_epi64(x, y)

/*
 * Rotate each 64-bit lane right by y bits. x is expanded twice, so pass
 * only expressions without side effects.
 */
#define ROTR64(x, y) OR(SHIFTR64(x, y), SHIFTL64(x, 64 - (y)))

/* Lane-wise XOR of three vectors. */
static u256 XOR3(u256 a, u256 b, u256 c) {
    const u256 ab = XOR(a, b);
    return XOR(ab, c);
}

/* Left-to-right 64-bit lane-wise sums of three, four and five vectors. */
#define ADD3_64(a, b, c) ADD64(ADD64(a, b), c)
#define ADD4_64(a, b, c, d) ADD64(ADD3_64(a, b, c), d)
#define ADD5_64(a, b, c, d, e) ADD64(ADD4_64(a, b, c, d), e)

/*
 * Bitwise majority of a, b and c: where a and b both differ from c the
 * result flips away from c, otherwise it stays equal to c.
 */
static u256 MAJ_AVX(u256 a, u256 b, u256 c) {
    const u256 a_differs = XOR(a, c);
    const u256 b_differs = XOR(b, c);
    return XOR(c, AND(a_differs, b_differs));
}
/* Bitwise choose: each result bit is taken from b where a is 1, else from c. */
static u256 CH_AVX(u256 a, u256 b, u256 c) {
    const u256 flip = AND(a, XOR(b, c));
    return XOR(c, flip);
}
/* XOR of x rotated right by 28, 34 and 39 bits (per 64-bit lane). */
static u256 SIGMA0_AVX(u256 x) {
    const u256 rot28 = ROTR64(x, 28);
    const u256 rot34 = ROTR64(x, 34);
    const u256 rot39 = ROTR64(x, 39);
    return XOR3(rot28, rot34, rot39);
}
/* XOR of x rotated right by 14, 18 and 41 bits (per 64-bit lane). */
static u256 SIGMA1_AVX(u256 x) {
    const u256 rot14 = ROTR64(x, 14);
    const u256 rot18 = ROTR64(x, 18);
    const u256 rot41 = ROTR64(x, 41);
    return XOR3(rot14, rot18, rot41);
}
/* XOR of x rotated right by 1 and 8 bits and x shifted right by 7 bits. */
static u256 GAMMA0_AVX(u256 x) {
    const u256 rot1 = ROTR64(x, 1);
    const u256 rot8 = ROTR64(x, 8);
    const u256 shr7 = SHIFTR64(x, 7);
    return XOR3(rot1, rot8, shr7);
}
/* XOR of x rotated right by 19 and 61 bits and x shifted right by 6 bits. */
static u256 GAMMA1_AVX(u256 x) {
    const u256 rot19 = ROTR64(x, 19);
    const u256 rot61 = ROTR64(x, 61);
    const u256 shr6 = SHIFTR64(x, 6);
    return XOR3(rot19, rot61, shr6);
}

/*
 * One compression round applied to all four lanes at once.
 *   a..h : the eight working variables in their current role order;
 *          d and h are overwritten, the others are only read
 *   rc   : index into RC[] of this round's constant
 *   w    : message-schedule word for this round
 * Uses the caller's T0 and T1 variables as scratch. The body is wrapped in
 * do { } while (0) so the macro acts as a single statement; invoke it as
 * SHA512ROUND_AVX(...);
 */
#define SHA512ROUND_AVX(a, b, c, d, e, f, g, h, rc, w) \
    do { \
        T0 = ADD5_64(h, w, SIGMA1_AVX(e), CH_AVX(e, f, g), \
                     _mm256_set1_epi64x((long long)RC[rc])); \
        T1 = ADD64(SIGMA0_AVX(a), MAJ_AVX(a, b, c)); \
        (d) = ADD64(d, T0); \
        (h) = ADD64(T0, T1); \
    } while (0)

/*
 * The 80 SHA-512 round constants (K[0..79] in FIPS 180-4), one per round.
 * SHA512ROUND_AVX indexes this table by round number and broadcasts the
 * selected value to all four lanes.
 */
static const unsigned long long RC[80] = {
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};

/*
 * Compress one 128-byte block from each of four independent messages into
 * the four-lane state ctx->s.
 *
 *   ctx    : state to update; only ctx->s is read and written here
 *   d0..d3 : one block per lane; 128 bytes are read from each using
 *            unaligned loads
 */
void sha512_transform4x(
    sha512x4ctx *ctx,
    const unsigned char *d0,
    const unsigned char *d1,
    const unsigned char *d2,
    const unsigned char *d3) {
    const unsigned char *const block[4] = {d0, d1, d2, d3};
    u256 s0, s1, s2, s3, s4, s5, s6, s7, w[16], T0, T1, nw;
    unsigned int lane, quad, base;

    // Load every block as four 32-byte vectors, converting the big-endian
    // message words to native order. Lane `lane` of group `quad` lands in
    // w[4 * quad + lane], i.e. each group of four vectors is row-major.
    for (lane = 0; lane < 4; lane++) {
        for (quad = 0; quad < 4; quad++) {
            w[lane + 4 * quad] = BYTESWAP(LOAD(block[lane] + 32 * quad));
        }
    }

    // Transpose each group so that w[t] holds message word t of all lanes.
    for (quad = 0; quad < 4; quad++) {
        transpose(w + 4 * quad);
    }

    // Working variables start from the current chaining value.
    s0 = ctx->s[0];
    s1 = ctx->s[1];
    s2 = ctx->s[2];
    s3 = ctx->s[3];
    s4 = ctx->s[4];
    s5 = ctx->s[5];
    s6 = ctx->s[6];
    s7 = ctx->s[7];

    // Rounds 0..15 consume the message words directly. Instead of moving
    // values between variables, the argument list is rotated by one
    // position every round.
    SHA512ROUND_AVX(s0, s1, s2, s3, s4, s5, s6, s7, 0, w[0]);
    SHA512ROUND_AVX(s7, s0, s1, s2, s3, s4, s5, s6, 1, w[1]);
    SHA512ROUND_AVX(s6, s7, s0, s1, s2, s3, s4, s5, 2, w[2]);
    SHA512ROUND_AVX(s5, s6, s7, s0, s1, s2, s3, s4, 3, w[3]);
    SHA512ROUND_AVX(s4, s5, s6, s7, s0, s1, s2, s3, 4, w[4]);
    SHA512ROUND_AVX(s3, s4, s5, s6, s7, s0, s1, s2, 5, w[5]);
    SHA512ROUND_AVX(s2, s3, s4, s5, s6, s7, s0, s1, 6, w[6]);
    SHA512ROUND_AVX(s1, s2, s3, s4, s5, s6, s7, s0, 7, w[7]);
    SHA512ROUND_AVX(s0, s1, s2, s3, s4, s5, s6, s7, 8, w[8]);
    SHA512ROUND_AVX(s7, s0, s1, s2, s3, s4, s5, s6, 9, w[9]);
    SHA512ROUND_AVX(s6, s7, s0, s1, s2, s3, s4, s5, 10, w[10]);
    SHA512ROUND_AVX(s5, s6, s7, s0, s1, s2, s3, s4, 11, w[11]);
    SHA512ROUND_AVX(s4, s5, s6, s7, s0, s1, s2, s3, 12, w[12]);
    SHA512ROUND_AVX(s3, s4, s5, s6, s7, s0, s1, s2, 13, w[13]);
    SHA512ROUND_AVX(s2, s3, s4, s5, s6, s7, s0, s1, 14, w[14]);
    SHA512ROUND_AVX(s1, s2, s3, s4, s5, s6, s7, s0, 15, w[15]);

/* w[] is a 16-entry ring buffer; M() reduces a schedule index modulo 16. */
#define M(i) (((i)+16) & 0xf)
/* Compute the next schedule word, store it over the oldest slot, yield it. */
#define NextW(i) \
    (w[M(i)] = ADD4_64(GAMMA1_AVX(w[M((i)-2)]), w[M((i)-7)], GAMMA0_AVX(w[M((i)-15)]), w[M((i)-16)]))

    // Rounds 16..79: each schedule word is derived from four earlier ones.
    for (base = 16; base < 80; base += 16) {
        nw = NextW(0);
        SHA512ROUND_AVX(s0, s1, s2, s3, s4, s5, s6, s7, base + 0, nw);
        nw = NextW(1);
        SHA512ROUND_AVX(s7, s0, s1, s2, s3, s4, s5, s6, base + 1, nw);
        nw = NextW(2);
        SHA512ROUND_AVX(s6, s7, s0, s1, s2, s3, s4, s5, base + 2, nw);
        nw = NextW(3);
        SHA512ROUND_AVX(s5, s6, s7, s0, s1, s2, s3, s4, base + 3, nw);
        nw = NextW(4);
        SHA512ROUND_AVX(s4, s5, s6, s7, s0, s1, s2, s3, base + 4, nw);
        nw = NextW(5);
        SHA512ROUND_AVX(s3, s4, s5, s6, s7, s0, s1, s2, base + 5, nw);
        nw = NextW(6);
        SHA512ROUND_AVX(s2, s3, s4, s5, s6, s7, s0, s1, base + 6, nw);
        nw = NextW(7);
        SHA512ROUND_AVX(s1, s2, s3, s4, s5, s6, s7, s0, base + 7, nw);
        nw = NextW(8);
        SHA512ROUND_AVX(s0, s1, s2, s3, s4, s5, s6, s7, base + 8, nw);
        nw = NextW(9);
        SHA512ROUND_AVX(s7, s0, s1, s2, s3, s4, s5, s6, base + 9, nw);
        nw = NextW(10);
        SHA512ROUND_AVX(s6, s7, s0, s1, s2, s3, s4, s5, base + 10, nw);
        nw = NextW(11);
        SHA512ROUND_AVX(s5, s6, s7, s0, s1, s2, s3, s4, base + 11, nw);
        nw = NextW(12);
        SHA512ROUND_AVX(s4, s5, s6, s7, s0, s1, s2, s3, base + 12, nw);
        nw = NextW(13);
        SHA512ROUND_AVX(s3, s4, s5, s6, s7, s0, s1, s2, base + 13, nw);
        nw = NextW(14);
        SHA512ROUND_AVX(s2, s3, s4, s5, s6, s7, s0, s1, base + 14, nw);
        nw = NextW(15);
        SHA512ROUND_AVX(s1, s2, s3, s4, s5, s6, s7, s0, base + 15, nw);
    }

    // Feed-forward: add the working variables back into the chaining value.
    ctx->s[0] = ADD64(s0, ctx->s[0]);
    ctx->s[1] = ADD64(s1, ctx->s[1]);
    ctx->s[2] = ADD64(s2, ctx->s[2]);
    ctx->s[3] = ADD64(s3, ctx->s[3]);
    ctx->s[4] = ADD64(s4, ctx->s[4]);
    ctx->s[5] = ADD64(s5, ctx->s[5]);
    ctx->s[6] = ADD64(s6, ctx->s[6]);
    ctx->s[7] = ADD64(s7, ctx->s[7]);
}

/*
 * Absorb four equal-length inputs into ctx, apply the final padding and
 * write the four 64-byte digests.
 *
 *   ctx        : starting state; ctx->msglen must already hold the number
 *                of bits absorbed so far. The state is transposed in place
 *                at the end, so ctx cannot be used for further hashing.
 *   out0..out3 : receive 64 bytes each
 *   in0..in3   : the four inputs, inlen bytes each
 *   inlen      : length in bytes shared by all four inputs
 */
static void _sha512x4(
    sha512x4ctx *ctx,
    unsigned char *out0,
    unsigned char *out1,
    unsigned char *out2,
    unsigned char *out3,
    const unsigned char *in0,
    const unsigned char *in1,
    const unsigned char *in2,
    const unsigned char *in3,
    unsigned long long inlen) {
    // Offset of the next unprocessed block. It has the same width as inlen:
    // a 32-bit counter would wrap for inputs of 4 GiB or more and the loop
    // below would then never terminate.
    unsigned long long off = 0;
    unsigned int lane, k;

    // Compress all complete 128-byte blocks straight from the inputs.
    while (inlen - off >= 128) {
        sha512_transform4x(ctx, in0 + off, in1 + off, in2 + off, in3 + off);
        ctx->msglen += 1024;
        off += 128;
    }

    // Buffer the trailing partial block (0..127 bytes) of every lane.
    const size_t tail = (size_t)(inlen - off);
    ctx->datalen = (unsigned int)tail;
    memcpy(&ctx->msgblocks[128 * 0], in0 + off, tail);
    memcpy(&ctx->msgblocks[128 * 1], in1 + off, tail);
    memcpy(&ctx->msgblocks[128 * 2], in2 + off, tail);
    memcpy(&ctx->msgblocks[128 * 3], in3 + off, tail);

    // Padding: a single 0x80 byte, then zeros up to the end of the block.
    for (lane = 0; lane < 4; lane++) {
        unsigned char *blk = &ctx->msgblocks[128 * lane];
        blk[tail] = 0x80;
        memset(blk + tail + 1, 0, 127 - tail);
    }
    if (tail >= 112) {
        // The 16-byte length field does not fit behind the data: compress
        // this block and put the length into a fresh all-zero block.
        sha512_transform4x(
            ctx,
            ctx->msgblocks,
            ctx->msgblocks + 128,
            ctx->msgblocks + 256,
            ctx->msgblocks + 384
        );
        memset(ctx->msgblocks, 0, 4 * 128);
    }

    // Append the message length in bits as a 128-bit big-endian integer;
    // the upper 64 bits are always zero.
    ctx->msglen += (unsigned long long)tail * 8;
    for (lane = 0; lane < 4; lane++) {
        unsigned char *len = &ctx->msgblocks[128 * lane + 112];
        memset(len, 0, 8);
        for (k = 0; k < 8; k++) {
            len[15 - k] = (unsigned char)(ctx->msglen >> (8 * k));
        }
    }
    sha512_transform4x(
        ctx,
        ctx->msgblocks,
        ctx->msgblocks + 128,
        ctx->msgblocks + 256,
        ctx->msgblocks + 384
    );

    // Regroup the state per lane: afterwards s[j] holds words 0..3 and
    // s[4 + j] holds words 4..7 of lane j.
    transpose(ctx->s);
    transpose(ctx->s + 4);

    // Emit each digest as big-endian bytes; the stores are unaligned, so
    // the output pointers need no particular alignment.
    STORE(out0,      BYTESWAP(ctx->s[0]));
    STORE(out0 + 32, BYTESWAP(ctx->s[4]));

    STORE(out1,      BYTESWAP(ctx->s[1]));
    STORE(out1 + 32, BYTESWAP(ctx->s[5]));

    STORE(out2,      BYTESWAP(ctx->s[2]));
    STORE(out2 + 32, BYTESWAP(ctx->s[6]));

    STORE(out3,      BYTESWAP(ctx->s[3]));
    STORE(out3 + 32, BYTESWAP(ctx->s[7]));
}

/**
 * Four-way MGF1 built on SHA-512: for counter = 0, 1, 2, ... hash
 * (seed || 4-byte counter) and concatenate the digests until outlen bytes
 * have been produced for every lane; the last digest is truncated as needed.
 *
 * outx4 receives four consecutive regions of outlen bytes, one per lane.
 * in0..in3 are the four seeds, inlen bytes each.
 *
 * Note that inlen should be sufficiently small that it still allows for
 * an array to be allocated on the stack. Typically 'in' is merely a seed.
 */
void mgf1x4_512(unsigned char *outx4, unsigned long outlen,
                const unsigned char *in0,
                const unsigned char *in1,
                const unsigned char *in2,
                const unsigned char *in3,
                unsigned long inlen) {
    PQCLEAN_VLA(unsigned char, inbufx4, 4 * (inlen + 4));
    unsigned char outbuf[4 * 64];
    const unsigned char *const seed[4] = {in0, in1, in2, in3};
    const unsigned long stride = inlen + 4; // seed plus 4-byte counter
    unsigned long produced = 0;             // bytes already written per lane
    uint32_t counter = 0;
    unsigned int lane;

    // Lay the four seeds out back to back, leaving room for the counter.
    for (lane = 0; lane < 4; lane++) {
        memcpy(inbufx4 + lane * stride, seed[lane], inlen);
    }

    while (produced < outlen) {
        // Take a full digest, or whatever is still missing if that is less.
        unsigned long chunk = outlen - produced;
        if (chunk > SPX_SHA512_OUTPUT_BYTES) {
            chunk = SPX_SHA512_OUTPUT_BYTES;
        }

        // Write the current counter behind every seed.
        for (lane = 0; lane < 4; lane++) {
            u32_to_bytes(inbufx4 + lane * stride + inlen, counter);
        }

        sha512x4ctx ctx;
        sha512_init4x(&ctx);

        _sha512x4(
            &ctx,
            outbuf + 0 * 64,
            outbuf + 1 * 64,
            outbuf + 2 * 64,
            outbuf + 3 * 64,
            inbufx4 + 0 * stride,
            inbufx4 + 1 * stride,
            inbufx4 + 2 * stride,
            inbufx4 + 3 * stride,
            stride
        );

        for (lane = 0; lane < 4; lane++) {
            memcpy(outx4 + lane * outlen + produced, outbuf + lane * 64, chunk);
        }

        produced += chunk;
        counter++;
    }
}

void sha512x4_seeded(
    unsigned char *out0,
    unsigned char *out1,
    unsigned char *out2,
    unsigned char *out3,
    const sha512x4ctx *seed,
    const unsigned char *in0,
    const unsigned char *in1,
    const unsigned char *in2,
    const unsigned char *in3,
    unsigned long long inlen) {
    sha512x4ctx ctx;
    sha512_ctx_clone4x(&ctx, seed);
    _sha512x4(
        &ctx,
        out0, out1, out2, out3,
        in0, in1, in2, in3,
        inlen
    );
}

/* Copy the complete 4-way context *in into *out, byte for byte. */
void sha512_ctx_clone4x(sha512x4ctx *out, const sha512x4ctx *in) {
    memcpy(out, in, sizeof *out);
}