//! The generic (software) implementation.
//!
//! It's used by the other backends (`soft`, `aarch64`, etc.) if
//! hardware support is not detected.

#![forbid(unsafe_code)]

use {
    crate::poly::BLOCK_SIZE,
    core::ops::{BitXor, BitXorAssign, Mul},
};

/// An element in the field
///
/// ```text
/// x^128 + x^127 + x^126 + x^121 + 1
/// ```
#[derive(Copy, Clone, Default, Debug)]
#[cfg_attr(test, derive(Eq, PartialEq))]
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
pub(crate) struct FieldElement(u64, u64);

impl FieldElement {
    pub(crate) fn from_le_bytes(b: &[u8]) -> Self {
        debug_assert!(b.len() >= 16);

        Self(
            u64::from_le_bytes(b[0..8].try_into().expect("should be 16 bytes")),
            u64::from_le_bytes(
                b[8..16].try_into().expect("should be 16 bytes"),
            ),
        )
    }

    pub(crate) fn to_le_bytes(self) -> [u8; 16] {
        let mut out = [0u8; 16];
        out[0..8].copy_from_slice(&self.0.to_le_bytes());
        out[8..16].copy_from_slice(&self.1.to_le_bytes());
        out
    }
}

impl BitXor for FieldElement {
    type Output = Self;

    fn bitxor(self, rhs: Self) -> Self {
        Self(self.0 ^ rhs.0, self.1 ^ rhs.1)
    }
}

impl BitXorAssign for FieldElement {
    fn bitxor_assign(&mut self, rhs: Self) {
        self.0 ^= rhs.0;
        self.1 ^= rhs.1;
    }
}

impl Mul for FieldElement {
    type Output = Self;

    fn mul(self, rhs: Self) -> Self {
        polymul(self, rhs)
    }
}

pub(crate) fn polymul(x: FieldElement, y: FieldElement) -> FieldElement {
    // We perform schoolbook multiplication of x and y:
    //
    // (x1,x0)*(y1,y0) = (x1*y1) + (x1*y0 + x0*y1) + (x0*y0)
    //                      H         M       M         L
    //
    // The middle result (M) can be simplified with Karatsuba
    // multiplication:
    //
    // (x1*y0 + x0*y1) = (x1+x0) * (y1+x0) + (x1*y1) + (x0*y0)
    //        M                                 H         L
    //
    // This requires one less 64-bit multiplication and reuses
    // the existing results H and L. (H and L are added to M in
    // the montgomery reduction; see x1 and x2.)
    //
    // This gives us a 256-bit product, X.
    //
    // Use the "Shift-XOR reflected reduction" method to reduce
    // it modulo x^128 + x^127 + x^126 + x^121 + 1.
    //
    // This is faster than Gueron's "Fast reduction ..." method
    // without CMUL/PMULL intrinsics.
    //
    // See [gueron] page 17-19.
    //
    // [gueron]: https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf]
    let mut h = gf128_mul(x.1, y.1); // H
    let mut m = gf128_mul(x.1 ^ x.0, y.1 ^ y.0); // M
    let mut l = gf128_mul(x.0, y.0); // L

    m ^= FieldElement(l.0 ^ h.0, l.1 ^ h.1);

    l.1 ^= m.0 ^ (l.0 << 63) ^ (l.0 << 62) ^ (l.0 << 57);
    h.0 ^= l.0 ^ (l.0 >> 1) ^ (l.0 >> 2) ^ (l.0 >> 7);
    h.0 ^= m.1 ^ (l.1 << 63) ^ (l.1 << 62) ^ (l.1 << 57);
    h.1 ^= l.1 ^ (l.1 >> 1) ^ (l.1 >> 2) ^ (l.1 >> 7);

    FieldElement(h.0, h.1)
}

/// Multiplies `acc` with the series of field elements in
/// `blocks`.
pub(crate) fn polymul_series(
    mut acc: FieldElement,
    pow: &[FieldElement; 8],
    blocks: &[u8],
) -> FieldElement {
    debug_assert!(blocks.len() % BLOCK_SIZE == 0);

    // Handle wide chunks.
    let mut blocks = blocks.chunks_exact(BLOCK_SIZE * 8);
    for chunk in blocks.by_ref() {
        let mut h = FieldElement::default();
        let mut m = FieldElement::default();
        let mut l = FieldElement::default();

        macro_rules! karatsuba_xor {
            ($i:expr) => {
                let mut y =
                    FieldElement::from_le_bytes(&chunk[$i * BLOCK_SIZE..]);
                if $i == 0 {
                    y ^= acc
                };
                let x = &pow[$i];
                h ^= gf128_mul(x.1, y.1);
                l ^= gf128_mul(x.0, y.0);
                m ^= gf128_mul(x.1 ^ x.0, y.1 ^ y.0);
            };
        }
        karatsuba_xor!(7);
        karatsuba_xor!(6);
        karatsuba_xor!(5);
        karatsuba_xor!(4);
        karatsuba_xor!(3);
        karatsuba_xor!(2);
        karatsuba_xor!(1);
        karatsuba_xor!(0);

        m ^= FieldElement(l.0 ^ h.0, l.1 ^ h.1);

        l.1 ^= m.0 ^ (l.0 << 63) ^ (l.0 << 62) ^ (l.0 << 57);
        h.0 ^= l.0 ^ (l.0 >> 1) ^ (l.0 >> 2) ^ (l.0 >> 7);
        h.0 ^= m.1 ^ (l.1 << 63) ^ (l.1 << 62) ^ (l.1 << 57);
        h.1 ^= l.1 ^ (l.1 >> 1) ^ (l.1 >> 2) ^ (l.1 >> 7);

        acc = h
    }

    // Handle singles.
    for block in blocks.remainder().chunks_exact(BLOCK_SIZE) {
        let y = FieldElement::from_le_bytes(block);
        acc = (acc ^ y) * pow[7];
    }

    acc
}

/// Returns the constant time 128-bit product of `x` and `y` in
/// GF(2¹²⁸).
///
/// The idea comes from [Thomas Pornin]'s constant-time blog post
/// with 64-bit fixes from [Tim Taubert]'s blog post on formally
/// verified GHASH.
///
/// [Thomas Pornin]: https://www.bearssl.org/constanttime.html
/// [Tim Taubert]: https://timtaubert.de/blog/2017/06/verified-binary-multiplication-for-ghash/
pub(crate) const fn gf128_mul(x: u64, y: u64) -> FieldElement {
    const MASK0: u128 = 0x21084210842108421084210842108421;
    const MASK1: u128 = 0x42108421084210842108421084210842;
    const MASK2: u128 = 0x84210842108421084210842108421084;
    const MASK3: u128 = 0x08421084210842108421084210842108;
    const MASK4: u128 = 0x10842108421084210842108421084210;

    // Split both x and y into 5 words with four-bit holes.
    let x0 = (x as u128) & MASK0;
    let y0 = (y as u128) & MASK0;
    let x1 = (x as u128) & MASK1;
    let y1 = (y as u128) & MASK1;
    let x2 = (x as u128) & MASK2;
    let y2 = (y as u128) & MASK2;
    let x3 = (x as u128) & MASK3;
    let y3 = (y as u128) & MASK3;
    let x4 = (x as u128) & MASK4;
    let y4 = (y as u128) & MASK4;

    let t0 = (x0 * y0) ^ (x1 * y4) ^ (x2 * y3) ^ (x3 * y2) ^ (x4 * y1);
    let t1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y4) ^ (x3 * y3) ^ (x4 * y2);
    let t2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y4) ^ (x4 * y3);
    let t3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0) ^ (x4 * y4);
    let t4 = (x0 * y4) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1) ^ (x4 * y0);

    let z = (t0 & MASK0)
        | (t1 & MASK1)
        | (t2 & MASK2)
        | (t3 & MASK3)
        | (t4 & MASK4);

    FieldElement(z as u64, (z >> 64) as u64)
}