use.std::crypto::hashes::rpo

# CONSTANTS
# =================================================================================================

const.J=77321994752
const.M=12289
const.SQUARE_NORM_BOUND=34034726

# MODULAR REDUCTION FALCON PRIME
# =============================================================================================

#! Given dividend ( i.e. field element a ) on stack top, this routine computes c = a % 12289
#!
#! Expected stack state
#!
#! [a, ...]
#!
#! Output stack state looks like
#!
#! [c, ...] | c = a % 12289
export.mod_12289
    u32split
    push.M.0

    adv.push_u64div

    adv_push.2
    u32assert2

    swap
    push.M
    u32overflowing_mul

    movup.2
    push.M
    u32overflowing_madd
    drop

    adv_push.2
    drop
    u32assert

    dup

    movup.3
    u32overflowing_add

    movup.3
    u32overflowing_add
    drop

    movup.5
    assert_eq
    movup.4
    assert_eq

    swap
    drop
    swap
    drop
end

# HASH-TO-POINT
# =============================================================================================

#! Takes as input a message digest, a nonce of size 40 bytes represented as 8 field elements
#! and a pointer. The procedure absorbs MSG and NONCE into a fresh RPO state and squeezes the
#! coefficients of a polynomial c representing the hash-to-point of (MSG || NONCE). The coefficients
#! are then saved in the memory region [c_ptr, c_ptr + 128).
#! This implementation of the `hash_to_point` procedure avoids the rejection-sampling step
#! required in the per-the-spec algorithm by using the observation on page 31 in
#! https://falcon-sign.info/falcon.pdf
#!
#! Input: [c_ptr, MSG, NONCE1, NONCE0, ...]
#! Output: [...]
#!
#! Cycles: 1327
export.hash_to_point.2
    # Move pointer out of the way
    movdn.12

    # Store MSG for later absorption
    loc_storew.1 dropw

    # Absorb the nonce
    padw movdnw.2
    hperm

    # Absorb the message
    swapw loc_loadw.1 swapw hperm

    # Squeeze the coefficients and save them
    repeat.63
        swapw dup.12
        mem_storew
        swapw dup.12 add.2 swap.13 add.1
        mem_storew
        hperm
    end

    # Save the last remaining coefficients
    dup.12 add.1 mem_storew dropw
    movup.8 mem_storew dropw

    # Clean up the stack
    dropw
end


# PROBABILISTIC POLYNOMIAL MULTIPLICATION IN Z_Q[x]
# =============================================================================================

#! For an element `tau := (tau0, tau1)` in the quadratic extension field, computes all its powers
#! `tau^i` for `i = 0,..., 512` and stores them in the memory region `[tau_ptr, tau_ptr + 513)`.
#! The procedure returns `tau_ptr + 513`.
#!
#! Input: [tau1, tau0, tau_ptr, ...]
#! Output: [tau_ptr + 513, ...]
#!
#! Cycles: 8323
export.powers_of_tau

    # 1) Save tau^0 i.e. (0, 1)
    push.1 push.0.0.0
    dup.6 add.1 swap.7
    mem_storew
    drop drop
    #=> [0, 1, tau1, tau0, tau_ptr+1, ...]

    # 2) Compute tau^i
    repeat.512
        dupw ext2mul

        movup.3 movup.3

        dup.6 add.1 swap.7 mem_storew

        drop drop
    end

    dropw
    #=> [tau_ptr + 513, ...]
end


#! Sets the memory region `[ptr, ptr + 512)` to zero. The pointer c_ptr := ptr + 512 is returned
#! to be used to store the hash-to-point polynomial of the message later on.
#!
#! Input: [ptr, ...]
#! Output: [...]
#!
#! Cycles: 2607
export.set_to_zero
    padw
    repeat.512
        dup.4 add.1 swap.5
        mem_storew
    end
    dropw
end

#! Takes as input PK, the hash of the coefficients of the polynomial `h` representing the expanded
#! public key, and a pointer to the memory location where the coefficients of the polynomial `h`
#! will be stored.
#! The procedure loads `h` from the advice stack and compares its hash with the provided hash `PK`.
#! It then loads the polynomial `s2` representing the signature from the advice stack and lays it
#! in memory right after `h`.
#! It then loads the claimed polynomial `h * s2` in Z_Q[x] where Q is the Miden VM prime from
#! the advice stack and lays it right after `s2`.
#! The hash of `h`, `s2` and the claimed product is also computed and the first two field elements
#! of the digest (i.e., the Fiat-Shamir challenge) are returned on the stack alongside
#! the incremented pointer.
#!
#! Input: [ptr, PK, ...]
#! Output: [tau1, tau0, ptr + 512 ...]
#!
#! Cycles: 5049
export.load_h_s2_and_product.1

    # 1) Store PK for later comparison
    movdn.4
    loc_storew.0

    # 2) Prepare stack and load h polynomial. We also range check the coefficients of h.
    padw swapw
    padw
    repeat.64
        adv_pipe

        dupw.1
        u32assert2
        push.M u32lt assert
        push.M u32lt assert
        u32assert2
        push.M u32lt assert
        push.M u32lt assert

        dupw
        u32assert2
        push.M u32lt assert
        push.M u32lt assert
        u32assert2
        push.M u32lt assert
        push.M u32lt assert

        hperm
    end

    # 3) Load saved claimed hash of h and compare
    loc_loadw.0
    movup.4 assert_eq
    movup.3 assert_eq
    movup.2 assert_eq
    assert_eq

    # 4) Load s2 (Due to the final norm test we do not need to range check the s2 coefficients)
    padw padw
    repeat.64
        adv_pipe hperm
    end

    # 5) Load claimed h * s2 in Z_Q[x]
    repeat.128
        adv_pipe hperm
    end

    # 6) Return the challenge point and the incremented pointer
    exec.rpo::squeeze_digest
    drop drop
    #=> [tau1, tau0, ptr + 512]
end

#! Checks that pi == h * s2 in Z_Q[x] by evaluating both sides at a random point.
#! The procedure takes as input a pointer h_ptr to h. The other two polynomials
#! are located at h_ptr + 128, for s2, and h_ptr + 256, for pi. The procedure takes
#! also a pointer zeros_ptr to a region of memory [zeros_ptr, zeros_ptr + 1024)
#! and a pointer tau_ptr to powers of the random point we are evaluating at stored
#! as [a_i, b_i, x, x] where (a_i, b_i) := tau^i for i in [0, 1023].
#! The procedure returns () if the check passes, otherwise it raises an exception
#! related to an unsatisfied assertion.
#!
#! Input: [h_ptr, zeros_ptr, tau_ptr, ...]
#! Output: [...]
#!
#! Cycles: 2504
export.probabilistic_product.4
    # 1) Save the pointers
    push.0 movdn.3
    loc_storew.0

    # 2) Compute the evaluation of the h polynomial at the random challenge

    # Accumulator to compute h(tau)
    padw

    # For mem_stream
    padw padw

    # Compute h(tau)
    repeat.64
        mem_stream
        repeat.8
            rcomb_base
        end
    end

    # Save the evaluation h(tau)
    swapdw loc_storew.1
    dropw
    #=> [X, X, X, ...]

    # 3) Compute the evaluation of the s2 polynomial at the random challenge
    loc_loadw.0
    add.128
    #=> [s2_ptr, zeros_ptr, tau_ptr, 0, X, X, ...]

    # Accumulator to compute s2(tau)
    padw
    swapdw
    #=> [X, X, 0, 0, 0, 0, s2_ptr, zeros_ptr, tau_ptr, 0, ...]

    # Compute s2(tau)
    repeat.64
        mem_stream
        repeat.8
            rcomb_base
        end
    end

    # Save the evaluation of s2(tau)
    swapdw loc_storew.2
    dropw
    #=> [X, X, X, ...]

    # 4) Compute the evaluation of the product polynomial pi := h * s2 at the random challenge where
    #       the product is over Z_Q[x]
    # We compute the evaluation of pi at tau as pi(tau) = pi1(tau) + tau^512 * pi2(tau) where pi1 and
    # pi2 are the first and second halves of the product polynomial. This is done in order to reduce
    # the number of powers of tau needed to compute pi(tau).

    # Setup the pointers
    loc_loadw.0
    add.256
    #=> [pi_ptr, zeros_ptr, tau_ptr, 0, X, X, ...]

    # Accumulator to compute pi(tau)
    padw
    swapdw
    #=> [X, X, 0, 0, 0, 0, pi_ptr, zeros_ptr, tau_ptr, 0, ...]

    # Compute pi1(tau)
    repeat.64
        mem_stream
        repeat.8
            rcomb_base
        end
    end
    #=> [X, X, ev1, ev0, ev1, ev0, pi_ptr, zeros_ptr, tau_ptr, 0, ...]
    # where (ev0, ev1) := pi1(tau)

    # Save pi_1(tau)
    swapw.2 loc_storew.3

    # Setup the pointers
    swapw.3 loc_loadw.0 add.384

    # Accumulator to compute pi2(tau)
    swapw dropw padw
    swapdw

    # Compute pi2(tau)
    repeat.64
        mem_stream
        repeat.8
            rcomb_base
        end
    end
    #=> [X, X, ev1, ev0, ev1, ev0, pi_ptr, zeros_ptr, tau_ptr, 0, ...]

    # Load tau^512
    swapw.3
    #=> [pi_ptr, zeros_ptr, tau_ptr, 0, X, ev1, ev0, ev1, ev0, X, ...]
    dup.2 mem_loadw drop drop
    #=> [tn1, tn0, X, ev1, ev0, ev1, ev0, X, ...]

    # Compute (a0, a1) := tau^512 * pi2(tau)
    movup.7 movup.7 ext2mul
    #=> [a1, a0, X, x, x, X, ...]

    # Compute (res0, res1) := pi(tau)
    swapw loc_loadw.3 drop drop
    ext2add
    #=> [res1, res0, x, x, X, ...]


    # 5) Check product

    ## a) Load h(tau)
    swapw
    loc_loadw.1

    ## b) Load s2(tau)
    push.0.0
    loc_loadw.2

    ## c) compute the product
    drop drop
    ext2mul

    ## d) assert equality
    movup.2
    assert_eq
    assert_eq

    ## e) clean up the stack
    drop drop
    #=> [...]
end


# SQUARE NORM OF Z_q[x]/(phi) POLYNOMIALS
# =============================================================================================

#! Normalizes an `e` in [0, q) to be in [-(q-1) << 1, (q-1) << 1) and returns its square norm.
#!
#! We use the following formula to do so:
#! normalize(e) = e^2 - phi * (2*q*e - q^2) where phi := (e > (q - 1)/2)
#!
#! The formula implements:
#!
#! if e > (q-1)/2:
#!   return (q - e)^2
#! else:
#!   return e^2
#!
#! The use of the formula avoids using the if-else block.
#!
#! Input: [e, ...]
#! Output [norm(e)^2, ...]
#!
#! Cycles: 21
export.norm_sq

    dup
    dup mul
    swap
    #=> [e, e^2, ...]

    dup
    push.6144
    u32gt
    #=> [phi, e, e^2, ...]

    swap
    mul.24578       # 2*q
    push.151019521  # q^2
    sub
    #=> [2*q*e - q^2, phi, e^2, ...]

    mul sub
    #=> [norm(e)^2, ...]
end

#! On input a tuple (u, w, v), the following computes (v - (u + (- w % q) % q) % q).
#! We can avoid doing three modular reductions by using the following facts:
#!
#! 1. q is much smaller than the Miden prime. Precisely, q * 2^50 < Q
#! 2. The coefficients of the product polynomial, u and w, are less than J := 512 * q^2
#! 3. The coefficients of c are less than q.
#!
#! This means that we can substitute (v - (u + (- w % q) % q) % q) with  v + w + J - u without
#! risking Q-overflow since |v + w + J - u| < 1025 * q^2
#!
#! To get the final result we reduce (v + w + J - u) modulo q.
#!
#! Input: [v, w, u, ...]
#! Output: [e, ...]
#!
#! Cycles: 44
export.diff_mod_q

    # 1) v + w + J
    add push.J add
    #=> [v + w + J, u]

    # 2) v + w + J - u
    swap sub
    #=> [v + w + J - u]

    # 3) Reduce modulo q
    exec.mod_12289
    #=> [e, ...]
end

#! Takes a pointer to a polynomial pi of degree less than 1024 with coefficients in Z_Q and
#! a polynomial c of degree 512 with coefficients also in Z_Q, where Q is the Miden prime.
#! The goal is to compute s1 = c - pi  = c - h * s2 in Z_q[x]/(phi) where q is the Falcon prime.
#! The pointer pi_ptr points both to pi and c through the relation c_ptr = pi_ptr + offset
#! where offset := 1281.
#! The naive way to compute s1 would be to first reduce the polynomial pi modulo the Falcon
#! prime q and then modulo the irreducible polynomial phi = x^512 + 1. Then we would need to negate
#! the coefficients of pi modulo q and only then can we add these coefficients to the coefficients
#! of c and then reduce the result modulo q one more time.
#! Knowing that the end goal of computing c is to compute its norm squared, we can do better.
#!
#! We can compute s1 in a single pass by delaying the q-modular reduction til the end. This can
#! be achieved through a careful analysis of the computation of the difference between pi and c.
#!
#! The i-th coefficient s1_i of s1 is equal to c_i - (pi_i - pi_{512 + i}) which is equal to
#! c_i  + pi_{512 + i} - pi_i. Now, we know that the size of the pi_i coefficients is bounded by
#! J := 512 * q^2 and this means that J + pi_{512 + i} - pi_i does not Q-underflow and since
#! J = 0 modulo q, the addition of J does not affect the final result. It is also important to
#! note that adding J does not Q-overflow by virtue of q * 2^50 < Q.
#! All of the above implies that we can compute s1_i with only one modular reduction at the end,
#! in addition to one modular reduction applied to c_i.
#! Moreover, since we are only interested in the square norm of s1_i, we do not have to store
#! s1_i and then load it at a later point, and instead we can immediately follow the computation
#! of s1_i with computing its square norm.
#! After computing the square norm of s1_i, we can accumulate into an accumulator to compute the
#! sum of the square norms of all the coefficients of polynomial c. Using the overflow stack, this
#! can be delayed til the end.
#!
#! Input: [pi_ptr, ...]
#! Output: [norm_sq(s1), ...]
#!
#! Cycles: 58888
export.compute_s1_norm_sq
    repeat.128
        # 1) Load the next 4 * 3 coefficients
        # load c_i
        padw
        dup.4 add.1281
        mem_loadw

        # load pi_{i+512}
        padw
        dup.8 add.128
        mem_loadw

        # load pi_i
        padw
        dup.12
        mem_loadw
        #=> [PI, PI_{i+512}, C, pi_ptr, ...]

        # 2) Compute the squared norm of (i + 0)-th coefficient of s1
        movup.8
        exec.mod_12289
        movup.5
        #=> [v, w, u, ...] where u is the i-th coefficient of `pi`, v is the i-th
        # coefficient of `c` and w is the (512 + i)-th coefficient of `pi` polynomial.

        exec.diff_mod_q
        #=> [e, ...]

        exec.norm_sq
        #=> [norm(e)^2, ...]

        # Move the result out of the way so that we can process the remaining coefficients
        movdn.10

        # 3) Compute the squared norm of (i + 1)-th coefficient of s1
        movup.6
        exec.mod_12289
        movup.4
        exec.diff_mod_q
        exec.norm_sq
        movdn.7

        # 4) Compute the squared norm of (i + 2)-th coefficient of s1
        movup.4
        exec.mod_12289
        movup.3
        exec.diff_mod_q
        exec.norm_sq
        movdn.4

        # 5) Compute the squared norm of (i + 3)-th coefficient of s1
        movup.2
        exec.mod_12289
        movup.2
        exec.diff_mod_q
        exec.norm_sq
        swap

        # 6) Increment the pointer
        add.1
    end

    # Sum up the squared norm of all the coefficients of s1
    drop    # drop the pointer
    repeat.511
        add
    end
    #=> [norm_sq(s1), ...]
end

#! Compute the square norm of the polynomial s2 given a pointer to its coefficients.
#!
#! Input: [s2_ptr, ...]
#! Output: [norm_sq(s2), ...]
#!
#! Cycles: 13322
export.compute_s2_norm_sq
    repeat.128
        padw
        dup.4 add.1 swap.5
        mem_loadw

        repeat.4
            exec.norm_sq
            movdn.4
        end

    end
    drop
    repeat.511
        add
    end
end


# FALCON SIGNATURE VERIFICATION ALGORITHM
# =============================================================================================

#! Verifies a signature against a public key and a message. The procedure gets as inputs the hash
#! of the public key and the hash of the message via the operand stack. The signature is provided
#! via the advice stack.
#! The signature is valid if and only if the procedure returns.
#!
#! Input: [PK, MSG, ...]
#! Output: [...]
#!
#! Cycles: ~ 92029
export.verify.1665

    # 1) Generate a Falcon signature using the secret key associated to PK on message MSG.
    adv.push_sig.rpo_falcon512
    #=> [PK, MSG, ...]

    # 2) Load the NONCE from the advice provider. This is encoded as 8 field elements
    padw adv_loadw padw adv_loadw
    #=> [NONCE1, NONCE0, PK, MSG...]

    # 3) Load the public key polynomial h and the signature polynomial s2 and the product of
    # the two polynomials pi := h * s2 in Z_Q[x]. This also checks that h hashes to the provided
    # digest PK. While loading the polynomials, the hash of the three polynomials is computed
    # and the first half of the digest is kept on the stack for later use by the
    # `probabilistic_product` procedure.

    swapdw
    #=> [PK, MSG, NONCE1, NONCE0...]
    locaddr.0
    #=> [h_ptr, PK, MSG, NONCE1, NONCE0...]
    exec.load_h_s2_and_product
    #=> [tau1, tau0, tau_ptr, MSG, NONCE1, NONCE1, ...]      (Cycles: 5050)

    exec.powers_of_tau
    #=> [zeros_ptr, MSG, NONCE1, NONCE1, ...]                (Cycles: 8323)

    exec.set_to_zero
    #=> [c_ptr, MSG, NONCE1, NONCE1, ...]                    (Cycles: 2607)

    # 4) Compute the hash-to-point of the message MSG from the provided NONCE and save
    # the resulting polynomial c in the memory region [c_ptr, c_ptr + 128)

    exec.hash_to_point
    #=> [...]                                       (Cycles: 1327)

    # 5) Check that we indeed have pi := h * s2 in Z_Q[x] by checking that pi(tau) = h(tau) * s2(tau)
    # where tau is a random (Fiat-Shamir) challenge resulting from hashing h, s2 and pi.

    locaddr.512     # tau_ptr
    locaddr.1025    # z_ptr
    locaddr.0       # h ptr
    #=> [h_ptr, zeros_ptr, tau_ptr, ...]

    exec.probabilistic_product
    #=> [...]                                       (Cycles: 2504)

    # 6) Compute the squared norm of s1 := c - h * s2 (in Z_q[x]/(phi))

    locaddr.256
    #=> [pi_ptr, ...]

    exec.compute_s1_norm_sq
    #=> [norm_sq(s1), ...]                          (Cycles: 58888)

    # 7) Compute the squared norm of s2

    locaddr.128
    #=> [s2_ptr, norm_sq(s1), ...]

    exec.compute_s2_norm_sq
    #=> [norm_sq(s2), norm_sq(s1), ...]             (Cycles: 13322)

    # 8) Check that ||(s1, s2)||^2 < K

    add
    #=> [norm_sq(s1) + norm_sq(s2), ...]

    push.SQUARE_NORM_BOUND
    u32assert2 u32lt assert
    #=> [...]                                       (Cycles: 8)
end